# VAR Fairness Audit: Complete Analysis

**DS 112 Final Project**

This notebook, written to run in Google Colab, provides a complete fairness audit of soccer VAR (Video Assistant Referee) decisions.

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn plotly scikit-learn scipy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: start from matplotlib's stock style, then make every
# figure 10x6 inches unless a cell asks for a different size.
plt.style.use('default')
plt.rc('figure', figsize=(10, 6))

## Complete VAR Fairness Audit

This notebook combines all elements of the VAR fairness audit into a single comprehensive analysis.

## 1. Data Loading and Preparation

In [None]:
# Load VAR incident data.
# The raw tables are optional as long as the cached combined file exists, so
# both names are always bound (to None when the files are absent) and the
# fallback below can test for them instead of failing with a NameError.
var_incidents = None
team_stats = None
try:
    var_incidents = pd.read_csv('VAR_Incidents_Stats.csv')
    team_stats = pd.read_csv('VAR_Team_Stats.csv')
    print("Loaded raw datasets successfully")
except FileNotFoundError:
    print("Raw data files not found. Using the combined dataset if available.")

# Try to load the combined dataset; rebuild it from the raw tables if missing.
try:
    df = pd.read_csv('var_combined.csv')
    print("Loaded combined dataset successfully")
except FileNotFoundError:
    if var_incidents is None or team_stats is None:
        # Nothing to build from: stop here with an actionable message rather
        # than letting the merge below fail on an undefined variable.
        raise FileNotFoundError(
            "No data available: upload 'var_combined.csv', or both "
            "'VAR_Incidents_Stats.csv' and 'VAR_Team_Stats.csv'."
        )
    print("Creating combined dataset...")
    # Left merge keeps every incident; incidents whose team_name has no match
    # in team_stats get NaN in the columns that come from team_stats.
    df = pd.merge(var_incidents, team_stats, on='team_name', how='left')
    df.to_csv('var_combined.csv', index=False)
    print("Created and saved combined dataset")

# Display dataset info
print("\nDataset Information:")
df.info()

print("\nSample Data:")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Two-panel overview: decision mix (left) and favorable decisions per tier (right).
fig, (pie_ax, tier_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: share of each VAR decision type.
decision_counts = df['decision_type'].value_counts()
decision_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Distribution of VAR Decisions')
pie_ax.set_ylabel('')

# Right panel: split team_rank into quartiles; the lowest rank values get the
# first label ('Top Tier'). The new column is stored on df for later cells.
tier_labels = ['Top Tier', 'Upper Mid', 'Lower Mid', 'Bottom Tier']
df['team_tier'] = pd.qcut(df['team_rank'], q=4, labels=tier_labels)
sns.countplot(x='team_tier', hue='decision_favorable', data=df, ax=tier_ax)
tier_ax.set_title('Favorable Decisions by Team Tier')
tier_ax.set_xlabel('Team Tier')
tier_ax.set_ylabel('Count')
tier_ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

## 3. Statistical Analysis

In [None]:
# Import statistical libraries
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Chi-square test of independence between team tier and decision favorability.
contingency = pd.crosstab(df['team_tier'], df['decision_favorable'])
chi2, p, dof, expected = stats.chi2_contingency(contingency)
print("Chi-square Test Results:")
print(f"Chi-square statistic: {chi2:.4f}")
print(f"p-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# Logistic regression: can team characteristics predict a favorable decision?
feature_cols = ['team_rank', 'market_value', 'avg_attendance']
# The estimator rejects NaN, and the left merge that builds df can leave
# unmatched rows with missing values, so drop incomplete rows first.
model_data = df[feature_cols + ['decision_favorable']].dropna()
X = model_data[feature_cols]
y = model_data['decision_favorable']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize using statistics from the training split only. Without this the
# coefficients are expressed per raw unit of each feature and cannot be
# compared with each other; scaling also helps the solver converge.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Model evaluation
print("\nLogistic Regression Results:")
print(classification_report(y_test, y_pred))

# Feature importance: standardized coefficients, strongest effect first.
# Ordering by absolute value keeps large negative effects at the top instead
# of pushing them to the bottom of the table.
coefs = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
})
magnitude_order = coefs['Coefficient'].abs().sort_values(ascending=False).index
coefs = coefs.loc[magnitude_order]
print("\nFeature Importance:")
print(coefs)

## 4. LLM Analysis

For a complete analysis, we also need to incorporate language model analysis of incident descriptions.

Note: To run this section, add your Gemini API key to Colab secrets under the name `GEMINI_API_KEY`.

In [None]:
# Install and import Google Generative AI library
try:
    !pip install google-generativeai
    import google.generativeai as genai
    print("Google Generative AI library installed")
except:
    print("Could not install Google Generative AI library")

In [None]:
# Configure the API (only if you want to run LLM analysis)
def analyze_incident(incident_text, decision_type, model_name='gemini-pro'):
    """Ask the Gemini model to assess one VAR incident.

    Args:
        incident_text: Free-text description of the incident.
        decision_type: The VAR decision that was made for the incident.
        model_name: Name passed to `genai.GenerativeModel`.

    Returns:
        The text of the model's response.
    """
    llm = genai.GenerativeModel(model_name)
    prompt = f"""Analyze this soccer VAR incident objectively:\n\nIncident: {incident_text}\nDecision: {decision_type}\n\nEvaluate: Was this decision justified? Rate controversy (1-10)."""
    response = llm.generate_content(prompt)
    return response.text


# The LLM section is optional, so failures are reported and the notebook keeps
# going. Each failure gets its own message so that a missing column or a
# non-Colab runtime is not misreported as a missing API key.
try:
    from google.colab import userdata
    api_key = userdata.get('GEMINI_API_KEY')
    if not api_key:
        raise ValueError("GEMINI_API_KEY secret is empty")
    genai.configure(api_key=api_key)
    print("API configured successfully")

    # Analyze a sample incident
    sample = df.iloc[0]
    print(f"\nSample Incident: {sample['incident_description']}")
    print(f"Decision: {sample['decision_type']}")
    print("\nAnalysis:")
    print(analyze_incident(sample['incident_description'], sample['decision_type']))

except ImportError as err:
    print(f"LLM analysis skipped: google.colab is not available in this runtime: {err}")
except Exception as err:
    print(
        f"LLM analysis did not run ({type(err).__name__}: {err}). "
        "If the key is missing, add GEMINI_API_KEY to Colab secrets."
    )

## 5. Conclusions

Summarize the key findings from your VAR fairness audit analysis.

In [None]:
# Visualize the key findings
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the logistic-regression coefficients; the dashed line at zero separates
# positive from negative coefficients.
sns.barplot(x='Coefficient', y='Feature', data=coefs, ax=ax)
ax.set_title('Factors Influencing Favorable VAR Decisions')
ax.axvline(x=0, color='black', linestyle='--')
fig.tight_layout()
plt.show()

# "Most influential" means largest effect in either direction, so rank by the
# absolute coefficient instead of taking the two most positive values.
# NOTE(review): magnitudes are only comparable across features if the model
# was fit on features with comparable scales - confirm in the modeling cell.
by_magnitude = coefs['Coefficient'].abs().sort_values(ascending=False).index
top_features = coefs.loc[by_magnitude, 'Feature'].head(2).tolist()

# Print conclusion
print("VAR Fairness Audit Conclusions:")
print("1. Statistical significance: The chi-square test p-value indicates whether team tier and decision favorability are independent.")
print(f"   p-value: {p:.4f} - {'Evidence of bias' if p < 0.05 else 'No strong evidence of bias'}")
print("\n2. Predictive modeling: We examined if team characteristics can predict favorable decisions.")
print(f"   The most influential factors are: {', '.join(top_features)}")
print("\n3. Recommendations: Based on the analysis, VAR implementation could be improved by...")