In [30]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load dataset
file_path = "/Users/abhiaremanda/BUCKSHACKATHON/CleanedData/AccountLevelFinal.csv"  # Update this with your actual file path
df = pd.read_csv(file_path)


In [31]:
# Compute PromoGamesCount if it's not in the dataset
if 'PromoGamesCount' not in df.columns:
    df['PromoGamesCount'] = df["TotalTicketsPurchased"] * df['GiveawayFraction']

# Select only numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Remove the target variable from independent features
features = [col for col in numeric_columns if col != 'PromoGamesCount']

X = df[features]
y = df['PromoGamesCount']

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display features with high VIF
print(vif_data.sort_values(by="VIF", ascending=False))


  vif = 1. / (1. - r_squared_i)


                          Feature        VIF
21       UniqueGamesAttended_norm        inf
33               Tier_AB_Fraction        inf
31                Tier_D_Fraction        inf
30                Tier_C_Fraction        inf
29                Tier_B_Fraction        inf
28                Tier_A_Fraction        inf
27                   Tier_D_Count        inf
26                   Tier_C_Count        inf
25                   Tier_B_Count        inf
24                   Tier_A_Count        inf
22      BasketballPropensity_norm        inf
35         TierA_Weekday_Fraction        inf
20             GamesAttended_norm        inf
19     UniqueGamesAttended_capped        inf
18           GamesAttended_capped        inf
17     TotalTicketsPurchased_norm        inf
16   TotalTicketsPurchased_capped        inf
15  TotNumTicketsPurchased_capped        inf
14           DistanceToArena_norm        inf
13         DistanceToArena_capped        inf
12                  AvgSpend_norm        inf
11        

In [4]:
# Split into training & test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [5]:
# Get coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
coefficients = coefficients.sort_values(by="Coefficient", ascending=False)


In [6]:
# Model evaluation
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared:.4f}")

R-squared value: 0.6425


In [7]:
# Display feature importance
print("\nFeature Coefficients:")
print(coefficients)


Feature Coefficients:
                          Feature   Coefficient
41               GiveawayFraction  3.729410e+00
28                Tier_A_Fraction  3.942413e-01
30                Tier_C_Fraction  3.061438e-01
6                   GamesAttended  1.866576e-01
10          TotalTicketsPurchased  1.437174e-01
27                   Tier_D_Count  1.415991e-01
25                   Tier_B_Count  1.256823e-01
0                          Season  6.198523e-02
33               Tier_AB_Fraction  5.508045e-02
16   TotalTicketsPurchased_capped  4.465559e-02
15  TotNumTicketsPurchased_capped  4.465559e-02
9             UniqueGamesAttended  4.290750e-03
17     TotalTicketsPurchased_norm  2.977039e-03
11                AvgSpend_capped  9.622822e-04
13         DistanceToArena_capped  2.212115e-04
12                  AvgSpend_norm  4.897110e-06
14           DistanceToArena_norm  7.789136e-07
22      BasketballPropensity_norm -9.468870e-08
1                   AccountNumber -2.087606e-06
7                

In [8]:
# Run OLS Regression for detailed statistical analysis
X_const = sm.add_constant(X)  # Add intercept
ols_model = sm.OLS(y, X_const).fit()
print("\nOLS Regression Results:\n", ols_model.summary())


OLS Regression Results:
                             OLS Regression Results                            
Dep. Variable:        PromoGamesCount   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     9533.
Date:                Mon, 24 Feb 2025   Prob (F-statistic):               0.00
Time:                        16:18:05   Log-Likelihood:            -1.0031e+05
No. Observations:               42016   AIC:                         2.007e+05
Df Residuals:                   41986   BIC:                         2.009e+05
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------

In [16]:

# Manually drop features with VIF = inf
drop_features = [
    "UniqueGamesAttended_norm", "Tier_AB_Fraction", "Tier_CD_Fraction", 
    "Tier_A_Fraction", "Tier_B_Fraction", "Tier_C_Fraction", "Tier_D_Fraction",
    "TierA_Weekday_Fraction", "TierB_Weekday_Fraction", "TierC_Weekday_Fraction", "TierD_Weekday_Fraction",
    "TotalTicketsPurchased", "TotalTicketsPurchased_capped", "TotalTicketsPurchased_norm", "TotNumTicketsPurchased_capped",
    "AvgSpend_norm", "AvgSpend_capped", "DistanceToArena_norm", "DistanceToArena_capped",
    "BasketballPropensity", "BasketballPropensity_norm"
]

df = df.drop(columns=drop_features, errors='ignore')

# Select only numeric features
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Define X and y
X = df[numeric_columns]
X = X.drop(columns=['PromoGamesCount'], errors='ignore')  # Drop target variable
y = df['PromoGamesCount']

# Compute VIF after removing highly correlated features
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data.sort_values(by="VIF", ascending=False)

# Recalculate VIF
vif_data = calculate_vif(X)
print("\nUpdated VIF values:")
print(vif_data)

# Drop features with VIF > 10 iteratively
while vif_data["VIF"].max() > 10:
    feature_to_drop = vif_data.iloc[0]["Feature"]
    print(f"Dropping {feature_to_drop} due to high VIF...")
    X = X.drop(columns=[feature_to_drop])
    vif_data = calculate_vif(X)
    print("\nUpdated VIF values:")
    print(vif_data)

# Train the regression model after feature selection
X_const = sm.add_constant(X)  # Add intercept
ols_model = sm.OLS(y, X_const).fit()
print("\nFinal OLS Regression Results:\n", ols_model.summary())


  vif = 1. / (1. - r_squared_i)



Updated VIF values:
                       Feature        VIF
9         GamesAttended_capped        inf
11          GamesAttended_norm        inf
0                       Season  43.042754
10  UniqueGamesAttended_capped  29.432473
17             WeekendFraction  20.406092
19     TierCD_Weekday_Fraction  16.495675
8          UniqueGamesAttended  16.245161
6                GamesAttended   7.447876
12  SocialMediaEngagement_norm   7.324233
18     TierAB_Weekday_Fraction   5.664093
16                Tier_D_Count   5.229602
14                Tier_B_Count   4.364627
1                AccountNumber   3.918970
15                Tier_C_Count   3.355267
4                 GroupTickets   3.159256
3           PartialPlanTickets   3.077502
13                Tier_A_Count   2.443990
2            SingleGameTickets   2.056667
5                     AvgSpend   1.965003
20            GiveawayFraction   1.435797
7              DistanceToArena   1.235293
Dropping GamesAttended_capped due to high VIF...

Updat

In [33]:
import numpy as np

def get_fan_segment_bonus(segment):
    bonuses = {"G": 1.0, "F": 0.8, "E": 0.4}
    return bonuses.get(segment, 0) 

# Define a function to compute the PromoPlan_Score
def calculate_promo_plan_score(row):
    score = (
        .60 * row["GiveawayFraction"]
        + 0.25 * row["SocialMediaEngagement_norm"]
        + 0.15 * get_fan_segment_bonus(row["FanSegment"])
        + 0.03 * row["UniqueGamesAttended"]
        + 0.0006 * row["AvgSpend"]
        - 0.43 * row["TierCD_Weekday_Fraction"]
    )

    return score
    

# Apply the function to compute scores
df["PromoPlan_Score"] = df.apply(calculate_promo_plan_score, axis=1)

min_score = df["PromoPlan_Score"].min()
max_score = df["PromoPlan_Score"].max()

df["PromoPlan_Score"] = (df["PromoPlan_Score"] - min_score) / (max_score - min_score)


# Select and display only the relevant columns
columns_to_display = ["GiveawayFraction", "FanSegment", "SocialMediaEngagement_norm", "PromoPlan_Score"]
print(df[columns_to_display])


       GiveawayFraction    FanSegment  SocialMediaEngagement_norm  \
0                   1.0             F                         0.3   
1                   0.0             A                         0.3   
2                   0.0             B                         0.3   
3                   0.0             C                         1.0   
4                   0.0             A                         0.6   
...                 ...           ...                         ...   
42011               0.0             A                         0.6   
42012               0.0             A                         1.0   
42013               1.0             D                         0.6   
42014               0.0  Limited Data                         1.0   
42015               0.0             A                         1.0   

       PromoPlan_Score  
0             0.463950  
1             0.183542  
2             0.097179  
3             0.146944  
4             0.104702  
...                ..