In [9]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load dataset
file_path = "AccountLevelFinal.csv"  # Update this with your actual file path
df = pd.read_csv(file_path)

In [10]:
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Remove the target variable from independent features
features = [col for col in numeric_columns if col != 'GiveawayFraction']

X = df[features]
y = df['GiveawayFraction']

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute VIF for each feature
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display features with high VIF
print(vif_data.sort_values(by="VIF", ascending=False))


  vif = 1. / (1. - r_squared_i)


                  Feature        VIF
16          TierAFraction        inf
28  TierCDWeekendFraction        inf
27  TierABWeekendFraction        inf
26   TierDWeekendFraction        inf
25   TierCWeekendFraction        inf
24   TierBWeekendFraction        inf
23   TierAWeekendFraction        inf
21         TierCDFraction        inf
20         TierABFraction        inf
19          TierDFraction        inf
18          TierCFraction        inf
17          TierBFraction        inf
22        WeekendFraction  15.589354
29                Cluster  10.803616
15                  TierD   7.991956
14                  TierC   7.959717
10    UniqueGamesAttended   4.077305
11  TotalTicketsPurchased   3.817376
0                  Season   3.381383
13                  TierB   3.155921
1           AccountNumber   3.098684
3      PartialPlanTickets   2.261490
12                  TierA   2.215849
4            GroupTickets   1.809301
6           GamesAttended   1.792814
2       SingleGameTickets   1.217327
5

In [12]:
# Split into training & test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [13]:
# Get coefficients
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
coefficients = coefficients.sort_values(by="Coefficient", ascending=False)


In [14]:
# Model evaluation
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared:.4f}")

R-squared value: 0.4289


In [15]:
# Display feature importance
print("\nFeature Coefficients:")
print(coefficients)


Feature Coefficients:
                  Feature   Coefficient
22        WeekendFraction  9.970563e-01
10    UniqueGamesAttended  5.170548e-01
29                Cluster  5.016726e-01
13                  TierB  4.476796e-01
21         TierCDFraction  3.121417e-01
27  TierABWeekendFraction  2.511637e-01
19          TierDFraction  2.217331e-01
23   TierAWeekendFraction  1.382658e-01
24   TierBWeekendFraction  1.128979e-01
18          TierCFraction  9.040862e-02
15                  TierD  8.288812e-02
28  TierCDWeekendFraction  7.746678e-02
25   TierCWeekendFraction  5.977459e-02
11  TotalTicketsPurchased  3.913894e-02
26   TierDWeekendFraction  1.769219e-02
7         DistanceToArena  7.063320e-03
6           GamesAttended  2.009795e-03
1           AccountNumber -5.469198e-08
3      PartialPlanTickets -2.144799e-04
4            GroupTickets -3.782723e-04
2       SingleGameTickets -5.690512e-04
9   SocialMediaEngagement -7.483338e-03
8    BasketballPropensity -2.119865e-02
5                

In [16]:
# Run OLS Regression for detailed statistical analysis
X_const = sm.add_constant(X)  # Add intercept
ols_model = sm.OLS(y, X_const).fit()
print("\nOLS Regression Results:\n", ols_model.summary())


OLS Regression Results:
                             OLS Regression Results                            
Dep. Variable:       GiveawayFraction   R-squared:                       0.433
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     1284.
Date:                Thu, 27 Feb 2025   Prob (F-statistic):               0.00
Time:                        00:15:32   Log-Likelihood:                -8693.1
No. Observations:               42016   AIC:                         1.744e+04
Df Residuals:                   41990   BIC:                         1.766e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
cons

In [18]:

# Manually drop features with VIF = inf
drop_features = ["TierAFraction",
"TierCDWeekendFraction",
"TierABWeekendFraction",
"TierDWeekendFraction",
"TierCWeekendFraction",
"TierBWeekendFraction",
"TierAWeekendFraction",
"TierCDFraction",
"TierABFraction",
"TierDFraction",
"TierCFraction",
"TierBFraction" 
]

df = df.drop(columns=drop_features, errors='ignore')

# Select only numeric features
numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Define X and y
X = df[numeric_columns]
X = X.drop(columns=['GiveawayFraction'], errors='ignore')  # Drop target variable
y = df['GiveawayFraction']

# Compute VIF after removing highly correlated features
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data.sort_values(by="VIF", ascending=False)

# Recalculate VIF
vif_data = calculate_vif(X)
print("\nUpdated VIF values:")
print(vif_data)

# Drop features with VIF > 10 iteratively
while vif_data["VIF"].max() > 10:
    feature_to_drop = vif_data.iloc[0]["Feature"]
    print(f"Dropping {feature_to_drop} due to high VIF...")
    X = X.drop(columns=[feature_to_drop])
    vif_data = calculate_vif(X)
    print("\nUpdated VIF values:")
    print(vif_data)

# Train the regression model after feature selection
X_const = sm.add_constant(X)  # Add intercept
ols_model = sm.OLS(y, X_const).fit()
print("\nFinal OLS Regression Results:\n", ols_model.summary())



Updated VIF values:
                  Feature        VIF
0                  Season  59.526120
17                Cluster  17.702446
16        WeekendFraction   9.428519
7         DistanceToArena   8.226320
8    BasketballPropensity   8.134485
9   SocialMediaEngagement   7.323334
11  TotalTicketsPurchased   6.515700
15                  TierD   5.547765
14                  TierC   4.790507
1           AccountNumber   3.941437
10    UniqueGamesAttended   3.427489
5                AvgSpend   3.048511
6           GamesAttended   2.928986
13                  TierB   2.730521
2       SingleGameTickets   2.220603
3      PartialPlanTickets   2.051215
12                  TierA   1.881366
4            GroupTickets   1.749835
Dropping Season due to high VIF...

Updated VIF values:
                  Feature       VIF
16                Cluster  8.096407
7    BasketballPropensity  7.469368
6         DistanceToArena  7.340971
8   SocialMediaEngagement  6.526997
10  TotalTicketsPurchased  6.091211
15  

In [26]:
import pandas as pd
import numpy as np

# Function to return bonus based on fan segment
def get_fan_segment_bonus(segment):
    bonuses = {"G": 1.0, "F": 0.8, "E": 0.4, "C": -0.1, "D": -0.05}
    return bonuses.get(segment, 0)

# Function to compute the PromoPlan_Score for each row
def calculate_promo_plan_score(row):
    score = (
        0.60 * row["GiveawayFraction"]
        + 0.15 * row["SocialMediaEngagement"]
        + 0.15 * get_fan_segment_bonus(row["FanSegment"])
        + 0.03 * row["UniqueGamesAttended"]
        + 0.0006 * row["AvgSpend"]
    )
    return score

# Load the CSV file into a DataFrame
df = pd.read_csv("accountlevelfinal.csv")

# Calculate the raw promo plan scores and add them as a new column
df["PromoPlan_Score"] = df.apply(calculate_promo_plan_score, axis=1)

# Normalize the scores so that the minimum is 0 and the maximum is 1
min_score = df["PromoPlan_Score"].min()
max_score = df["PromoPlan_Score"].max()

if max_score - min_score != 0:
    df["PromoPlan_Score"] = (df["PromoPlan_Score"] - min_score) / (max_score - min_score)
else:
    # If all scores are the same, set them all to 0 (or adjust as needed)
    df["PromoPlan_Score"] = 0.0

# Optional: Display the relevant columns to verify the scores
columns_to_display = ["GiveawayFraction", "FanSegment", "SocialMediaEngagement", "PromoPlan_Score"]
print(df[columns_to_display].head())

# Write the updated DataFrame back to the CSV file (or a new CSV if preferred)
df.to_csv("accountlevelfinal.csv", index=False)


   GiveawayFraction FanSegment  SocialMediaEngagement  PromoPlan_Score
0               1.0          F                    0.3         0.844934
1               0.0          A                    0.3         0.017633
2               0.0          B                    0.3         0.017601
3               0.0          C                    1.0         0.120698
4               0.0          A                    0.6         0.068964


In [28]:
# Define a bonus function unique for the Value Plan
def get_fan_segment_bonus_value(segment):
    bonuses = {"D": 1, "G": -0.5, "F": -0.5}
    return bonuses.get(segment, 0)

# Define the function to calculate the ValuePlan_Score for each row
def calculate_value_plan_score(row):
    bonus = get_fan_segment_bonus_value(row["FanSegment"])
    score = (
        0.25 * (1 - row["AvgSpend"]) +
        0.25 * row["TierCDWeekendFraction"] +
        0.15 * (1 - row["DistanceToArena"]) +
        0.15 * (1 - row["UniqueGamesAttended"]) +
        0.20 * bonus
    )
    return score

# Calculate the raw ValuePlan_Score and add it as a new column
df["ValuePlan_Score"] = df.apply(calculate_value_plan_score, axis=1)

# Normalize the ValuePlan_Score so that the minimum is 0 and the maximum is 1
min_value = df["ValuePlan_Score"].min()
max_value = df["ValuePlan_Score"].max()

if max_value - min_value != 0:
    df["ValuePlan_Score"] = (df["ValuePlan_Score"] - min_value) / (max_value - min_value)
else:
    df["ValuePlan_Score"] = 0.0

# Optional: Display relevant columns to verify the new score
columns_to_display = ["TierCDWeekendFraction", "AvgSpend", "DistanceToArena", "UniqueGamesAttended", "FanSegment", "ValuePlan_Score"]
print(df[columns_to_display].head())

# Save the updated DataFrame back to the CSV file
df.to_csv("accountlevelfinal.csv", index=False)


   TierCDWeekendFraction  AvgSpend  DistanceToArena  UniqueGamesAttended  \
0                    0.0  1.000000         0.381944                  0.0   
1                    0.0  0.590331         0.462963                  0.0   
2                    0.0  0.544529         0.368056                  0.0   
3                    0.0  0.137405         0.361111                  0.0   
4                    0.0  0.071247         0.363426                  0.0   

  FanSegment  ValuePlan_Score  
0          F         0.088370  
1          A         0.256156  
2          B         0.278808  
3          C         0.369483  
4          A         0.383762  


In [30]:
# Define a bonus function unique for the Marquee Plan
def get_fan_segment_bonus_marquee(segment):
    # Bonus mapping: 1 for G, 0.75 for F, 0.4 for C, 0.2 for E; default 0 for others
    bonuses = {"G": 1.0, "F": 0.75, "C": 0.4, "E": 0.2}
    return bonuses.get(segment, 0)

# Helper function to normalize a pandas Series using min-max normalization
def normalize_column(series):
    min_val = series.min()
    max_val = series.max()
    if max_val - min_val != 0:
        return (series - min_val) / (max_val - min_val)
    else:
        return series - min_val  # Returns all zeros if constant

# Normalize 'AvgSpend' and 'BasketballPropensity'
df["AvgSpend_norm"] = normalize_column(df["AvgSpend"])
df["BasketballPropensity_norm"] = normalize_column(df["BasketballPropensity"])

# Calculate the raw MarqueePlan_Score using the provided formula
df["MarqueePlan_Score"] = (
    0.30 * df["AvgSpend"] +
    0.30 * df["TierABFraction"] +
    0.20 * df["BasketballPropensity"] +
    0.20 * df["FanSegment"].apply(get_fan_segment_bonus_marquee)
)

# Normalize the MarqueePlan_Score so that its minimum is 0 and maximum is 1
min_marquee = df["MarqueePlan_Score"].min()
max_marquee = df["MarqueePlan_Score"].max()
if max_marquee - min_marquee != 0:
    df["MarqueePlan_Score"] = (df["MarqueePlan_Score"] - min_marquee) / (max_marquee - min_marquee)
else:
    df["MarqueePlan_Score"] = 0.0

# Display relevant columns to verify the calculations
columns_to_display = [
    "AvgSpend",
    "TierABFraction",
    "BasketballPropensity", "BasketballPropensity",
    "FanSegment", "MarqueePlan_Score"
]
print(df[columns_to_display].head())

# Save the updated DataFrame back to the CSV file
df.to_csv("accountlevelfinal.csv", index=False)


   AvgSpend  TierABFraction  BasketballPropensity  BasketballPropensity  \
0  1.000000             0.0              0.860599              0.860599   
1  0.590331             1.0              0.414747              0.414747   
2  0.544529             0.0              0.888249              0.888249   
3  0.137405             0.0              0.394009              0.394009   
4  0.071247             0.0              0.526498              0.526498   

  FanSegment  MarqueePlan_Score  
0          F           0.619137  
1          A           0.555686  
2          B           0.331776  
3          C           0.187657  
4          A           0.112677  


In [31]:
# Define a bonus function unique for the Weekend Plan
def get_fan_segment_bonus_weekend(segment):
    # Bonus mapping for the Weekend Plan:
    # Adjust these values as needed if your intended mapping differs.
    bonuses = {"B": 0.8, "E": 1.0, "G": 0.5, "F": 0.4}
    return bonuses.get(segment, 0)

# Calculate the raw WeekendPlan_Score using the provided formula
df["WeekendPlan_Score"] = (
    0.60 * df["WeekendFraction"] +
    0.10 * df["DistanceToArena"] +
    0.10 * df["AvgSpend"] +
    0.20 * df["FanSegment"].apply(get_fan_segment_bonus_weekend)
)

# Normalize the WeekendPlan_Score so that its minimum is 0 and maximum is 1
min_weekend = df["WeekendPlan_Score"].min()
max_weekend = df["WeekendPlan_Score"].max()
if max_weekend - min_weekend != 0:
    df["WeekendPlan_Score"] = (df["WeekendPlan_Score"] - min_weekend) / (max_weekend - min_weekend)
else:
    df["WeekendPlan_Score"] = 0.0

# Optional: Display relevant columns to verify the calculations
columns_to_display = [
    "WeekendFraction",
    "DistanceToArena",
    "AvgSpend",
    "FanSegment",
    "WeekendPlan_Score"
]
print(df[columns_to_display].head())

# Save the updated DataFrame back to the CSV file
df.to_csv("accountlevelfinal.csv", index=False)


   WeekendFraction  DistanceToArena  AvgSpend FanSegment  WeekendPlan_Score
0              0.0         0.381944  1.000000          F           0.205675
1              0.0         0.462963  0.590331          A           0.091003
2              0.0         0.368056  0.544529          B           0.239269
3              0.0         0.361111  0.137405          C           0.034637
4              0.0         0.363426  0.071247          A           0.028150


In [34]:
# Define a function to assign the scoring plan based on the highest score
def assign_scoring_plan(row):
    scores = {
        "ValuePlan": row["ValuePlan_Score"],
        "PromoPlan": row["PromoPlan_Score"],
        "MarqueePlan": row["MarqueePlan_Score"],
        "WeekendPlan": row["WeekendPlan_Score"]
    }
    # Find the key with the maximum score
    return max(scores, key=scores.get)

# Create the new Scoring_Plan column using the function above
df["Scoring_Plan"] = df.apply(assign_scoring_plan, axis=1)

# Optional: Display a few rows to verify the new column
print(df[["ValuePlan_Score", "PromoPlan_Score", "MarqueePlan_Score", "WeekendPlan_Score", "Scoring_Plan"]].head())

# Save the updated DataFrame back to the CSV file
df.to_csv("accountlevelfinal.csv", index=False)


   ValuePlan_Score  PromoPlan_Score  MarqueePlan_Score  WeekendPlan_Score  \
0         0.088370         0.844934           0.619137           0.205675   
1         0.256156         0.017633           0.555686           0.091003   
2         0.278808         0.017601           0.331776           0.239269   
3         0.369483         0.120698           0.187657           0.034637   
4         0.383762         0.068964           0.112677           0.028150   

  Scoring_Plan  
0    PromoPlan  
1  MarqueePlan  
2  MarqueePlan  
3    ValuePlan  
4    ValuePlan  


In [33]:
# Define the columns to remove
columns_to_remove = ["BasketballPropensity_norm", "AvgSpend_norm", "PromoPlanScore"]

# Drop the specified columns from the DataFrame, ignoring errors if a column is not found
df = df.drop(columns=columns_to_remove, errors='ignore')

# Optional: Display the remaining columns to verify
print("Remaining columns in the dataset:")
print(df.columns.tolist())

# Save the updated DataFrame back to the CSV file
df.to_csv("accountlevelfinal.csv", index=False)


Remaining columns in the dataset:
['Season', 'AccountNumber', 'SingleGameTickets', 'PartialPlanTickets', 'GroupTickets', 'AvgSpend', 'GamesAttended', 'FanSegment', 'DistanceToArena', 'BasketballPropensity', 'SocialMediaEngagement', 'UniqueGamesAttended', 'TotalTicketsPurchased', 'TierA', 'TierB', 'TierC', 'TierD', 'TierAFraction', 'TierBFraction', 'TierCFraction', 'TierDFraction', 'TierABFraction', 'TierCDFraction', 'FavoriteTier', 'WeekendFraction', 'TierAWeekendFraction', 'TierBWeekendFraction', 'TierCWeekendFraction', 'TierDWeekendFraction', 'TierABWeekendFraction', 'TierCDWeekendFraction', 'GiveawayFraction', 'Cluster', 'Plan (5)', 'PromoPlan_Score', 'ValuePlan_Score', 'MarqueePlan_Score', 'WeekendPlan_Score', 'Scoring_Plan']
