<h1 style="color: #006400;">Data Imputations</h1>
<h3 style="color: #8b5e3c;">Filling in missing data using random sampling, the median, and KNN</h3>

<p style="color: #8b5e3c;">Mathias Galvan</p>
<h2 style="color: #006400;"> Loading the Data </h2>

In [None]:
import pandas as pd

# Read the AccountLevel, SeatLevel and GameLevel CSV files into DataFrames.
account = pd.read_csv(r"C:/Users/galvanm/python/BucksHackathon25/BucksDatasets/AccountLevel.csv")
seat = pd.read_csv(r"C:/Users/galvanm/python/BucksHackathon25/BucksDatasets/SeatLevel.csv")
game = pd.read_csv(r"C:/Users/galvanm/python/BucksHackathon25/BucksDatasets/GameLevel.csv")

# `astype` returns a converted copy instead of modifying the column in place,
# so the result must be assigned back for the categorical dtype to stick.
account['Season'] = account['Season'].astype('category')
account['STM'] = account['STM'].astype('category')
account.head()

<h3>Checking which features contain NaN</h3>

In [None]:

# Report, one column at a time, whether the account table contains any NaN.
for column_name in account.columns:
    column_has_nan = account[column_name].isna().values.any()
    print(f"Nan in AccountLevel[{column_name}]: {column_has_nan}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Account features whose distributions are drawn as boxplots.
cols = [
    'SingleGameTickets',
    'PartialPlanTickets',
    'GroupTickets',
    'AvgSpend',
    'GamesAttended',
    'DistanceToArena',
    'BasketballPropensity',
]

# Grid dimensions: 3 x 4 yields twelve axes, more than the listed features need.
rows, columns = 3, 4

# Build the figure and flatten the 2-D axes array so it can be walked linearly.
fig, axes = plt.subplots(nrows=rows, ncols=columns, figsize=(20, 12))
axes = axes.flatten()

# One boxplot per feature, each drawn on its own axis.
for col, ax in zip(cols, axes):
    sns.boxplot(x=account[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)

# Remove the axes that did not receive a plot.
for ax in axes[len(cols):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np

# Columns whose missing values are filled by sampling from their observed values.
col_r_sampling = ['SingleGameTickets', 'PartialPlanTickets', 'GroupTickets', 'AvgSpend', 'GamesAttended', 'DistanceToArena']

# Show every row that has at least one missing value before imputing.
nan_rows = account[account.isna().any(axis=1)]
print(nan_rows)

# Seeded generator: the sampled fill values are written back to disk later in
# the notebook, so they must be identical on every Restart & Run All.
SAMPLING_SEED = 42
rng = np.random.default_rng(SAMPLING_SEED)

for i in col_r_sampling:
    missing_mask = account[i].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to fill in this column.
        continue
    # Draw (with replacement) from the observed values of the same column.
    non_missing = account[i].dropna().values
    account.loc[missing_mask, i] = rng.choice(non_missing, size=n_missing)

# Spot-check a single row after imputation.
account.iloc[70]


In [None]:
# Fill missing BasketballPropensity values with the column median.
median_bp = account['BasketballPropensity'].median()
# Assign the filled column back: calling fillna(..., inplace=True) on a column
# selection acts on an intermediate object and, under pandas Copy-on-Write,
# leaves `account` unchanged.
account['BasketballPropensity'] = account['BasketballPropensity'].fillna(median_bp)

# Re-run the per-column NaN report to confirm the imputations took effect.
for i in account.columns:
    has_nans = account[i].isna().values.any()
    print(f"Nan in AccountLevel[{i}]: {has_nans}")

### Relabeling FanSegment using KNN

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

def impute_fan_segment(df):
    """Replace 'Limited Data' FanSegment labels with KNN-predicted segments.

    Rows whose ``FanSegment`` differs from ``'Limited Data'`` are used to train
    a pipeline (standard-scaled numeric features, one-hot encoded categorical
    feature, ``KNeighborsClassifier`` with default settings). The fitted
    pipeline then predicts a segment for every ``'Limited Data'`` row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``FanSegment`` plus the feature columns
        ``GamesAttended``, ``DistanceToArena``, ``BasketballPropensity``,
        ``STM``, ``AvgSpend`` and ``SocialMediaEngagement``.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by index, with the same rows as ``df`` and the
        'Limited Data' labels replaced. ``df`` itself is not modified.
    """
    num_features = ['GamesAttended', 'DistanceToArena', 'BasketballPropensity', 'STM', 'AvgSpend']
    cat_features = ['SocialMediaEngagement']

    labeled = df['FanSegment'] != 'Limited Data'

    # Nothing to relabel (e.g. the data was already imputed on a previous run):
    # predicting on an empty frame would raise, so return a sorted copy instead.
    if labeled.all():
        return df.sort_index()

    labeled_df = df[labeled].copy()
    unlabeled_df = df[~labeled].copy()

    X_train = labeled_df[num_features + cat_features]
    y_train = labeled_df['FanSegment']

    # Encode the segment names as integers for training; decoded again below.
    le = LabelEncoder()
    y_enc = le.fit_transform(y_train)

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    pipeline = Pipeline([
        ('pre', preprocessor),
        ('knn', KNeighborsClassifier())
    ])

    pipeline.fit(X_train, y_enc)

    X_unlab = unlabeled_df[num_features + cat_features]
    y_pred = pipeline.predict(X_unlab)
    unlabeled_df['FanSegment'] = le.inverse_transform(y_pred)

    # Recombine both parts and restore the original index order.
    df_imputed = pd.concat([labeled_df, unlabeled_df]).sort_index()
    return df_imputed

In [None]:
# Rebind `account` to the frame in which 'Limited Data' FanSegment labels
# have been replaced by KNN predictions.
account = impute_fan_segment(account)

### Adding Giveaway Labels (0 or 1)

In [None]:
def giveaway_label(x):
    """Return 0 when ``x`` is missing (as judged by ``pd.isna``), otherwise 1."""
    return 0 if pd.isna(x) else 1


In [None]:
# Flag each game: 1 when its Giveaway entry is present, 0 when it is missing.
game['GiveawayLabel'] = game['Giveaway'].apply(giveaway_label)

### Overwriting Datasets with Imputations

In [None]:
# Write the three tables back to disk.
# index=False keeps the row index out of the files; otherwise it is saved as an
# extra unnamed column that shows up as 'Unnamed: 0' the next time the CSVs are
# loaded with a plain pd.read_csv.
# NOTE(review): the load cell reads from a `C:/`-prefixed path while these paths
# are drive-relative; confirm both resolve to the same files.
account.to_csv(r'/Users/galvanm/python/BucksHackathon25/BucksDatasets/AccountLevel.csv', index=False)
seat.to_csv(r'/Users/galvanm/python/BucksHackathon25/BucksDatasets/SeatLevel.csv', index=False)
game.to_csv(r'/Users/galvanm/python/BucksHackathon25/BucksDatasets/GameLevel.csv', index=False)
