<h1 style="color: #006400;">Creating a Machine Learning Model</h1>
<h3 style="color: #8b5e3c;">Applying machine learning to create labels for our data</h3>
<p style="color: #8b5e3c;">Mathias Galvan</p>

<h2 style="color: #006400;"> Loading the Data </h2>

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.cluster import MiniBatchKMeans, Birch
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Directory holding the yearly ALGLSL_<year>.csv files. Set the BUCKS_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original hard-coded location is used.
DATA_DIR = Path(os.environ.get(
    'BUCKS_DATA_DIR',
    r'/Users/galvanm/python/BucksHackathon25/BucksDatasets',
))

acc_game_seat_2023 = pd.read_csv(DATA_DIR / 'ALGLSL_2023.csv')
acc_game_seat_2024 = pd.read_csv(DATA_DIR / 'ALGLSL_2024.csv')

# Last expression of the cell: preview the first rows of the 2023 frame.
acc_game_seat_2023.head()

In [None]:
# Print a structural summary of the 2024 frame (columns, non-null counts, dtypes).
acc_game_seat_2024.info()

In [None]:
# Display descriptive statistics for the 2024 frame (pandas defaults, i.e. the
# numeric columns when the frame has mixed dtypes).
acc_game_seat_2024.describe()

In [None]:
# Columns to store as pandas categoricals in both yearly frames.
# `astype` returns a new object instead of converting in place, so the result
# has to be assigned back for the conversion to take effect.
category_dtypes = {
    'GiveawayLabel': 'category',
    'STM': 'category',
    'SocialMediaEngagement': 'category',
    'GameTier': 'category',
}
acc_game_seat_2024 = acc_game_seat_2024.astype(category_dtypes)
acc_game_seat_2023 = acc_game_seat_2023.astype(category_dtypes)

# Last expression of the cell: show the resulting dtypes so the conversion can
# be checked at a glance.
acc_game_seat_2023[list(category_dtypes)].dtypes

In [None]:
# Columns routed through StandardScaler.
num_features = [
    'BasketballPropensity', 'DistanceToArena', 'AvgSpend', 'GamesAttended',
    'SingleGameTickets', 'PartialPlanTickets', 'GroupTickets',
]
# Columns routed through OneHotEncoder.
cat_features = ['GiveawayLabel', 'STM', 'SocialMediaEngagement', 'GameTier']

# One sub-pipeline per column group.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    # Unknown categories at transform time are ignored rather than raising;
    # the encoder emits a dense array.
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
])

# Reduce the preprocessed matrix to 10 components, with a fixed seed.
reducer = TruncatedSVD(n_components=10, random_state=42)


In [None]:
# Preprocess -> 10-component SVD -> MiniBatchKMeans with 8 clusters.
# A Pipeline keeps references to the estimator objects it is given, so the
# shared `preprocessor` / `reducer` templates are cloned here: fitting this
# pipeline then cannot alter, or be altered by, any other pipeline built from
# the same templates. Clones keep the same parameters (including the seed).
mbk_pipeline = Pipeline([
    ('prep', clone(preprocessor)),
    ('svd', clone(reducer)),
    ('cluster', MiniBatchKMeans(
        n_clusters=8,
        batch_size=10000,
        random_state=42
    ))
])

In [None]:
# Preprocess -> 10-component SVD -> Birch with 8 clusters.
# A Pipeline keeps references to the estimator objects it is given, so the
# shared `preprocessor` / `reducer` templates are cloned here: fitting this
# pipeline then cannot alter, or be altered by, any other pipeline built from
# the same templates. Clones keep the same parameters (including the seed).
birch_pipeline = Pipeline([
    ('prep', clone(preprocessor)),
    ('svd', clone(reducer)),
    ('cluster', Birch(
        n_clusters=8,
        threshold=0.5
    ))
])

In [None]:
# Stack the 2023 and 2024 frames row-wise under a fresh integer index, then
# select the model inputs: numeric columns first, categorical columns second.
full_df = pd.concat(
    (acc_game_seat_2023, acc_game_seat_2024),
    axis=0,
    ignore_index=True,
)
X = full_df[[*num_features, *cat_features]]


In [None]:
# Fit the MiniBatchKMeans pipeline on X, label every row of X with the fitted
# model, and store the labels next to the source rows.
labels_mbk = mbk_pipeline.fit(X).predict(X)
full_df['cluster_mbk'] = labels_mbk


In [None]:
# Fit the Birch pipeline on X, label every row of X with the fitted model, and
# store the labels next to the source rows.
labels_birch = birch_pipeline.fit(X).predict(X)
full_df['cluster_birch'] = labels_birch



In [None]:
# Print the distinct labels each clustering produced (MiniBatchKMeans first,
# then Birch).
for label_column in ('cluster_mbk', 'cluster_birch'):
    print(full_df[label_column].unique())

In [None]:
# One boxplot per numeric feature, grouped by MiniBatchKMeans cluster.
for feat in num_features:
    # Create the axes explicitly and hand them to pandas. Calling plt.figure()
    # and then boxplot(by=...) without `ax` leaves the first figure empty,
    # because pandas opens a figure of its own for grouped boxplots.
    fig, ax = plt.subplots()
    full_df.boxplot(column=feat, by='cluster_mbk', ax=ax)
    ax.set_title(feat)
    ax.set_xlabel('Cluster')
    ax.set_ylabel(feat)
    plt.show()

In [None]:
# One boxplot per numeric feature, grouped by Birch cluster.
for feature_name in num_features:
    full_df.boxplot(by='cluster_birch', column=feature_name)
    # Label the axes pandas has just drawn on (the current axes).
    current_ax = plt.gca()
    current_ax.set_title(feature_name)
    current_ax.set_xlabel('Cluster')
    current_ax.set_ylabel(feature_name)
    plt.show()

In [None]:
# Birch labels for the rows of X. The pipeline was already fitted on X when the
# `cluster_birch` column was computed, so the labels stored on the fitted
# estimator are reused instead of running the whole fit a second time.
cluster_labels = birch_pipeline.named_steps['cluster'].labels_

# Separate 2-component projection used only for plotting. The preprocessor is
# cloned so that fitting this pipeline does not refit the `preprocessor`
# object that other pipelines may hold a reference to.
viz_pipeline = Pipeline([
    ('prep', clone(preprocessor)),
    ('svd2', TruncatedSVD(n_components=2, random_state=42))
])
coords_2d = viz_pipeline.fit_transform(X)


In [None]:
# Set the figure background BEFORE creating the figure: the style setting only
# applies to figures created afterwards, so setting it at the end of the cell
# left this figure with the previous background on the first run.
sns.set_style({'figure.facecolor': '#EEE1C6'})

# Scatter of the 2-D projection, one colour per Birch cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    coords_2d[:, 0],
    coords_2d[:, 1],
    c=cluster_labels,
    cmap='tab10',
    s=30,
    alpha=0.7,
)
ax.set_title('Birch Clusters with TruncatedSVD')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
# legend_elements() returns the (handles, labels) pair for the colour mapping.
ax.legend(
    *scatter.legend_elements(),
    title='Cluster'
)

fig.tight_layout()
plt.show()