In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import re

import hvplot.pandas
import holoviews as hv

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the season standings CSV from the output folder into a Pandas DataFrame
standings_csv = "output/standings.csv"
df_data = pd.read_csv(standings_csv)

# Preview the first rows of the raw data
df_data.head()

Unnamed: 0,year,school,conference,wins,losses,winning_pct,conf_wins,conf_losses,conf_winning_pct,ppg_offense,ppg_defense,SRS,SOS
0,2000,Florida State,ACC,11.0,2.0,0.846,8.0,0.0,1.0,39.3,10.5,23.13,5.59
1,2000,Clemson,ACC,9.0,3.0,0.75,6.0,2.0,0.75,34.7,21.1,11.54,2.04
2,2000,Georgia Tech,ACC,9.0,3.0,0.75,6.0,2.0,0.75,32.2,19.8,11.3,1.21
3,2000,Virginia,ACC,6.0,6.0,0.5,5.0,3.0,0.625,20.2,24.3,0.1,3.18
4,2000,North Carolina State,ACC,8.0,4.0,0.667,4.0,4.0,0.5,31.6,28.2,3.92,-0.33


In [3]:
# Check the dataset to determine initial processing steps:
# info() prints the non-null count and dtype of every column, which shows
# which columns contain missing values that must be handled before modelling
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2835 entries, 0 to 2834
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   year              2835 non-null   int64  
 1   school            2835 non-null   object 
 2   conference        2835 non-null   object 
 3   wins              2835 non-null   float64
 4   losses            2835 non-null   float64
 5   winning_pct       2835 non-null   float64
 6   conf_wins         2724 non-null   float64
 7   conf_losses       2724 non-null   float64
 8   conf_winning_pct  2724 non-null   float64
 9   ppg_offense       2835 non-null   float64
 10  ppg_defense       2835 non-null   float64
 11  SRS               2835 non-null   float64
 12  SOS               2835 non-null   float64
dtypes: float64(10), int64(1), object(2)
memory usage: 288.1+ KB


# Data Preprocessing

In [4]:
# Replace every missing (null) value with zero
# NOTE(review): per the info() summary only the conf_* columns contain nulls;
# zero-filling makes those rows look like 0-0 conference records - confirm
# this is the intended treatment before clustering.
df_data = df_data.fillna(0)

# Build a UID of the form "(<school>)-<year>", then rewrite the
# Louisiana-Monroe and Nevada-Las Vegas names inside that UID
team_uid = '(' + df_data['school'] + ')' + '-' + df_data['year'].astype(str)
team_uid = team_uid.str.replace('Louisiana-Monroe', "Louisiana Monroe")
team_uid = team_uid.str.replace('Nevada-Las Vegas', "UNLV")

# Index the frame by the UID and drop the columns that are not model features
columns_to_drop = ['year', 'school', 'conference']
df_data = (
    df_data
    .assign(team=team_uid)
    .set_index('team')
    .drop(columns=columns_to_drop)
)

# Standardise every feature with scikit-learn's `StandardScaler()`
data_scaled = StandardScaler().fit_transform(df_data)

# Wrap the scaled array in a DataFrame that keeps the same columns
# and the same team index as the unscaled data
df_scaled = pd.DataFrame(
    data_scaled,
    columns=df_data.columns,
    index=df_data.index,
)

# Determine k and Model Dataset

In [5]:
# Candidate numbers of clusters to evaluate
k = list(range(1, 11))

# Fit one K-means model per candidate value and collect its inertia
# NOTE(review): n_init is left at the library default, which is not the same
# in every scikit-learn release - consider pinning it for reproducibility.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1).fit(df_scaled).inertia_
    for n_clusters in k
]

# Gather k and inertia into a DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow_data = pd.DataFrame(elbow_data)

# Draw the elbow curve
df_elbow_data.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)



In [6]:
# Number of clusters chosen from the Elbow Curve
k = 3

# Build the K-Means model, fit it on the scaled data,
# then predict the cluster of every row of that same data
model = KMeans(n_clusters=int(k), random_state=1)
groups = model.fit(df_scaled).predict(df_scaled)

# New DataFrame: the scaled data plus the predicted cluster in `group`
df_predictions = df_scaled.assign(group=groups)



# Generate User Input Graphs

#### Available input values (enter the column name exactly as written; names are case-sensitive):
* wins
* losses
* winning_pct
* ppg_offense
* ppg_defense
* SRS
* SOS

In [7]:
# Collect the plot settings from the user. Surrounding whitespace is stripped
# so a stray space typed at the prompt does not break the lookups below.
x = input('Select your independent variable (x axis value) from available inputs list').strip()
y = input('Select your dependent variable (y axis value) from available inputs list').strip()
team = input('Select the team to analyze').strip()

# Fail early, with a readable message, when an axis value is not a column
for axis_name, column in (('x', x), ('y', y)):
    if column not in df_predictions.columns:
        raise ValueError(
            f"Unknown {axis_name} axis value {column!r}; "
            f"choose one of {list(df_predictions.columns)}"
        )

# Select the rows of the chosen team: the index holds UIDs of the form
# "(<school>)-<year>", so match on the "(<team>)-" prefix
filter_condition = df_predictions.index.str.startswith(f'({team})-')
if not filter_condition.any():
    raise ValueError(f"No rows found for team {team!r}")

title = f'Clustering Analysis for {team} comparing {y} vs {x}'

# Generate the main scatter plot: every team-season, coloured by cluster group
scatter_plot = df_predictions.reset_index().hvplot.scatter(
    x=x,
    y=y,
    by="group",
    hover_cols="team"
).opts(title=title, yformatter="%.0f", width=750, height=500)

# Generate a scatter plot of the selected team only, drawn with
# black square ('s') markers so its seasons stand out
filtered_points = df_predictions.loc[filter_condition].reset_index().hvplot.scatter(
    x=x,
    y=y,
    by="group",
    hover_cols="team",
    marker='s',
    color='black',
    size=50
).opts(show_legend=False, width=750, height=500)

# Overlay the two plots: all points plus the highlighted team markers
final_plot = scatter_plot * filtered_points

# Render the final plot
final_plot

Select your independent variable (x axis value) from available inputs list SOS
Select your dependent variable (y axis value) from available inputs list SRS
Select the team to analyze Nebraska


# Export Center

In [8]:
# Build the export table: school, season and cluster group of every row,
# followed by the original (unscaled) statistics.
df_team_count = df_predictions.reset_index()

# Parse the UID "(<school>)-<year>" back into its two parts. The year is
# anchored at the end of the string and only the outer parentheses are removed,
# so a school name that itself contains "-" or parentheses stays intact
# (splitting on every "-" would push part of such a name into `year`).
df_team_count = df_team_count['team'].str.extract(r'^\((?P<team>.*)\)-(?P<year>\d+)$')
if df_team_count.isna().any().any():
    raise ValueError('Found team UIDs that do not match the "(<school>)-<year>" format')

# Add the cluster group of each row (same row order as df_predictions)
df_team_count['group'] = list(df_predictions['group'])

# CONCAT original dataset with cluster values (rows are aligned by position,
# so the team index is discarded and replaced by a fresh RangeIndex)
df1 = df_data.reset_index(drop=True)
df_output = pd.concat([df_team_count, df1], axis=1)

# Output file
df_output.to_csv('output/clustering_analysis.csv', index=False)