# Clustering

In [12]:
# Core Packages
import os
import pandas as pd
os.chdir("/Users/erikgregorywebb/Documents/Python/nba-prediction/Clustering/Data")

# Packages for K-Means
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import preprocessing
from sklearn.metrics import homogeneity_score
from sklearn.metrics import silhouette_score
from collections import Counter

In [13]:
df_12_13 = pd.read_csv("df_12_13.csv")
df_13_14 = pd.read_csv("df_13_14.csv")
df_14_15 = pd.read_csv("df_14_15.csv")
df_15_16 = pd.read_csv("df_15_16.csv")
df_16_17 = pd.read_csv("df_16_17.csv")

In [14]:
def find_majority(votes):
    vote_count = Counter(votes)
    top_two = vote_count.most_common(2)
    if len(top_two)>1 and top_two[0][1] == top_two[1][1]:
        # It is a tie
        return top_two[0][0]
    return top_two[0][0]

def get_clusters(dataframe, number_of_clusters):
    # Prepare the data
    Team_Name = dataframe["Team"]
    Opponent_Name = dataframe["Opponent"]
    dataframe = dataframe.drop(['FG%', '3P%', 'FT%', 'TS%', 'eFG%', 'ORB%', 'DRB%', 'BLK%', 'TOV%', 'ORtg', 'Team-Score', 'Opponent-Score', 'Win', 'Season', 'Location', 'Date', 'Time', 'Team', 'Opponent'], axis=1)
    
    # Create the clusters
    kmeans_model = KMeans(n_clusters = number_of_clusters, n_init = 5, n_jobs = 1).fit(dataframe)
    labels = kmeans_model.labels_
    
    # Create the dataframe
    dataframe['Clusters'] = labels
    teams = Team_Name.unique()
    clusters = []
    for team in teams:
        clusters.append(find_majority(dataframe.iloc[Team_Name[Team_Name==team].index]['Clusters']))
    
    cluster_df = pd.DataFrame(list(map(list, zip(teams,clusters))), columns = ['Team', 'Cluster'])
    cluster_df = cluster_df.sort_values(by='Cluster', ascending = True)
    cluster_df = cluster_df.reset_index(drop=True)
    
    return cluster_df

In [15]:
dataframes = [df_12_13, df_13_14, df_14_15, df_15_16, df_16_17]
data = pd.DataFrame(columns = ['Team', 'Cluster'])
year = 2012

for dataframe in dataframes:
    temp = get_clusters(dataframe, 8)
    temp["Year"] = str(year)
    year = year + 1
    data= data.append(temp, ignore_index=True)

In [16]:
data[(data.Cluster == 3) & (data.Year == "2015")]

Unnamed: 0,Cluster,Team,Year
104,3,Washington Wizards,2015
105,3,Chicago Bulls,2015


In [17]:
# Cluster teams using ALL seasons
frames = [df_12_13, df_13_14, df_14_15, df_15_16, df_16_17]
all_seasons = pd.concat(frames)
get_clusters(all_seasons, 8).head()

Unnamed: 0,Team,Cluster
0,Brooklyn Nets,0
1,Denver Nuggets,0
2,Minnesota Timberwolves,2
3,Sacramento Kings,4
4,Cleveland Cavaliers,4
