In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler, normalize
from sklearn.manifold import Isomap
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.metrics import silhouette_score, completeness_score, homogeneity_score, v_measure_score
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('CFBeattendance.csv', encoding= 'unicode_escape')
print(df.describe())
print(df.info())
df.shape

          Attendance  Current Wins  Current Losses  Stadium Capacity  \
count    6672.000000   6672.000000     6672.000000       6672.000000   
mean    45311.522782      2.872752        2.348471      54567.472872   
std     25185.690583      2.427450        2.237201      21755.536191   
min      2267.000000      0.000000        0.000000      17000.000000   
25%     23301.000000      1.000000        0.000000      36000.000000   
50%     42527.500000      2.000000        2.000000      52180.000000   
75%     62358.000000      4.000000        4.000000      71799.000000   
max    110889.000000     12.000000       11.000000     107282.000000   

         Fill Rate         PRCP         SNOW         SNWD         TMAX  \
count  6672.000000  6672.000000  6672.000000  6672.000000  6672.000000   
mean      0.793865     0.093230     0.009607     0.015812    71.923711   
std       0.221910     0.337667     0.192520     0.255716    14.457450   
min       0.067483     0.000000     0.000000     0.0000

(6672, 25)

In [3]:
print(df.columns)
df.head()

Index(['Date', 'Team', 'Time', 'Opponent', 'Rank', 'Site', 'TV', 'Result',
       'Attendance', 'Current Wins', 'Current Losses', 'Stadium Capacity',
       'Fill Rate', 'New Coach', 'Tailgating', 'PRCP', 'SNOW', 'SNWD', 'TMAX',
       'TMIN', 'Opponent_Rank', 'Conference', 'Year', 'Month', 'Day'],
      dtype='object')


Unnamed: 0,Date,Team,Time,Opponent,Rank,Site,TV,Result,Attendance,Current Wins,...,PRCP,SNOW,SNWD,TMAX,TMIN,Opponent_Rank,Conference,Year,Month,Day
0,9/2/2000,Arkansas,8:00 PM,Southwest Missouri State*,NR,"War Memorial StadiumLittle Rock, AR",Not on TV,W 380,53946,0,...,0.0,0.0,0.0,105,65,NR,SEC,2000,9,2
1,9/16/2000,Arkansas,6:00 PM,Boise State*,NR,"War Memorial StadiumLittle Rock, AR",Not on TV,W 3831,54286,1,...,0.0,0.0,0.0,79,44,NR,SEC,2000,9,16
2,9/23/2000,Arkansas,8:00 PM,Alabama,NR,"Razorback StadiumFayetteville, AR",ESPN2,W 2821,51482,2,...,2.12,0.0,0.0,85,63,NR,SEC,2000,9,23
3,9/30/2000,Arkansas,11:30 AM,No. 25 Georgia,NR,"Razorback StadiumFayetteville, AR",JPS,L 738,51162,3,...,0.0,0.0,0.0,77,45,25,SEC,2000,9,30
4,10/7/2000,Arkansas,6:00 PM,LouisianaMonroe*,NR,"Razorback StadiumFayetteville, AR",Not on TV,W 526,50947,3,...,0.0,0.0,0.0,50,28,NR,SEC,2000,10,7


In [4]:
df.columns = [feature.strip() for feature in df.columns]
colNA = []
for col in df.columns:
    colNA.append(df[col].isna().sum())
print(colNA)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
print(pd.unique(df['TMAX'].values[:15]))


[105  79  85  77  50  55  49  84  62  59  65  66  93  80]


In [6]:
df.drop(labels=['Date', 'Site', 'TV', 'Result', 'Stadium Capacity', 
                'TMAX', 'TMIN', 'Year', 'Day', 'Opponent'], axis=1, inplace=True)
print(df.columns)
df.head()

Index(['Team', 'Time', 'Rank', 'Attendance', 'Current Wins', 'Current Losses',
       'Fill Rate', 'New Coach', 'Tailgating', 'PRCP', 'SNOW', 'SNWD',
       'Opponent_Rank', 'Conference', 'Month'],
      dtype='object')


Unnamed: 0,Team,Time,Rank,Attendance,Current Wins,Current Losses,Fill Rate,New Coach,Tailgating,PRCP,SNOW,SNWD,Opponent_Rank,Conference,Month
0,Arkansas,8:00 PM,NR,53946,0,0,1.004076,False,False,0.0,0.0,0.0,NR,SEC,9
1,Arkansas,6:00 PM,NR,54286,1,0,1.010404,False,False,0.0,0.0,0.0,NR,SEC,9
2,Arkansas,8:00 PM,NR,51482,2,0,1.029249,False,False,2.12,0.0,0.0,NR,SEC,9
3,Arkansas,11:30 AM,NR,51162,3,0,1.022851,False,False,0.0,0.0,0.0,25,SEC,9
4,Arkansas,6:00 PM,NR,50947,3,1,1.018553,False,False,0.0,0.0,0.0,NR,SEC,10


In [7]:
print(len(pd.unique(df['Rank'].values)))
print(len(pd.unique(df['Team'].values)))
print(pd.unique(df['New Coach'].values))
print(pd.unique(df['Tailgating'].values))
print(len(pd.unique(df['PRCP'].values)))
print(pd.unique(df['SNOW'].values))
print(pd.unique(df['SNWD'].values))
print(pd.unique(df['Conference'].values))

26
63
[False  True]
[False  True]
192
[0.  0.2 0.6 2.  1.  1.5 5.  5.3 0.4 1.7 0.8 2.6 4.5 1.9 0.5 1.4 0.1 2.5
 3.  2.3 0.3 8.2 6. ]
[0.  1.  5.  7.1 2.  7.  3.  3.9 1.2 6.  3.1]
['SEC' 'Big-12' 'WAC' 'MWC' 'Independent' 'ACC' 'Pac-12' 'Big-10'
 'Mid-American' 'CUSA' 'AAC' 'Big East' 'Sun Belt' 'FCS']


In [8]:
#Formatting Time Feature
df['Time'] = df['Time'].apply(lambda x: (int(str(x)[:-6]) * 60) + int(str(x)[-5:-2]) + 720 
                              if str(x)[-2:] == 'PM' else (int(str(x)[:-6]) * 60) + int(str(x)[-5:-2]))
print(df['Time'].values[:5])

[1200 1080 1200  690 1080]


In [9]:
tempDF = df[['Team', 'Rank', 'New Coach', 'Tailgating', 'Opponent_Rank', 'Conference', 'Month']]
df.drop(labels=tempDF.columns, axis=1, inplace=True)
print(tempDF.head())
df.head()

       Team Rank  New Coach  Tailgating Opponent_Rank Conference  Month
0  Arkansas   NR      False       False            NR        SEC      9
1  Arkansas   NR      False       False            NR        SEC      9
2  Arkansas   NR      False       False            NR        SEC      9
3  Arkansas   NR      False       False            25        SEC      9
4  Arkansas   NR      False       False            NR        SEC     10


Unnamed: 0,Time,Attendance,Current Wins,Current Losses,Fill Rate,PRCP,SNOW,SNWD
0,1200,53946,0,0,1.004076,0.0,0.0,0.0
1,1080,54286,1,0,1.010404,0.0,0.0,0.0
2,1200,51482,2,0,1.029249,2.12,0.0,0.0
3,690,51162,3,0,1.022851,0.0,0.0,0.0
4,1080,50947,3,1,1.018553,0.0,0.0,0.0


In [10]:
#LabelEncoding
le = LabelEncoder()
for col in tempDF.columns:
    tempDF[col] = le.fit_transform(tempDF[col].values)
tempDF.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Team,Rank,New Coach,Tailgating,Opponent_Rank,Conference,Month
0,2,25,0,0,25,11,3
1,2,25,0,0,25,11,3
2,2,25,0,0,25,11,3
3,2,25,0,0,17,11,3
4,2,25,0,0,25,11,4


In [11]:
#Which Columns need to be Standardized then Normalized, RobustScaler since don't want influence of outliers
#Using normalize to not allow influence of outliers
#Then applying Isomap before recombining non-continuous data features
robust = RobustScaler()
df = robust.fit_transform(df)
df = normalize(df)

iso = Isomap(n_neighbors = 6, n_components=3, path_method='auto', neighbors_algorithm = 'kd_tree')
df = pd.DataFrame(iso.fit_transform(df))
df.head()

Unnamed: 0,0,1,2
0,-1.145413,0.373278,1.790735
1,-1.570223,-0.214333,1.603847
2,1.814543,0.196495,0.208593
3,-0.782014,-1.19014,-0.8806
4,-0.519279,-1.669643,0.649315


In [12]:
df = pd.concat([pd.DataFrame(tempDF), df], axis=1, join='inner')
df.head()

Unnamed: 0,Team,Rank,New Coach,Tailgating,Opponent_Rank,Conference,Month,0,1,2
0,2,25,0,0,25,11,3,-1.145413,0.373278,1.790735
1,2,25,0,0,25,11,3,-1.570223,-0.214333,1.603847
2,2,25,0,0,25,11,3,1.814543,0.196495,0.208593
3,2,25,0,0,17,11,3,-0.782014,-1.19014,-0.8806
4,2,25,0,0,25,11,4,-0.519279,-1.669643,0.649315


In [13]:
cv = RepeatedKFold(n_splits=5, n_repeats=2)
param_grid = dict()
param_grid['n_clusters'] = list(np.arange(8, 12))
param_grid['n_init'] = list(np.arange(8, 12))
param_grid['algorithm'] = ['full', 'elkan']
km = KMeans(init='k-means++', max_iter=500)
model = GridSearchCV(km, param_grid=param_grid, cv=cv, n_jobs=-1)

In [14]:
model.fit(df)
model.best_estimator_

KMeans(algorithm='full', copy_x=True, init='k-means++', max_iter=500,
       n_clusters=11, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [15]:
ypred = model.predict(df)
sil = silhouette_score(df, ypred)
print('Silhouette Score: ', sil)

Silhouette Score:  0.3479961846598114
