In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#Parameters
file = "../tables/2021AP.xlsx"

#Read in df
df = pd.read_excel(file)
df

Unnamed: 0,Team,Week,Rank,W,L,Winning Percentage,Opp. Rank,Opp. P5,Home,Result,Points Scored,Points Against,Margin,Next Week Rank,Movement
0,Alabama,1,1,0,0,1,14,1,0,W,44,13,31,1,0
1,Oklahoma,1,2,0,0,1,26,0,1,W,40,35,5,4,-2
2,Clemson,1,3,0,0,1,5,1,0,L,3,10,-7,6,-3
3,Ohio State,1,4,0,0,1,26,1,0,W,45,31,14,3,1
4,Georgia,1,5,0,0,1,3,1,0,W,10,3,7,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,Auburn,5,22,3,1,0.75,,,,,,,,,
122,NC State,5,23,3,1,0.75,,,,,,,,,
123,Wake Forest,5,24,4,0,1,,,,,,,,,
124,Clemson,5,25,2,2,0.5,,,,,,,,,


In [3]:
#Drop footer
df.drop(index = (df.index.stop - 1), inplace = True)

#Drop null columns (teams with byes)
df.dropna(inplace = True)

In [4]:
#Make W/L binary values
for index, row in df.iterrows():
    if row['Result'] == 'W':
        df.at[index, 'Result'] = 1
    else: 
        df.at[index, 'Result'] = 0

In [5]:
df = df.drop(columns = ["Movement", "Team"])

In [6]:
for name, values in df.iteritems():
    df[name] = pd.to_numeric(values)
    
df.dtypes

Week                    int64
Rank                    int64
W                       int64
L                       int64
Winning Percentage    float64
Opp. Rank               int64
Opp. P5                 int64
Home                    int64
Result                  int64
Points Scored           int64
Points Against          int64
Margin                  int64
Next Week Rank          int64
dtype: object

In [7]:
#Group rankings to categories
for index, row in df.iterrows():
    if row['Next Week Rank'] == 26:
        df.at[index, 'Next Week Rank Category'] = 0
    elif row['Next Week Rank'] < row['Rank']:
        df.at[index, 'Next Week Rank Category'] = 1
    elif row['Next Week Rank'] > row['Rank']:
        df.at[index, 'Next Week Rank Category'] = 2
    elif row['Next Week Rank'] == row['Rank']:
        df.at[index, 'Next Week Rank Category'] = 3        

In [8]:
df

Unnamed: 0,Week,Rank,W,L,Winning Percentage,Opp. Rank,Opp. P5,Home,Result,Points Scored,Points Against,Margin,Next Week Rank,Next Week Rank Category
0,1,1,0,0,1.000000,14,1,0,1,44,13,31,1,3.0
1,1,2,0,0,1.000000,26,0,1,1,40,35,5,4,2.0
2,1,3,0,0,1.000000,5,1,0,0,3,10,-7,6,2.0
3,1,4,0,0,1.000000,26,1,0,1,45,31,14,3,1.0
4,1,5,0,0,1.000000,3,1,0,1,10,3,7,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4,21,2,1,0.666667,26,1,0,0,22,45,-23,26,0.0
96,4,22,3,1,0.750000,26,0,1,1,38,30,8,18,1.0
97,4,23,2,1,0.666667,26,0,1,1,34,24,10,22,1.0
98,4,24,2,1,0.666667,26,1,0,1,35,24,11,20,1.0


In [9]:
X = df.drop(columns = ['Next Week Rank', 'Next Week Rank Category'])
y = df['Next Week Rank Category']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 8)
X_train.shape

(72, 12)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
#Decision Tree classifier
from sklearn import tree
dtc = tree.DecisionTreeClassifier(random_state = 8)
dtc.fit(X_train, y_train)
dtc.score(X_test, y_test)

0.5833333333333334

In [13]:
#Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 8)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.625

In [14]:
#Support Vector Machine classifier
from sklearn.svm import SVC
svm = SVC(random_state = 8)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.7083333333333334

In [15]:
#Scaled SVMC
svm = SVC(random_state = 8)
svm.fit(X_train_scaled, y_train)
svm.score(X_test_scaled, y_test)

0.625

In [16]:
#K-means classifier
from sklearn.cluster import KMeans
knc = KMeans(n_clusters = 6, random_state = 8)
knc.fit(X_train, y_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=8, tol=0.0001, verbose=0)

In [17]:
results = pd.DataFrame([knc.predict(X_test), y_test])
results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,3.0,2.0,4.0,3.0,4.0,0.0,4.0,3.0,3.0,4.0,...,3.0,4.0,4.0,3.0,2.0,4.0,2.0,1.0,1.0,2.0
1,1.0,2.0,2.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,0.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0


In [18]:
knc.score(X_test, y_test)

-4562.620636311136

In [19]:
#Scaled K-means classifier
knc = KMeans(n_clusters = 6, random_state = 8)
knc.fit(X_train_scaled, y_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=8, tol=0.0001, verbose=0)

In [20]:
results = pd.DataFrame([knc.predict(X_test), y_test])
results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,5.0,2.0,5.0,5.0,2.0,5.0,2.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,2.0,2.0
1,1.0,2.0,2.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,0.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0


In [21]:
knc.score(X_test, y_test)

-64291.1305172406