In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor as abr

In [2]:
# Notebook parameters
# Excel workbook loaded by run_regressions (relative path)
file = "../tables/2021AP.xlsx"
# Rows whose 'Week' is >= next_week are excluded before fitting
next_week = 8

# Bounds of the sweep over the rank substituted for rows at rank 26.
# Consumed as range(a, b), so a is inclusive and b is exclusive.
a, b = 26, 51

In [3]:
def run_regressions(top_rank):
    """Fit six regressors on the poll workbook and return their test-set scores.

    The workbook at the module-level ``file`` path is read, cleaned and split
    into train/test sets; a linear regression, a random forest and an AdaBoost
    regressor are each fitted on the raw and on the standardized features.

    Parameters
    ----------
    top_rank : int
        Value written into 'Next Week Rank' for every row whose rank is 26
        (the unranked marker), i.e. the penalty assigned to unranked teams.

    Returns
    -------
    tuple of float
        ``(mlr, smlr, rf, srf, ab, sab)``: the value of each estimator's
        ``score`` method on the held-out split (R2 for these regressors), in
        the order linear, scaled linear, random forest, scaled random forest,
        AdaBoost, scaled AdaBoost.
    """
    df = pd.read_excel(file)

    # Drop footer (the last row of the sheet), positionally so it does not
    # depend on the index being a RangeIndex
    df = df.iloc[:-1]

    # Drop rows containing any null value (teams with byes)
    df = df.dropna()

    # Keep only the weeks before next_week; copy so the assignments below
    # write to an independent frame rather than a view of the filtered one
    df = df[df['Week'] < next_week].copy()

    # Convert Result col to dummy numeric: 1 for 'W', 0 for anything else
    df['Result'] = (df['Result'] == 'W').astype(int)

    # Increase penalty for unranked: rank 26 is replaced by top_rank
    df.loc[df['Next Week Rank'] == 26, 'Next Week Rank'] = top_rank

    # Drop unnecessary cols
    df = df.drop(columns=["Movement", "Team"])

    # Convert all cols to num. DataFrame.iteritems() was removed in pandas 2.0,
    # so the conversion is applied column-wise instead; as before, a value
    # that cannot be parsed raises.
    df = df.apply(pd.to_numeric)

    # Get features/target
    X = df.drop(columns=['Next Week Rank'])
    y = df["Next Week Rank"]

    # Split to train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

    # Standardize using statistics of the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    def _fit_and_score(model, scaled):
        """Fit ``model`` on the train split and return its test-set score."""
        if scaled:
            return model.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
        return model.fit(X_train, y_train).score(X_test, y_test)

    # Multiple Linear Regression (raw, then scaled)
    mlr_score = _fit_and_score(LinearRegression(), scaled=False)
    smlr_score = _fit_and_score(LinearRegression(), scaled=True)

    # Random Forest (raw, then scaled); fixed seed keeps runs reproducible
    rf_score = _fit_and_score(RandomForestRegressor(random_state=8), scaled=False)
    srf_score = _fit_and_score(RandomForestRegressor(random_state=8), scaled=True)

    # AdaBoost Regression (raw, then scaled)
    ab_score = _fit_and_score(abr(random_state=8), scaled=False)
    sab_score = _fit_and_score(abr(random_state=8), scaled=True)

    return mlr_score, smlr_score, rf_score, srf_score, ab_score, sab_score

In [4]:
# One independent, initially empty score list per model key
scores_dict = {key: [] for key in ('mlr', 'smlr', 'rf', 'srf', 'ab', 'sab')}

In [None]:
# Keys in the same order as the tuple returned by run_regressions
model_keys = ['mlr', 'smlr', 'rf', 'srf', 'ab', 'sab']

# Start from empty lists so that re-running this cell replaces the previous
# sweep instead of appending a second one (which would make the score lists
# longer than range(a, b))
for key in model_keys:
    scores_dict[key] = []

# Run every model once per candidate top_rank and collect the scores per model
for top_rank in range(a, b):
    for key, score in zip(model_keys, run_regressions(top_rank)):
        scores_dict[key].append(score)

In [None]:
# Assemble the sweep results: one row per top_rank value, one column per model
data = pd.DataFrame({
    'top_rank': list(range(a, b)),
    **{key: scores_dict[key] for key in ('mlr', 'smlr', 'rf', 'srf', 'ab', 'sab')},
})

In [None]:
import plotly.express as px

# One line per model: score on the test split as a function of top_rank
fig = px.line(
    data,
    x='top_rank',
    y=['mlr', 'smlr', 'rf', 'srf', 'ab', 'sab'],
)
fig.update_layout(
    title='R2 of Models with Varying Max Rank',
    xaxis_title='Top Rank',
    yaxis_title='R2',
    title_x=.5,  # center the title
)
fig.update_xaxes(dtick=2)  # tick every second top_rank value
fig.show()

In [None]:
# Summary statistics per column of the sweep table (the top_rank column and
# each model's score column), taken across all swept top_rank values
data.describe()

In [None]:
# Put models on the rows and top_rank values on the columns, then describe:
# the statistics are computed across the six models for each top_rank
dataT = data.set_index('top_rank').T.describe()

In [None]:
# Flip back so each row is one top_rank value with its summary statistics as
# columns, and move the index into a regular column for plotting
dataTT = dataT.T.reset_index()
dataTT

In [None]:
# Mean and median ('50%') score across the models for each top_rank
fig = px.line(
    dataTT,
    x='top_rank',
    y=['mean', '50%'],
)
fig.update_layout(
    title='Average R2 of Models with Varying Max Rank',
    xaxis_title='Top Rank',
    yaxis_title='R2',
    title_x=.5,  # center the title
)
fig.update_xaxes(dtick=2)  # tick every second top_rank value
fig.show()