## Using a Ridge Regression Machine Learning Model to make Predictions

In [49]:
import pandas as pd
from sklearn.linear_model import Ridge

In [50]:
ml_df = pd.read_csv("CSV/merged_stats_scores.csv")
del ml_df["Unnamed: 0"]
ml_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,TOV%,ORB%,FT/FGA,eFG%_opp,TOV%_opp,DRB%,FT/FGA_opp,Year,Team,Score
0,0.122449,0.676259,0.551724,0.747573,0.312925,0.239474,0.847561,0.787879,0.706044,0.487013,...,0.395062,1.000000,0.387435,0.418440,0.666667,0.438503,0.349462,1996,CHI,10.0
1,0.489796,0.482014,0.297414,0.766990,0.346939,0.318421,0.609756,0.553030,0.458791,0.636364,...,0.777778,0.621053,0.759162,0.397163,0.735632,0.352941,0.510753,1996,OKC,7.0
2,0.428571,0.597122,0.422414,0.786408,0.380952,0.334211,0.695122,0.636364,0.521978,0.623377,...,0.407407,0.584211,0.465969,0.482270,0.425287,0.315508,0.354839,1996,ORL,4.0
3,0.244898,0.532374,0.306034,0.844660,0.176871,0.131579,0.658537,0.795455,0.659341,0.571429,...,0.518519,0.721053,0.696335,0.468085,0.551724,0.470588,0.741935,1996,UTA,4.0
4,0.061224,0.769784,0.737069,0.669903,0.741497,0.710526,0.670732,0.416667,0.329670,0.714286,...,0.506173,0.847368,0.450262,0.631206,0.689655,0.224599,0.322581,1996,ATL,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,0.244898,0.539568,0.737069,0.320388,0.693878,0.776316,0.408537,0.227273,0.260989,0.558442,...,0.382716,0.100000,0.167539,0.773050,0.160920,0.764706,0.209677,2022,ORL,0.0
797,0.122449,0.553957,0.685345,0.398058,0.727891,0.773684,0.500000,0.204545,0.230769,0.597403,...,0.382716,0.263158,0.235602,0.964539,0.275862,0.748663,0.349462,2022,POR,0.0
798,0.306122,0.697842,0.728448,0.572816,0.639456,0.678947,0.487805,0.454545,0.357143,0.707792,...,0.320988,0.178947,0.314136,0.907801,0.183908,0.700535,0.102151,2022,SAC,0.0
799,0.306122,0.892086,0.926724,0.640777,0.632653,0.647368,0.536585,0.674242,0.516484,0.688312,...,0.148148,0.305263,0.120419,0.773050,0.206897,0.652406,0.102151,2022,SAS,0.0


**Indicating which columns the model will be trained on**

In [51]:
not_features = ["Year", "Team", "Score"]

features = ml_df.columns[~ml_df.columns.isin(not_features)]

In [52]:
features

Index(['MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'MOV', 'SOS', 'SRS', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr',
       '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%_opp', 'TOV%_opp',
       'DRB%', 'FT/FGA_opp'],
      dtype='object')

In [53]:
features = list(ml_df[features])
features

['MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'MOV',
 'SOS',
 'SRS',
 'ORtg',
 'DRtg',
 'NRtg',
 'Pace',
 'FTr',
 '3PAr',
 'TS%',
 'eFG%',
 'TOV%',
 'ORB%',
 'FT/FGA',
 'eFG%_opp',
 'TOV%_opp',
 'DRB%',
 'FT/FGA_opp']

In [54]:
seasons = list(range(1996, 2023))

In [55]:
def backtest(ml_data, features, score, start=2, step=1):
    total_predictions = []

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # alpha value controls how much regularization the model uses
        ridge_model = Ridge(alpha=1)

        # dividing the ml_data into training and testing data
        train_set = ml_data[ml_data["Year"] < season]

        test_set = ml_data[ml_data["Year"] == season]

        # fits the ridge regression model: adjusts the model paramters to minimize difference between 
        # the actual and predicted output
        ridge_model = ridge_model.fit(train_set[features], train_set[score])

        # Use the model to predict the output given a set of inputs
        predictions = ridge_model.predict(test_set[features])

        # Converting the predictions to a pandas series and joining that with part of the test_set dataframe
        predictions = pd.Series(predictions, index=test_set.index)

        joined_df = pd.concat([test_set[["Team", "Score", "Year"]], predictions], axis=1)

        # Renaming columns of the new dataframe
        joined_df.columns = ["Team", "Actual_Score", "Year", "Predicted_Score"]

        total_predictions.append(joined_df)

    return pd.concat(total_predictions)


In [56]:
results = backtest(ml_df, features, "Score")

In [57]:
# Sorting data from oldest to latest season and then sorting based on who was predicted the 
# highest playoff score

results = results.sort_values(["Year", "Predicted_Score"], ascending=[True, False])

In [58]:
results.to_csv("results_ridge.csv")

## Seeing how well the model did

**Calculating Mean Squared Error**

In [59]:
from sklearn.metrics import mean_squared_error

In [60]:
ridge_df = pd.read_csv("CSV/results_ridge.csv")
del ridge_df["Unnamed: 0"]

In [61]:
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score,Actual_Rank,Predicted_Rank
0,1998,ATL,1.0,3.245676,12,1
1,1998,LAL,4.0,3.167176,4,2
2,1998,OKC,2.0,3.080105,6,3
3,1998,UTA,7.0,2.906798,2,4
4,1998,PHO,1.0,2.655632,15,5
...,...,...,...,...,...,...
738,2022,OKC,0.0,0.439894,17,26
739,2022,SAC,0.0,0.401657,19,27
740,2022,HOU,0.0,-0.139033,18,28
741,2022,ORL,0.0,-0.257555,21,29


In [62]:
mean_squared_error(ridge_df["Actual_Score"], ridge_df["Predicted_Score"])

3.487277160611736

**Some further analysis**

In [63]:
ridge_df
del ridge_df["Actual_Rank"]
del ridge_df["Predicted_Rank"]

In [64]:
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score
0,1998,ATL,1.0,3.245676
1,1998,LAL,4.0,3.167176
2,1998,OKC,2.0,3.080105
3,1998,UTA,7.0,2.906798
4,1998,PHO,1.0,2.655632
...,...,...,...,...
738,2022,OKC,0.0,0.439894
739,2022,SAC,0.0,0.401657
740,2022,HOU,0.0,-0.139033
741,2022,ORL,0.0,-0.257555


**Indicating the position/rank each team actually ended up in based on their actual playoff score**<br>
**NOTE: This ranking isn't enirely accurate as the teams that missed the playoffs entirely has a score of 0 so technically they're equal**<br>
**which doesn't reflect the ranking**

In [65]:
def add_pos_rank(df, type_rk):
    if type_rk == "Actual_Rank":
        df = df.sort_values(["Actual_Score"], ascending=False)
        actual_rank = list(range(1, df.shape[0] + 1))
        df.insert(2, "Actual_Rank", actual_rank)
    elif type_rk == "Predicted_Rank":
        df = df.sort_values(["Predicted_Score"], ascending=False)
        predicted_rank = list(range(1, df.shape[0] + 1))
        df.insert(5, "Predicted_Rank", predicted_rank)
    return df

In [66]:
ridge_df = ridge_df.groupby("Year").apply(add_pos_rank, type_rk = "Actual_Rank")

In [67]:
ridge_df.index = ridge_df.index.droplevel()
ridge_df

Unnamed: 0,Year,Team,Actual_Rank,Actual_Score,Predicted_Score
6,1998,CHI,1,10.0,2.190159
3,1998,UTA,2,7.0,2.906798
7,1998,IND,3,4.0,1.704946
1,1998,LAL,4,4.0,3.167176
17,1998,NYK,5,2.0,0.801900
...,...,...,...,...,...
730,2022,LAC,26,0.0,1.645079
729,2022,WAS,27,0.0,1.669350
724,2022,CHO,28,0.0,2.137922
721,2022,CLE,29,0.0,2.485331


In [68]:
# Renaming a column in a pandas dataframe
# ridge_df = ridge_df.rename(columns={"Actual Rank": "Actual_Rank"})

In [69]:
# Reordering columns in a pandas dataframe
# ridge_df = ridge_df[["Year", "Team", "Actual_Score", "Predicted_Score", "Actual_Rank"]]

**Indicating the position/rank each team was predicted to end up in based on their predicted playoff score**

In [70]:
ridge_df = ridge_df.groupby("Year").apply(add_pos_rank, type_rk = "Predicted_Rank")
ridge_df.index = ridge_df.index.droplevel()
ridge_df

Unnamed: 0,Year,Team,Actual_Rank,Actual_Score,Predicted_Score,Predicted_Rank
0,1998,ATL,12,1.0,3.245676,1
1,1998,LAL,4,4.0,3.167176,2
2,1998,OKC,6,2.0,3.080105,3
3,1998,UTA,2,7.0,2.906798,4
4,1998,PHO,15,1.0,2.655632,5
...,...,...,...,...,...,...
738,2022,OKC,17,0.0,0.439894,26
739,2022,SAC,19,0.0,0.401657,27
740,2022,HOU,18,0.0,-0.139033,28
741,2022,ORL,21,0.0,-0.257555,29


In [71]:
ridge_df = ridge_df[["Year", "Team", "Actual_Score", "Predicted_Score", "Actual_Rank", "Predicted_Rank"]]
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score,Actual_Rank,Predicted_Rank
0,1998,ATL,1.0,3.245676,12,1
1,1998,LAL,4.0,3.167176,4,2
2,1998,OKC,2.0,3.080105,6,3
3,1998,UTA,7.0,2.906798,2,4
4,1998,PHO,1.0,2.655632,15,5
...,...,...,...,...,...,...
738,2022,OKC,0.0,0.439894,17,26
739,2022,SAC,0.0,0.401657,19,27
740,2022,HOU,0.0,-0.139033,18,28
741,2022,ORL,0.0,-0.257555,21,29


In [72]:
ridge_df.to_csv("CSV/results_ridge.csv")