## Using a Ridge Regression Machine Learning Model to make Predictions

In [2]:
import pandas as pd
from sklearn.linear_model import Ridge

In [11]:
ml_df = pd.read_csv("CSV/merged_stats_scores.csv")
del ml_df["Unnamed: 0"]
ml_df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team,Score
0,0.122449,0.676259,0.551724,0.747573,0.312925,0.239474,0.847561,0.787879,0.706044,0.487013,...,0.559006,0.600000,0.553846,0.310345,0.405063,0.514286,0.609948,1996,CHI,10.0
1,0.489796,0.482014,0.297414,0.766990,0.346939,0.318421,0.609756,0.553030,0.458791,0.636364,...,0.366460,0.571429,0.815385,0.413793,0.822785,0.704762,0.591623,1996,OKC,7.0
2,0.428571,0.597122,0.422414,0.786408,0.380952,0.334211,0.695122,0.636364,0.521978,0.623377,...,0.341615,0.642857,0.400000,0.448276,0.379747,0.400000,0.591623,1996,ORL,4.0
3,0.244898,0.532374,0.306034,0.844660,0.176871,0.131579,0.658537,0.795455,0.659341,0.571429,...,0.335404,0.692857,0.400000,0.465517,0.468354,0.800000,0.539267,1996,UTA,4.0
4,0.061224,0.769784,0.737069,0.669903,0.741497,0.710526,0.670732,0.416667,0.329670,0.714286,...,0.521739,0.585714,0.261538,0.310345,0.101266,0.200000,0.837696,1996,ATL,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,0.244898,0.539568,0.737069,0.320388,0.693878,0.776316,0.408537,0.227273,0.260989,0.558442,...,0.540373,0.521429,0.200000,0.362069,0.430380,0.295238,0.583770,2022,ORL,0.0
797,0.122449,0.553957,0.685345,0.398058,0.727891,0.773684,0.500000,0.204545,0.230769,0.597403,...,0.453416,0.464286,0.384615,0.362069,0.430380,0.428571,0.636126,2022,POR,0.0
798,0.306122,0.697842,0.728448,0.572816,0.639456,0.678947,0.487805,0.454545,0.357143,0.707792,...,0.453416,0.521429,0.261538,0.362069,0.379747,0.219048,0.743455,2022,SAC,0.0
799,0.306122,0.892086,0.926724,0.640777,0.632653,0.647368,0.536585,0.674242,0.516484,0.688312,...,0.602484,0.821429,0.323077,0.431034,0.202532,0.142857,0.819372,2022,SAS,0.0


**Indicating which columns the model will be trained on**

In [12]:
not_features = ["Year", "Team", "Score"]

features = ml_df.columns[~ml_df.columns.isin(not_features)]

In [13]:
features

Index(['MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS'],
      dtype='object')

In [14]:
features = list(ml_df[features])
features

['MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [15]:
seasons = list(range(1996, 2023))

In [16]:
def backtest(ml_data, features, score, start=2, step=1):
    total_predictions = []

    for i in range(start, len(seasons), step):
        season = seasons[i]

        # alpha value controls how much regularization the model uses
        ridge_model = Ridge(alpha=1)

        # dividing the ml_data into training and testing data
        train_set = ml_data[ml_data["Year"] < season]

        test_set = ml_data[ml_data["Year"] == season]

        # fits the ridge regression model: adjusts the model paramters to minimize difference between 
        # the actual and predicted output
        ridge_model = ridge_model.fit(train_set[features], train_set[score])

        # Use the model to predict the output given a set of inputs
        predictions = ridge_model.predict(test_set[features])

        # Converting the predictions to a pandas series and joining that with part of the test_set dataframe
        predictions = pd.Series(predictions, index=test_set.index)

        joined_df = pd.concat([test_set[["Team", "Score", "Year"]], predictions], axis=1)

        # Renaming columns of the new dataframe
        joined_df.columns = ["Team", "Actual_Score", "Year", "Predicted_Score"]

        total_predictions.append(joined_df)

    return pd.concat(total_predictions)


In [17]:
results = backtest(ml_df, features, "Score")

In [18]:
# Sorting data from oldest to latest season and then sorting based on who was predicted the 
# highest playoff score

results = results.sort_values(["Year", "Predicted_Score"], ascending=[True, False])

In [19]:
results.to_csv("results_ridge.csv")

## Seeing how well the model did

**Calculating Mean Squared Error**

In [1]:
from sklearn.metrics import mean_squared_error

In [69]:
ridge_df = pd.read_csv("CSV/results_ridge.csv")
del ridge_df["Unnamed: 0"]

In [70]:
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score,Actual_Rank,Predicted_Rank
0,1998,ATL,1.0,3.245676,9,1
1,1998,LAL,4.0,3.167176,3,2
2,1998,OKC,2.0,3.080105,5,3
3,1998,UTA,7.0,2.906798,2,4
4,1998,PHO,1.0,2.655632,10,5
...,...,...,...,...,...,...
738,2022,OKC,0.0,0.439894,739,739
739,2022,SAC,0.0,0.401657,740,740
740,2022,HOU,0.0,-0.139033,741,741
741,2022,ORL,0.0,-0.257555,742,742


In [8]:
mean_squared_error(ridge_df["Actual_Score"], ridge_df["Predicted_Score"])

3.487277160611736

**Some further analysis**

In [71]:
ridge_df
del ridge_df["Actual_Rank"]
del ridge_df["Predicted_Rank"]

In [72]:
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score
0,1998,ATL,1.0,3.245676
1,1998,LAL,4.0,3.167176
2,1998,OKC,2.0,3.080105
3,1998,UTA,7.0,2.906798
4,1998,PHO,1.0,2.655632
...,...,...,...,...
738,2022,OKC,0.0,0.439894
739,2022,SAC,0.0,0.401657
740,2022,HOU,0.0,-0.139033
741,2022,ORL,0.0,-0.257555


**Indicating the position/rank each team actually ended up in based on their actual playoff score**

In [73]:
def add_pos_rank(df, type_rk):
    if type_rk == "Actual_Rank":
        df = df.sort_values(["Actual_Score"], ascending=False)
        actual_rank = list(range(1, df.shape[0] + 1))
        df.insert(2, "Actual_Rank", actual_rank)
    elif type_rk == "Predicted_Rank":
        df = df.sort_values(["Predicted_Score"], ascending=False)
        predicted_rank = list(range(1, df.shape[0] + 1))
        df.insert(5, "Predicted_Rank", predicted_rank)
    return df

In [74]:
ridge_df = ridge_df.groupby("Year").apply(add_pos_rank, type_rk = "Actual_Rank")

In [75]:
ridge_df.index = ridge_df.index.droplevel()
ridge_df

Unnamed: 0,Year,Team,Actual_Rank,Actual_Score,Predicted_Score
6,1998,CHI,1,10.0,2.190159
3,1998,UTA,2,7.0,2.906798
7,1998,IND,3,4.0,1.704946
1,1998,LAL,4,4.0,3.167176
17,1998,NYK,5,2.0,0.801900
...,...,...,...,...,...
730,2022,LAC,26,0.0,1.645079
729,2022,WAS,27,0.0,1.669350
724,2022,CHO,28,0.0,2.137922
721,2022,CLE,29,0.0,2.485331


In [23]:
# Renaming a column in a pandas dataframe
# ridge_df = ridge_df.rename(columns={"Actual Rank": "Actual_Rank"})

In [25]:
# Reordering columns in a pandas dataframe
# ridge_df = ridge_df[["Year", "Team", "Actual_Score", "Predicted_Score", "Actual_Rank"]]

**Indicating the position/rank each team was predicted to end up in based on their predicted playoff score**

In [76]:
ridge_df = ridge_df.groupby("Year").apply(add_pos_rank, type_rk = "Predicted_Rank")
ridge_df.index = ridge_df.index.droplevel()
ridge_df

Unnamed: 0,Year,Team,Actual_Rank,Actual_Score,Predicted_Score,Predicted_Rank
0,1998,ATL,12,1.0,3.245676,1
1,1998,LAL,4,4.0,3.167176,2
2,1998,OKC,6,2.0,3.080105,3
3,1998,UTA,2,7.0,2.906798,4
4,1998,PHO,15,1.0,2.655632,5
...,...,...,...,...,...,...
738,2022,OKC,17,0.0,0.439894,26
739,2022,SAC,19,0.0,0.401657,27
740,2022,HOU,18,0.0,-0.139033,28
741,2022,ORL,21,0.0,-0.257555,29


In [77]:
ridge_df = ridge_df[["Year", "Team", "Actual_Score", "Predicted_Score", "Actual_Rank", "Predicted_Rank"]]
ridge_df

Unnamed: 0,Year,Team,Actual_Score,Predicted_Score,Actual_Rank,Predicted_Rank
0,1998,ATL,1.0,3.245676,12,1
1,1998,LAL,4.0,3.167176,4,2
2,1998,OKC,2.0,3.080105,6,3
3,1998,UTA,7.0,2.906798,2,4
4,1998,PHO,1.0,2.655632,15,5
...,...,...,...,...,...,...
738,2022,OKC,0.0,0.439894,17,26
739,2022,SAC,0.0,0.401657,19,27
740,2022,HOU,0.0,-0.139033,18,28
741,2022,ORL,0.0,-0.257555,21,29


In [78]:
ridge_df.to_csv("results_ridge.csv")