# MACHINE LEARNING: MULTIPLE LINEAR REGRESSION

In [1]:
import pandas as pd
import os
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the 68-team and 353-team datasets. Missing values are replaced with 0
# and the 'Unnamed: 0' column is discarded (presumably an index column left
# over from an earlier CSV export — confirm).
teams_68 = (
    pd.read_csv(os.path.join('final_dataset68.csv'))
    .fillna(0)
    .drop(columns=['Unnamed: 0'])
)
teams_353 = (
    pd.read_csv(os.path.join('final_dataset353.csv'))
    .fillna(0)
    .drop(columns=['Unnamed: 0'])
)

# 68 TEAM DATASET

In [3]:
# Target: tournament wins. Features: every remaining column except the team
# name. Both are converted to plain NumPy arrays.
y = teams_68['TourneyWins'].values
x = teams_68.drop(columns=['Name', 'TourneyWins']).values

# A fixed seed keeps the partition reproducible. No test_size is passed, so
# train_test_split's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [4]:
# Linear regression estimator with default settings; the cells below fit it.
model = LinearRegression()

In [None]:
# Fit the scaler on the training features only, so test rows do not influence
# the learned mean and variance. x_train is passed as the 2-D
# (samples, features) array it already is: flattening it with reshape(-1, 1)
# would make the scaler learn a single pooled feature, and the transform()
# calls on the full-width arrays would then fail on the feature-count mismatch.
x_scaler = StandardScaler().fit(x_train)

In [None]:
# Apply the already-fitted scaler to the full, training and test feature arrays.
# NOTE(review): none of the later model cells visible in this notebook read
# these scaled arrays — confirm whether scaling is meant to feed the models.
x_scaled, x_train_scaled, x_test_scaled = (
    x_scaler.transform(features) for features in (x, x_train, x_test)
)

### Fit and score on the full dataset (no train/test split for validation)

In [5]:
# Baseline: fit on every row and score on those same rows, so this R2 is an
# in-sample figure rather than a held-out estimate.
# A dedicated estimator is created here because scikit-learn's fit() returns
# the estimator it was called on; fitting the shared `model` would leave
# model_naive pointing at the same object that the training cell refits later.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

# Display labels, in the order the feature columns are assumed to have in x
# — TODO confirm against the DataFrame's column order.
feature_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs', 'Seed',
]

print(f"R2: {score_naive}")
print(' ')
# Indexing (rather than zip) keeps an IndexError if there are fewer
# coefficients than labels.
coef_lines = [f'{label}: {coef_naive[i]}' for i, label in enumerate(feature_labels)]
print('\n'.join(['Coefficients: ', *coef_lines]) + '\n')

R2: 0.5272154108898714
 
Coefficients: 
SeasonWins: 0.008456313383308309
Points: 0.22357546548185886
FGM: -0.5309295402999872
FGM3: -0.31505792860983234
FTA: -0.1878099895918294
Off Rebounds: -0.030116588607987646
Def Rebounds: -0.09355759260266994
Assists: 0.11661837546001975
Turnovers: -0.08459971022514032
Steals: 0.010440905486745028
Blocks: 0.20281834090527334
Fouls: 0.053694515996766064
YearsExp: -0.00323960792283752
Conf_Champs: 0.5290280243635578
Seed: -0.20290186777450636



### Fit on training data

In [6]:
# Fit the shared estimator on the training split only and report how well it
# reproduces that same split (in-sample R2).
model_trained = model.fit(x_train, y_train)
coef_trained = model_trained.coef_
train_score = model_trained.score(x_train, y_train)

# Display labels, in the order the feature columns are assumed to have in
# x_train — TODO confirm against the DataFrame's column order.
trained_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs', 'Seed',
]
trained_lines = [f'{label}: {coef_trained[i]}' for i, label in enumerate(trained_labels)]

print(f"R2: {train_score}")
print(' ')
print('\n'.join(['Coefficients: ', *trained_lines]) + '\n')

R2: 0.5298862015080243
 
Coefficients: 
SeasonWins: -0.03214825170685589
Points: 0.2839378652152983
FGM: -0.7272976736918562
FGM3: -0.33981701131308784
FTA: -0.24208944201088312
Off Rebounds: -0.007991976765165859
Def Rebounds: -0.0023063796574829334
Assists: 0.19096312529696674
Turnovers: -0.29127079610280027
Steals: -0.01976781308255289
Blocks: 0.21088383272955308
Fouls: 0.10860863604901488
YearsExp: -0.0007330969395003828
Conf_Champs: 0.5826671442827136
Seed: -0.19660643822090987



### Score with testing data

In [7]:
# Evaluate the model fitted on the training split against the held-out rows.
test_score = model_trained.score(x_test, y_test)
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_test, predictions)

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

MSE: 1.361509522692835
Test R2: 0.36536088377704956


## Drop Seed from X (seeding already takes many of the X variables into account)

In [None]:
# Same target as before, but the 'WSeed' column is removed from the features
# as well. Both are converted with .values, like the first 68-team split, so
# the estimator is always fitted on plain arrays — the same kind of input the
# PREDICTIONS cells pass to predict() (.values rows and a nested list).
x = teams_68.drop(columns=['Name', 'TourneyWins', 'WSeed']).values
y = teams_68['TourneyWins'].values

# A fixed seed keeps the partition reproducible. No test_size is passed, so
# train_test_split's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
# Fresh linear regression estimator (default settings) for the seed-free feature set.
model = LinearRegression()

### Fit and score on the full dataset (no train/test split for validation)

In [None]:
# Baseline for the seed-free features: fit on every row and score on those
# same rows, so this R2 is an in-sample figure rather than a held-out estimate.
# A dedicated estimator is created here because scikit-learn's fit() returns
# the estimator it was called on; fitting the shared `model` would leave
# model_naive pointing at the same object that the training cell refits later.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

# Display labels (no 'Seed' entry), in the order the feature columns are
# assumed to have in x — TODO confirm against the DataFrame's column order.
feature_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs',
]

print(f"R2: {score_naive}")
print(' ')
# Indexing (rather than zip) keeps an IndexError if there are fewer
# coefficients than labels.
coef_lines = [f'{label}: {coef_naive[i]}' for i, label in enumerate(feature_labels)]
print('\n'.join(['Coefficients: ', *coef_lines]))

In [None]:
# Fit the shared estimator on the seed-free training split and report how well
# it reproduces that same split (in-sample R2).
model_trained = model.fit(x_train, y_train)
coef_trained = model_trained.coef_
train_score = model_trained.score(x_train, y_train)

# Display labels (no 'Seed' entry), in the order the feature columns are
# assumed to have in x_train — TODO confirm against the column order.
trained_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs',
]
trained_lines = [f'{label}: {coef_trained[i]}' for i, label in enumerate(trained_labels)]

print(f"R2: {train_score}")
print(' ')
print('\n'.join(['Coefficients: ', *trained_lines]))

### Score with testing data

In [None]:
# Evaluate the model fitted on the training split against the held-out rows.
test_score = model_trained.score(x_test, y_test)
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_test, predictions)

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

# 353 TEAM DATASET

In [None]:
# Target: tournament wins. Features: every remaining column except the team
# name. Both are converted with .values, like the first 68-team split, so the
# estimator is always fitted on plain arrays — the same kind of input the
# PREDICTIONS cells pass to predict() (.values rows and a nested list).
x = teams_353.drop(columns=['Name', 'TourneyWins']).values
y = teams_353['TourneyWins'].values

# A fixed seed keeps the partition reproducible. No test_size is passed, so
# train_test_split's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
# Fresh linear regression estimator (default settings) for the 353-team data.
model = LinearRegression()

### Fit and score on the full dataset (no train/test split for validation)

In [None]:
# Baseline for the 353-team data: fit on every row and score on those same
# rows, so this R2 is an in-sample figure rather than a held-out estimate.
# A dedicated estimator is created here because scikit-learn's fit() returns
# the estimator it was called on; fitting the shared `model` would leave
# model_naive pointing at the same object that the training cell refits later.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

# Display labels, in the order the feature columns are assumed to have in x
# — TODO confirm against the DataFrame's column order.
feature_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs', 'Seed',
]

print(f"R2: {score_naive}")
print(' ')
# Indexing (rather than zip) keeps an IndexError if there are fewer
# coefficients than labels.
coef_lines = [f'{label}: {coef_naive[i]}' for i, label in enumerate(feature_labels)]
print('\n'.join(['Coefficients: ', *coef_lines]) + '\n')

### Fit on training data

In [None]:
# Fit the shared estimator on the 353-team training split and report how well
# it reproduces that same split (in-sample R2).
model_trained = model.fit(x_train, y_train)
coef_trained = model_trained.coef_
train_score = model_trained.score(x_train, y_train)

# Display labels, in the order the feature columns are assumed to have in
# x_train — TODO confirm against the DataFrame's column order.
trained_labels = [
    'SeasonWins', 'Points', 'FGM', 'FGM3', 'FTA',
    'Off Rebounds', 'Def Rebounds', 'Assists', 'Turnovers', 'Steals',
    'Blocks', 'Fouls', 'YearsExp', 'Conf_Champs', 'Seed',
]
trained_lines = [f'{label}: {coef_trained[i]}' for i, label in enumerate(trained_labels)]

print(f"R2: {train_score}")
print(' ')
print('\n'.join(['Coefficients: ', *trained_lines]) + '\n')

### Score with testing data

In [None]:
# Evaluate the model fitted on the training split against the held-out rows.
test_score = model_trained.score(x_test, y_test)
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_test, predictions)

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

# PREDICTIONS

In [8]:
# Feature rows for three individual teams from the 68-team data: select the
# rows by name, then drop the name and target columns so only the model inputs
# remain.
virginia = (
    teams_68
    .loc[teams_68['Name'] == "Virginia"]
    .drop(columns=['Name', 'TourneyWins'])
)
unc = (
    teams_68
    .loc[teams_68['Name'] == 'North Carolina']
    .drop(columns=['Name', 'TourneyWins'])
)
duke = (
    teams_68
    .loc[teams_68['Name'] == 'Duke']
    .drop(columns=['Name', 'TourneyWins'])
)

In [13]:
# Predict Virginia's tournament wins from its feature row.
# NOTE(review): model_trained is whichever model was fitted most recently. On a
# top-to-bottom run that is the 353-team model, whereas the execution counts
# recorded in this notebook (In [13] following In [7]) suggest the saved output
# came from the first 68-team model — confirm which one is intended.
virginia_vector = virginia.values
predicted_class = model_trained.predict(virginia_vector)  # a regression estimate, despite the name
print(f"Prediction: {predicted_class}")
print("Actual: 6")  # reference value is hard-coded, not read from the data

Prediction: [3.13871686]
Actual: 6


In [14]:
# Predict North Carolina's tournament wins from its feature row, using
# whichever model_trained was fitted most recently.
unc_vector = unc.values
predicted_class = model_trained.predict(unc_vector)  # a regression estimate, despite the name
print(f"Prediction: {predicted_class}")
print("Actual: 2")  # reference value is hard-coded, not read from the data

Prediction: [1.99569747]
Actual: 2


In [15]:
# Predict Duke's tournament wins from its feature row, using whichever
# model_trained was fitted most recently.
duke_vector = duke.values
predicted_class = model_trained.predict(duke_vector)  # a regression estimate, despite the name
print(f"Prediction: {predicted_class}")
print("Actual: 3")  # reference value is hard-coded, not read from the data

Prediction: [2.2074207]
Actual: 3


In [None]:
# One hypothetical team as a single-row, 15-value input. The per-value labels
# follow the order used in the coefficient printouts above; this assumes the
# feature columns are in that same order — TODO confirm.
new_team = [[
    32,   # SeasonWins
    100,  # Points
    30,   # FGM
    10,   # FGM3
    24,   # FTA
    15,   # Off Rebounds
    25,   # Def Rebounds
    20,   # Assists
    0,    # Turnovers
    15,   # Steals
    15,   # Blocks
    2,    # Fouls
    35,   # YearsExp
    1,    # Conf_Champs
    1,    # Seed
]]

# Bare last expression so the notebook displays the predicted value.
model_trained.predict(new_team)