# MACHINE LEARNING: MULTIPLE LINEAR REGRESSION

In [3]:
import pandas as pd
import os
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the two team-level datasets from the working directory.
# Missing values are replaced with 0 and the 'Unnamed: 0' column is discarded
# (presumably an index column written out by an earlier export -- confirm).
teams_68 = (
    pd.read_csv('final_dataset68.csv')
    .fillna(0)
    .drop(columns=['Unnamed: 0'])
)
teams_353 = (
    pd.read_csv('final_dataset353.csv')
    .fillna(0)
    .drop(columns=['Unnamed: 0'])
)

# 68 TEAM DATASET

In [5]:
# Predictors: every column of the 68-team frame except the team label and the
# regression target.
x = teams_68.drop(['Name', 'TourneyWins'], axis=1)
# Target: the TourneyWins column.
y = teams_68['TourneyWins']

# Hold out a test partition; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [6]:
# Unfitted linear regression estimator for the 68-team feature set; it is
# fitted by later cells.
model = LinearRegression()

### Fit and score on the full dataset (no train/test split)

In [7]:
# Baseline: fit on the complete 68-team feature matrix with no hold-out, so the
# R2 printed below is an in-sample score.
# A dedicated estimator is created here because `fit` returns the estimator
# itself: fitting the shared `model` would leave `model_naive` pointing at an
# object that is overwritten as soon as `model` is refit on the training split.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

print(f"R2: {score_naive}")
print(' ')
# Label every coefficient with the column it was actually fitted on. A
# hand-written label per position silently mislabels (or omits) values whenever
# the column set or column order of `x` changes.
print(
    'Coefficients: \n'
    + '\n'.join(f'{name}: {value}' for name, value in zip(x.columns, coef_naive))
    + '\n'
)

R2: 0.5272154108898714
 
Coefficients: 
SeasonWins: 0.008456313383308309
Points: 0.22357546548185886
FGM: -0.5309295402999872
FGM3: -0.31505792860983234
FTA: -0.1878099895918294
Off Rebounds: -0.030116588607987646
Def Rebounds: -0.09355759260266994
Assists: 0.11661837546001975
Turnovers: -0.08459971022514032
Steals: 0.010440905486745028
Blocks: 0.20281834090527334
Fouls: 0.053694515996766064
YearsExp: -0.00323960792283752
Conf_Champs: 0.5290280243635578
Seed: -0.20290186777450636



### Fit on training data

In [8]:
# Fit on the training partition only and report the in-sample (training) R2.
# Note that `fit` returns `model` itself, so `model_trained` and `model` refer
# to the same estimator object.
model_trained = model.fit(x_train, y_train)

train_score = model_trained.score(x_train, y_train)
coef_trained = model_trained.coef_

print(f"R2: {train_score}")
print(' ')
# Label every coefficient with the column it was actually fitted on. A
# hand-written label per position silently mislabels (or omits) values whenever
# the column set or column order of the features changes.
print(
    'Coefficients: \n'
    + '\n'.join(
        f'{name}: {value}' for name, value in zip(x_train.columns, coef_trained)
    )
    + '\n'
)

R2: 0.5298862015080248
 
Coefficients: 
SeasonWins: -0.032148251706856035
Points: 0.2839378652152987
FGM: -0.7272976736918573
FGM3: -0.3398170113130885
FTA: -0.24208944201088362
Off Rebounds: -0.007991976765165887
Def Rebounds: -0.002306379657482753
Assists: 0.1909631252969669
Turnovers: -0.2912707961028005
Steals: -0.01976781308255293
Blocks: 0.2108838327295538
Fouls: 0.10860863604901522
YearsExp: -0.0007330969395006048
Conf_Champs: 0.5826671442827142
Seed: -0.1966064382209102



### Score with testing data

In [9]:
# Evaluate the training-split model on the held-out rows.
test_score = model_trained.score(x_test, y_test)  # R2 on the test partition
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

MSE: 1.3615095226928304
Test R2: 0.36536088377705167


## Drop Seed from X (Many X variables are taken into account when seeding)

In [12]:
# Same 68-team predictors as before, but with the WSeed column removed as well
# as the team label and the target.
x = teams_68.drop(['Name', 'TourneyWins', 'WSeed'], axis=1)
y = teams_68['TourneyWins']

# Re-split with the same fixed random_state used for the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [13]:
# Fresh, unfitted linear regression estimator for the feature set without the
# WSeed column; it is fitted by later cells.
model = LinearRegression()

### Fit and score on the full dataset (no train/test split)

In [14]:
# Baseline without the seed feature: fit on the complete 68-team matrix with no
# hold-out, so the R2 printed below is an in-sample score.
# A dedicated estimator is created here because `fit` returns the estimator
# itself: fitting the shared `model` would leave `model_naive` pointing at an
# object that is overwritten as soon as `model` is refit on the training split.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

print(f"R2: {score_naive}")
print(' ')
# Label every coefficient with the column it was actually fitted on. With a
# column dropped from `x`, fixed positional labels are only correct if the
# dropped column happened to be the last one; reading the names from
# `x.columns` removes that assumption.
print(
    'Coefficients: \n'
    + '\n'.join(f'{name}: {value}' for name, value in zip(x.columns, coef_naive))
)

R2: 0.41589747216336515
 
Coefficients: 
SeasonWins: 0.11019324576083249
Points: 0.4456656076528321
FGM: -1.0272135057033898
FGM3: -0.4357158980375268
FTA: -0.35229251625054037
Off Rebounds: 0.11599232097473558
Def Rebounds: -0.08514565554015624
Assists: 0.15486533389163853
Turnovers: -0.11004434048553773
Steals: 0.07539144736620679
Blocks: 0.3786454300849038
Fouls: -0.08286879604747005
YearsExp: 0.00023985500299522622
Conf_Champs: -0.393246309717818


In [15]:
# Fit on the training partition (seed feature excluded) and report the
# in-sample (training) R2.
# Note that `fit` returns `model` itself, so `model_trained` and `model` refer
# to the same estimator object.
model_trained = model.fit(x_train, y_train)

train_score = model_trained.score(x_train, y_train)
coef_trained = model_trained.coef_

print(f"R2: {train_score}")
print(' ')
# Label every coefficient with the column it was actually fitted on. With a
# column dropped from the features, fixed positional labels are only correct if
# the dropped column happened to be the last one; reading the names from
# `x_train.columns` removes that assumption.
print(
    'Coefficients: \n'
    + '\n'.join(
        f'{name}: {value}' for name, value in zip(x_train.columns, coef_trained)
    )
)

R2: 0.4396714785816603
 
Coefficients: 
SeasonWins: 0.0639458929699065
Points: 0.43602342889308804
FGM: -1.06697970660902
FGM3: -0.3291408955823764
FTA: -0.3737392506698489
Off Rebounds: 0.14171861201649597
Def Rebounds: 0.03728321077621734
Assists: 0.24371680113359612
Turnovers: -0.27173505521510477
Steals: 0.08360004582094914
Blocks: 0.3611345310481574
Fouls: -0.08377700076194135
YearsExp: -0.014745021933997848
Conf_Champs: -0.5120445608509133


### Score with testing data

In [16]:
# Held-out evaluation of the model fitted on the training partition.
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
test_score = model_trained.score(x_test, y_test)  # R2 on the test partition

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

MSE: 1.9138111194945409
Test R2: 0.1079170749452868


# 353 TEAM DATASET

In [17]:
# Predictors: every column of the 353-team frame except the team label and the
# regression target.
x = teams_353.drop(['Name', 'TourneyWins'], axis=1)
# Target: the TourneyWins column.
y = teams_353['TourneyWins']

# Hold out a test partition; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [18]:
# Fresh, unfitted linear regression estimator for the 353-team feature set; it
# is fitted by later cells.
model = LinearRegression()

### Fit and score on the full dataset (no train/test split)

In [19]:
# Baseline: fit on the complete 353-team feature matrix with no hold-out, so
# the R2 printed below is an in-sample score.
# A dedicated estimator is created here because `fit` returns the estimator
# itself: fitting the shared `model` would leave `model_naive` pointing at an
# object that is overwritten as soon as `model` is refit on the training split.
model_naive = LinearRegression().fit(x, y)

score_naive = model_naive.score(x, y)
coef_naive = model_naive.coef_

print(f"R2: {score_naive}")
print(' ')
# Label every coefficient with the column it was actually fitted on. A
# hand-written label per position silently mislabels (or omits) values whenever
# the column set or column order of `x` changes.
print(
    'Coefficients: \n'
    + '\n'.join(f'{name}: {value}' for name, value in zip(x.columns, coef_naive))
    + '\n'
)

R2: 0.26081558615335276
 
Coefficients: 
SeasonWins: 0.041173198773778265
Points: 0.12091617582048202
FGM: -0.286801624314474
FGM3: -0.135963501218863
FTA: -0.11459231810810981
Off Rebounds: 0.03992460298830849
Def Rebounds: -0.0035367718125714556
Assists: 0.048910642540339416
Turnovers: -0.01737018290681564
Steals: 0.01576543888053094
Blocks: 0.13366667275333724
Fouls: 0.002546519037889595
YearsExp: 0.0023049427477746173
Conf_Champs: 0.08780543831064949
Seed: -0.011315566398081074



### Fit on training data

In [2]:
# Fit on the 353-team training partition and report the in-sample R2.
# This cell needs `model`, `x_train` and `y_train` from the preceding cells;
# running it before them fails with a NameError.
# Note that `fit` returns `model` itself, so `model_trained` and `model` refer
# to the same estimator object.
model_trained = model.fit(x_train, y_train)

train_score = model_trained.score(x_train, y_train)
coef_trained = model_trained.coef_

print(f"R2: {train_score}")
print(' ')
# Label every coefficient with the column it was actually fitted on. A
# hand-written label per position silently mislabels (or omits) values whenever
# the column set or column order of the features changes.
print(
    'Coefficients: \n'
    + '\n'.join(
        f'{name}: {value}' for name, value in zip(x_train.columns, coef_trained)
    )
    + '\n'
)

NameError: name 'model' is not defined

### Score with testing data

In [21]:
# Score the training-partition model against the held-out rows.
test_score = model_trained.score(x_test, y_test)  # R2 on the test partition
predictions = model_trained.predict(x_test)
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)

print(f"MSE: {MSE}", f"Test R2: {test_score}", sep="\n")

MSE: 0.5291166125075055
Test R2: 0.18460453547238287


# PREDICTION

In [27]:
# Feature rows for three named teams, taken from the 68-team frame. Only the
# team label and the target are removed, so every other column is kept.
virginia = teams_68[teams_68['Name'].eq("Virginia")].drop(
    ['Name', 'TourneyWins'], axis=1
)
unc = teams_68[teams_68['Name'].eq('North Carolina')].drop(
    ['Name', 'TourneyWins'], axis=1
)
duke = teams_68[teams_68['Name'].eq('Duke')].drop(
    ['Name', 'TourneyWins'], axis=1
)

In [40]:
# Raw feature values for Virginia; retained in case other cells use the array.
virginia_vector = virginia.values
# Predict from the DataFrame rather than the bare array. `.values` strips the
# column names, which stops scikit-learn from checking the input against the
# feature names seen during fit (and triggers a warning on versions >= 1.0).
# NOTE(review): `model_trained` is whichever estimator the most recently
# executed fit cell produced -- confirm it is the model intended for these rows.
# Despite its name, `predicted_class` holds a regression estimate, not a class.
predicted_class = model_trained.predict(virginia)
print(f"Prediction: {predicted_class}")
# Reference value hard-coded by the author, not read from the data.
print("Actual: 6")

Prediction: [3.13871686]
Actual: 6


In [42]:
# Raw feature values for North Carolina; retained in case other cells use the
# array.
unc_vector = unc.values
# Predict from the DataFrame rather than the bare array. `.values` strips the
# column names, which stops scikit-learn from checking the input against the
# feature names seen during fit (and triggers a warning on versions >= 1.0).
# NOTE(review): `model_trained` is whichever estimator the most recently
# executed fit cell produced -- confirm it is the model intended for these rows.
# Despite its name, `predicted_class` holds a regression estimate, not a class.
predicted_class = model_trained.predict(unc)
print(f"Prediction: {predicted_class}")
# Reference value hard-coded by the author, not read from the data.
print("Actual: 2")

Prediction: [1.99569747]
Actual: 2


In [41]:
# Raw feature values for Duke; retained in case other cells use the array.
duke_vector = duke.values
# Predict from the DataFrame rather than the bare array. `.values` strips the
# column names, which stops scikit-learn from checking the input against the
# feature names seen during fit (and triggers a warning on versions >= 1.0).
# NOTE(review): `model_trained` is whichever estimator the most recently
# executed fit cell produced -- confirm it is the model intended for these rows.
# Despite its name, `predicted_class` holds a regression estimate, not a class.
predicted_class = model_trained.predict(duke)
print(f"Prediction: {predicted_class}")
# Reference value hard-coded by the author, not read from the data.
print("Actual: 3")

Prediction: [2.2074207]
Actual: 3
