# Player DF  
###### Handling NaN and categorical data

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the yearly NCAA player statistics (path is relative to this notebook).
playerDf = pd.read_csv("../../ncaa-players-2010-2021-yearly.csv")

# Names of the columns pandas read with dtype 'object'; these are the
# categorical candidates that need encoding before modelling.
cat_features = [
    column
    for column, dtype in playerDf.dtypes.items()
    if dtype == 'object'
]

# playerDf['position'].unique()

def get_player_height(height):
    """Convert a 'feet-inches' height such as '6-10' into total inches.

    Parameters
    ----------
    height : object
        Raw height value. It is stringified before parsing, so NaN and other
        non-string values are accepted.

    Returns
    -------
    int
        ``12 * feet + inches``, or 0 when the value does not split on '-'
        into exactly two integer parts (e.g. NaN, '6', '6-', 'abc-def').
        The caller treats 0 as a "missing" placeholder.
    """
    feet_inches = str(height).split('-')
    if len(feet_inches) != 2:
        return 0

    try:
        feet, inches = (int(part) for part in feet_inches)
    except ValueError:
        # A malformed value is reported as missing instead of aborting the
        # conversion of the whole column.
        return 0

    return 12 * feet + inches

# Convert every raw height to inches; values that cannot be parsed come back as 0.
# Only the 'height' column is needed, so apply on the Series rather than row-wise.
playerDf['height'] = playerDf['height'].apply(get_player_height)

# Replace the 0 placeholders with the mean of the valid heights only.
# Including the zeros in the mean would bias the imputed value downwards.
valid_height_mean = playerDf.loc[playerDf['height'] != 0, 'height'].mean()
playerDf['height'] = playerDf['height'].replace(0, valid_height_mean)

def convert_catagorical_num(playerDf):
    """Label-encode the categorical columns and fill the remaining NaNs with 0.

    Parameters
    ----------
    playerDf : pandas.DataFrame
        Must contain the columns ``player_id``, ``team_abbreviation``,
        ``conference``, ``position`` and ``season``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which those five columns hold integer label codes,
        missing positions have been replaced by the most frequent position,
        and every other NaN is replaced by 0. The input frame is not modified.
    """
    # Impute missing positions BEFORE encoding. Once encoded, the column
    # contains only integer codes, so a fillna applied afterwards can never
    # find anything to fill.
    position = playerDf['position']
    position_mode = position.mode()
    if not position_mode.empty:
        position = position.fillna(position_mode.iloc[0])

    def _encode(column):
        # A fresh encoder per column; the fitted mapping is not kept.
        return LabelEncoder().fit_transform(column)

    encoded = playerDf.assign(
        player_id=_encode(playerDf.player_id),
        team_abbreviation=_encode(playerDf.team_abbreviation),
        conference=_encode(playerDf.conference),
        position=_encode(position),
        season=_encode(playerDf.season),
    )

    # Any NaN left in the other columns becomes 0.
    return encoded.fillna(value=0)

# Rebind playerDf to the label-encoded, NaN-filled frame; the later cells
# in this notebook read this version.
playerDf = convert_catagorical_num(playerDf)
    

In [50]:
import plotly.express as plot
from sklearn.feature_selection import SelectKBest,f_regression


def feature_Selection_points(X_train,y_train,X_test):
    """Score every feature against the target with univariate F-regression.

    Parameters
    ----------
    X_train : training predictors used to fit the selector.
    y_train : training target.
    X_test : test predictors; only transformed, never used for fitting.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, fs)``: the transformed train and test
        inputs and the fitted ``SelectKBest`` instance. With ``k='all'`` no
        column is removed; the selector is used for its ``scores_``.
    """
    fs = SelectKBest(score_func=f_regression, k='all')
    # Fit exactly once, on the training split, and transform it in the same call.
    X_train_fs = fs.fit_transform(X_train, y_train)
    # Apply the already-fitted selector to the test split.
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

def plot_feature_score(featureName,featureScore):
    """Show a bar chart with one bar per feature score.

    Parameters
    ----------
    featureName : sequence of feature labels (x axis).
    featureScore : sequence of scores, aligned with ``featureName`` (y axis).
    """
    # Use the arguments rather than a global selector, so the chart always
    # reflects what the caller passed in.
    featureScoreDf = pd.DataFrame({'Features': featureName, 'Score': featureScore})
    fig = plot.bar(featureScoreDf, y='Score', x='Features', text_auto='.2s')
    fig.show()


## Feature Selection On All features


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score

# Target and predictors: only points and three directly related totals are removed.
y = playerDf['points']
X = playerDf.drop(['points','field_goals','field_goal_attempts','points_produced'],axis=1)

# Fixed seed so the split, and therefore the metrics printed below, are reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=1)

# Score every feature against the target (k='all', so nothing is dropped).
X_train_fs,X_test_fs,fs = feature_Selection_points(X_train,y_train,X_test)

plot_feature_score(fs.feature_names_in_,fs.scores_)

# Fit the model on the training split.
model = LinearRegression()
model.fit(X_train_fs,y_train)

# Predict on the held-out split.
y_predict = model.predict(X_test_fs)

# Report the error metrics.
# NOTE(review): columns such as two_pointers / free_throws are still in X and
# presumably determine points almost exactly, which would explain a
# near-perfect R2 here. Confirm before trusting this score.
print('MAE: %.3f' % mean_absolute_error(y_test, y_predict))
print('R2_Score',r2_score(y_test,y_predict))


MAE: 0.001
R2_Score 0.9999999999798239


## Heatmap to find correlation with the target variable

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of all columns; only the 'points' column of that
# matrix is drawn, ordered from most positive to most negative.
plt.figure(figsize=(50, 50))
corr = playerDf.corr()
points_corr = corr[['points']].sort_values(by='points', ascending=False)
heatmap = sns.heatmap(points_corr, vmin=-1, vmax=1, annot=True)
heatmap.set_title(
    'Features Correlating with Points',
    fontdict={'fontsize': 18},
    pad=18,
);

# #corelation with output variable
# cor_target = abs(corr["points"])

# #selecting highly corelated features
# relevant_feature = cor_target[cor_target>0.5]
# print(relevant_feature)



## Drop features with low and high correlation

In [37]:
# Columns excluded from the predictors in the modelling cells: the target
# itself plus the features judged too strongly or too weakly correlated with it.
droped_features = [
    'points',
    'field_goals',
    'field_goal_attempts',
    'points_produced',
    'free_throw_attempts',
    'free_throws',
    'two_point_attempts',
    'two_pointers',
    'win_shares',
    'minutes_played',
    'turnover_percentage',
    'offensive_rebound_percentage',
    'height',
    'free_throw_attempt_rate',
    'conference',
    'block_percentage',
    'weight',
    'player_id',
    'team_abbreviation',
    'total_rebound_percentage',
    'season',
    'three_point_attempt_rate',
]

# feature selection on new dataframe


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score

# Target and predictors: everything listed in droped_features is removed.
y = playerDf['points']
X = playerDf.drop(droped_features,axis=1)

# Fixed seed so the split, and therefore the metrics printed below, are reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=1)

# Score every remaining feature against the target (k='all', so nothing is dropped).
X_train_fs,X_test_fs,fs = feature_Selection_points(X_train,y_train,X_test)

plot_feature_score(fs.feature_names_in_,fs.scores_)

# Fit the model on the training split.
model = LinearRegression()
model.fit(X_train_fs,y_train)

# Predict on the held-out split.
y_predict = model.predict(X_test_fs)

# Report the error metrics.
print('MAE: %.3f' % mean_absolute_error(y_test, y_predict))
print('R2_Score',r2_score(y_test,y_predict))


MAE: 18.194
R2_Score 0.9714807503739485


## Random Forest Regressor


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
import plotly.express as plt
# import matplotlib.pyplot as plt

y = playerDf['points']
X = playerDf.drop(droped_features,axis=1)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

# Standardise with statistics learned from the training split only, then apply
# the same transform to the test split. Fitting a second scaler on the test
# data would scale the two splits differently and leak test statistics.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Build the model. The target stays 1-D, which is the shape the regressor
# expects, and the forest is seeded so the run is reproducible.
forest = RandomForestRegressor(random_state=1)
forest.fit(X_train_std,y_train)

# R^2 on the held-out split.
print(forest.score(X_test_std,y_test))

# Predict on the held-out split.
y_predict = forest.predict(X_test_std)

# Mean absolute error of those predictions.
mae = mean_absolute_error(y_test,y_predict)
print('MAE: %.3f' % mae)

# Feature importances, largest first. The table is kept in a variable and
# drives the chart below instead of being built and discarded.
feature_importance = (
    pd.DataFrame({"feature": X_train.columns, "weight": forest.feature_importances_})
    .sort_values("weight", ascending=False)
    .reset_index(drop=True)
)

# `plt` is plotly.express in this cell (see the import above), not matplotlib.
fig = plt.bar(feature_importance, x="feature", y="weight")
fig.show()


## Polynomial regression


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

y = playerDf['points']
X = playerDf.drop(droped_features,axis=1)

# Expand the predictors with all degree-2 terms (squares and pairwise
# products); no constant column because LinearRegression fits its own intercept.
poly = PolynomialFeatures(degree=2,include_bias=False)
ploy_features = poly.fit_transform(X)

# Fixed seed so the split, and therefore the metrics printed below, are reproducible.
X_train,X_test,y_train,y_test = train_test_split(ploy_features,y,test_size=0.33,random_state=1)

poly_reg_model = LinearRegression()
poly_reg_model.fit(X_train,y_train)

y_predicted = poly_reg_model.predict(X_test)
# Show a sample of the predictions rather than the whole array.
print(y_predicted[:10])

# Evaluate predictions. The square root of the MAE is not a meaningful error
# measure, so report the root mean squared error alongside the MAE instead.
mae = mean_absolute_error(y_test, y_predicted)
rmse = np.sqrt(np.mean((y_test - y_predicted) ** 2))
print('MAE: %.3f' % mae)
print('RMSE: %.3f' % rmse)


## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np

y = playerDf['points']
X = playerDf.drop(droped_features,axis=1)

# Fixed seed so the split, and therefore the metrics printed below, are reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=1)

# Named for what it is, so it does not overwrite the polynomial model
# that the previous section binds to `poly_reg_model`.
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train,y_train)

y_predicted = linear_reg_model.predict(X_test)
# Show a sample of the predictions rather than the whole array.
print(y_predicted[:10])

# Evaluate predictions. The square root of the MAE is not a meaningful error
# measure, so report the root mean squared error alongside the MAE instead.
mae = mean_absolute_error(y_test, y_predicted)
rmse = np.sqrt(np.mean((y_test - y_predicted) ** 2))
print('MAE: %.3f' % mae)
print('RMSE: %.3f' % rmse)



## Drop rows with zero minutes played and add a new feature, pointsperminute

In [42]:
# Keep only rows with playing time, so the division below never divides by zero.
# Filtering first and adding the column with assign() yields a new,
# independent frame: playerDf is untouched and no column is set on a slice.
has_minutes = playerDf['minutes_played'] != 0
newPlayerDf = playerDf.loc[has_minutes].assign(
    pointsperminute=lambda players: players['points'] / players['minutes_played']
)
newPlayerDf.head(5)

Unnamed: 0.1,Unnamed: 0,player_id,team_abbreviation,season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,conference,...,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,weight,win_shares,win_shares_per_40_minutes,season_order,pointsperminute
0,0,522,325,15,0.0,0,0.0,0,-27.0,0,...,0.0,0,0.0,0,17.2,160.0,0.0,-0.25,0,0.0
1,1,522,325,16,0.0,0,0.0,0,47.8,0,...,0.0,1,1.0,1,52.6,160.0,0.0,1.42,1,2.0
2,0,1252,120,6,19.7,3,0.0,0,-11.4,34,...,6.0,4,0.5,2,24.6,170.0,-0.1,-0.075,0,0.225806
3,0,6839,258,9,1.5,1,3.4,4,-8.8,16,...,7.0,34,0.412,14,19.4,215.0,0.0,-0.005,0,0.261538
4,1,6839,258,10,3.0,2,3.2,4,-9.0,16,...,15.0,32,0.469,15,21.3,215.0,0.1,0.033,1,0.292208


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Target and predictors from the frame restricted to players with minutes.
# NOTE(review): pointsperminute stays in X and is computed from points, so it
# carries target information into the predictors. Confirm this is intended.
y = newPlayerDf['points']
X = newPlayerDf.drop(droped_features,axis=1)

# Fixed seed so the split, and therefore the metrics printed below, are reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=1)

# Score every remaining feature against the target (k='all', so nothing is dropped).
X_train_fs,X_test_fs,fs = feature_Selection_points(X_train,y_train,X_test)

plot_feature_score(fs.feature_names_in_,fs.scores_)

# Fit the model on the training split.
model = LinearRegression()
model.fit(X_train_fs,y_train)

# Predict on the held-out split.
y_predict = model.predict(X_test_fs)

# Report the error metrics.
print('MAE: %.3f' % mean_absolute_error(y_test, y_predict))
print('R2_Score',r2_score(y_test,y_predict))


MAE: 17.829
R2_Score 0.9727851941759519


In [None]:
from datetime import datetime
from sportsipy.ncaab.boxscore import Boxscores,Boxscore,BoxscorePlayer
import pandas as pd
import numpy as np


# games = Boxscores(datetime(2019, 11, 11))
# Fetch one box score by its game identifier.
game_data = Boxscore('2019-11-11-21-saint-marys-ca')

# Collect the per-player frame of every away player and stack them into one
# table. The list gets its own name rather than reusing one that held a DataFrame.
away_player_frames = [player.dataframe for player in game_data.away_players]
gamePlayerDf = pd.concat(away_player_frames,ignore_index=True)

print('playersDF',gamePlayerDf)

# Remove the rate columns that are also excluded from the training features.
# Rebinding rather than dropping in place keeps the cell free of in-place edits.
gamePlayerDf = gamePlayerDf.drop(['turnover_percentage','offensive_rebound_percentage','free_throw_attempt_rate'
                   ,'block_percentage','total_rebound_percentage'
                   ,'three_point_attempt_rate'],axis=1)

# Compare the column sets of the season data and the single-game data.
print(playerDf.columns.difference(gamePlayerDf.columns))
print(gamePlayerDf.columns.difference(playerDf.columns))
print('common cols',np.intersect1d(playerDf.columns,gamePlayerDf.columns))

# defensive_win_shares,games_played,games_started,offensive_win_shares,win_shares


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import plotly.express as plt

# Zero-fill the box-score frame in place (this mutates the frame built in the
# previous cell).
gamePlayerDf.fillna(value=0,inplace=True)

# Training data comes from the season-level frame.
# NOTE(review): the extra columns dropped here are presumably the ones the
# box-score frame does not provide. Confirm against the column comparison
# printed in the previous cell.
extra_train_drops = [
    'defensive_win_shares', 'games_played', 'games_started', 'offensive_win_shares',
    'win_shares', 'pointsperminute', 'box_plus_minus', 'defensive_box_plus_minus',
    'offensive_box_plus_minus', 'player_efficiency_rating', 'position',
    'season_order', 'win_shares_per_40_minutes',
]
y_train = newPlayerDf['points']
X_train = newPlayerDf.drop(droped_features + extra_train_drops, axis=1)
# Discard the first remaining column by position.
# NOTE(review): looks like a leftover CSV index column. Verify which column this is.
X_train = X_train.iloc[:, 1:]

# Test data comes from the single-game frame.
# NOTE(review): X_test must end up with the same columns, in the same order,
# as X_train for the fitted model to apply. Verify.
test_drops = [
    'points', 'defensive_rating', 'field_goal_attempts', 'field_goals',
    'free_throw_attempts', 'free_throws', 'minutes_played',
    'offensive_rating', 'two_point_attempts', 'two_pointers',
]
y_test = gamePlayerDf['points']
X_test = gamePlayerDf.drop(test_drops, axis=1)

def feature_Selection_points(X_train,y_train,X_test):
    """Fit an F-regression ``SelectKBest`` on the training data and apply it.

    This redefines (and therefore shadows) the function of the same name
    declared earlier in the notebook.

    Returns ``(X_train_fs, X_test_fs, fs)``: both inputs passed through the
    fitted selector, plus the selector itself. ``k='all'`` keeps every column.
    """
    # fit() returns the selector, so construction and fitting can be chained.
    fs = SelectKBest(score_func=f_regression, k='all').fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

# Score the features on the season data and pass both sets through the selector.
X_train_fs, X_test_fs, fs = feature_Selection_points(X_train, y_train, X_test)

# Train on the season data, then predict the points of the players in the game.
model = LinearRegression()
model.fit(X_train_fs, y_train)
y_predict = model.predict(X_test_fs)

# Mean absolute error of the predictions.
mae = mean_absolute_error(y_test, y_predict)
print('MAE: %.3f' % mae)

# List each feature's score, then chart them.
for i, (feature_name, feature_score) in enumerate(zip(fs.feature_names_in_, fs.scores_)):
    print('Feature%d %s: %f' % (i, feature_name, feature_score))

# `plt` is plotly.express in this cell.
bar_positions = list(range(len(fs.scores_)))
fig = plt.bar(x=bar_positions, y=fs.scores_, text=fs.feature_names_in_)
fig.show()