<a href="https://colab.research.google.com/github/dimitramuni/NBA-Player-Profile-analysis/blob/main/Script/Regression_and_Gradient_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files stored
# under MyDrive can be read through the local file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, normalize
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
class Regression_GradientBoost:
    '''
    Regression and gradient boosting on one Portland season game log.

    i) method: regression(), visualises score difference versus the
       salary ratio between Portland and the opponent team, together
       with the predictions of a fitted linear regression.
    ii) method: gradient_boost(), predicts the game result for Portland
        for the last games of the season with a gradient-boosting
        classifier.

    Attributes set by __init__:
      df_season            the season dataframe including derived columns
      df_home_gamelog      rows of df_season that are home games
      df_road_gamelog      rows of df_season that are road games
      home_inds, road_inds np.where() results (1-tuples holding the
                           positional row indices of home / road games)
      random_seed          random_state handed to the classifier
      test_split           stored only; no method of this class reads it
    '''

    def __init__(self, df_season, random_seed=1728, test_split=0.2):
        '''
        Derive the helper columns and split the log into home/road games.

        NOTE: df_season is modified in place -- the columns 'HomeRoad',
        'DiffScore', 'diff_salary', 'Previous_Result', 'Prev_Win' and
        'HomeGame' are added and 'Date' is converted to datetime.

        Columns read here: 'Sep', 'Date', 'TmScore', 'OppScore',
        'Result', 'Portland offensive', 'Portland defensive',
        'Opponent offensive', 'Opponent defensive'.

        df_season features for season 1999 - 2000:

        ['Date', 'Sep', 'Opp', 'Result', 'TmScore',
         'OppScore', 'Attendance', 'match number',
         'Portland defensive', 'Portland offensive',
         'Opponent defensive', 'Opponent offensive',
         'Home_cluster1' ... 'Home_cluster8',
         'opponent_cluster1' ... 'opponent_cluster8']

        df_season features for season 2003 - 2004:
        the same columns, plus 'Home_cluster9' and 'opponent_cluster9'.

        Parameters:
          df_season   -- pandas DataFrame with one row per game
          random_seed -- seed used as random_state of the classifier
          test_split  -- kept for interface compatibility (unused)
        '''

        # new column indicating Home or Away game: a '@' in the 'Sep'
        # column marks a road game, anything else counts as a home game
        df_season['HomeRoad'] = np.where(
            df_season['Sep'] == '@', 'Road', 'Home')

        # converting Date column into pandas datetime
        df_season['Date'] = pd.to_datetime(df_season['Date'])

        # difference in score for Portland against the opponent
        df_season['DiffScore'] = (
            df_season['TmScore'] - df_season['OppScore'])

        # total offensive salary (both teams) minus total defensive
        # salary (both teams)
        df_season['diff_salary'] = (
            df_season['Portland offensive']
            + df_season['Opponent offensive']
            - df_season['Portland defensive']
            - df_season['Opponent defensive'])

        # result of the previous game; NaN for the first row
        df_season['Previous_Result'] = df_season['Result'].shift(1)

        # binary encoding of Previous_Result; the NaN of the first
        # row compares unequal to 'W' and is therefore encoded as 0
        df_season['Prev_Win'] = np.where(
            df_season['Previous_Result'] == 'W', 1, 0)

        # binary encoding of the HomeRoad column
        df_season['HomeGame'] = np.where(
            df_season['HomeRoad'] == 'Home', 1, 0)

        # positional indices of home and away games (np.where returns
        # a 1-tuple whose only element is the index array)
        self.home_inds = np.where(df_season['HomeRoad'] == 'Home')
        self.road_inds = np.where(df_season['HomeRoad'] == 'Road')

        # assignment of public variables
        self.df_home_gamelog = df_season.iloc[self.home_inds[0]]
        self.df_road_gamelog = df_season.iloc[self.road_inds[0]]
        self.df_season = df_season
        self.random_seed = random_seed
        self.test_split = test_split

    def regression(self, location='Home'):
        '''
        Plot score difference against the Portland/opponent salary ratio.

        A linear regression of 'DiffScore' on the log salary ratio is
        fitted on the selected games; the data points and the model
        predictions are drawn into a new matplotlib figure.

        Parameters:
          location -- 'Home' or 'Road', selects which games are used

        Raises:
          ValueError -- if location is neither 'Home' nor 'Road'
                        (previously this surfaced as an
                        UnboundLocalError further down)
        '''
        gamelogs = {'Home': self.df_home_gamelog,
                    'Road': self.df_road_gamelog}
        if location not in gamelogs:
            raise ValueError(
                "location must be 'Home' or 'Road', got %r" % (location,))
        gamelog = gamelogs[location]

        portland_salary = (gamelog['Portland offensive']
                           + gamelog['Portland defensive'])
        opponent_salary = (gamelog['Opponent defensive']
                           + gamelog['Opponent offensive'])
        y = gamelog[['DiffScore']]

        # log difference of Portland and opponent salary, i.e. the log
        # of the salary ratio; reshape(-1,1) turns the 1D values into
        # the 2D (n_samples, 1) array expected by LinearRegression
        log_ratio = np.log(portland_salary) - np.log(opponent_salary)
        X = np.asarray(log_ratio).reshape(-1, 1)

        # fitting linear regression model and predicting on the same data
        linmodel = LinearRegression().fit(X, y)
        y_pred = linmodel.predict(X)

        # plotting score diff vs. salary; np.exp undoes the log so the
        # x axis shows the plain salary ratio
        salary_ratio = np.exp(X)
        plt.figure(figsize=(12, 8))
        plt.grid(True)
        plt.scatter(salary_ratio, y)
        plt.scatter(salary_ratio, y_pred)
        plt.legend(['Data Points', 'Linear Predictions'])
        # the title is built from separate literals: a backslash
        # continuation inside the string used to embed the source
        # indentation as a long run of spaces in the rendered title
        plt.title('Portland Trail Blazers ' + location
                  + ' Games, Score Difference versus Salary Budget')
        plt.xlabel('Ratio of Portland and Opponent Salaries')
        plt.ylabel('Score Difference')

    def gradient_boost(self, n_train=70):
        '''
        Predict late-season results with a gradient-boosting classifier.

        For every i in n_train .. n_games-2 a fresh classifier is
        fitted on the first i games and asked to predict the game at
        row position i+1 (game number i+2). With the default n_train
        and an 82-game season this evaluates games 72 to 82.

        The raw result array, the confusion matrix and the
        classification report are printed.

        Parameters:
          n_train -- number of games in the first training window

        Returns:
          DataFrame with columns 'Game', 'Actual', 'Predicted',
          sorted by 'Game'.

        Raises:
          ValueError -- if n_train leaves no game to evaluate
        '''
        # columns that are not considered as features for the
        # classifier ('match number' is only a row counter)
        excluded = {'Date', 'Sep', 'Opp',
                    'TmScore', 'OppScore',
                    'Portland defensive',
                    'Portland offensive',
                    'Opponent defensive',
                    'Opponent offensive',
                    'DiffScore', 'HomeRoad',
                    'Result', 'Previous_Result',
                    'match number'}

        # a list that keeps the dataframe's column order: indexing a
        # DataFrame with a set is rejected by pandas >= 2.0 and made
        # the feature order vary from run to run
        feature_columns = [col for col in self.df_season.columns
                           if col not in excluded]
        X = self.df_season[feature_columns]

        # game results as a 1D array
        y = self.df_season['Result'].to_numpy()
        print(y)

        n_games = len(y)
        if not 0 < n_train < n_games - 1:
            raise ValueError(
                'n_train=%d leaves no game to predict in a season of '
                '%d games' % (n_train, n_games))

        game_no = []
        game_original_result = []
        game_pred = []
        for i_matches in range(n_train, n_games - 1):
            # NOTE(review): the game at row position i_matches is
            # neither trained on nor predicted -- training stops one
            # game before the test game. Kept as in the original
            # evaluation; confirm whether this gap is intended.
            test_pos = i_matches + 1

            # creating and fitting a fresh GradientBoost classifier
            classifier = GradientBoostingClassifier(
                random_state=self.random_seed)
            classifier.fit(X.iloc[:i_matches], y[:i_matches])

            # the test row stays a one-row DataFrame so the feature
            # names match those seen during fit; [0] unwraps the
            # single prediction into a scalar label
            predicted = classifier.predict(X.iloc[[test_pos]])[0]

            game_no.append(test_pos + 1)
            game_original_result.append(y[test_pos])
            game_pred.append(predicted)

        print('Confusion Matrix\n',
              confusion_matrix(game_original_result, game_pred))
        print(classification_report(y_true=game_original_result,
                                    y_pred=game_pred))

        df = pd.DataFrame({'Game': game_no,
                           'Actual': game_original_result,
                           'Predicted': game_pred})
        return df.sort_values(by='Game')

# Season 1999 - 2000

In [None]:
## Season 1999 - 2000 gamelog
# Builds the per-game feature table for the season (game log + salaries
# + SPM cluster columns) and runs the gradient-boosting evaluation.
df_gamelog=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/Portland_gamelog_1999_2000.csv')
# 1-based running game number, used as the join key below
df_gamelog['match number']=np.arange(1,df_gamelog.shape[0]+1)

#reading Home and Road game salaries
df_HG_salary=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/homegame1999_salary.csv').drop('Unnamed: 0',axis=1)
df_RG_salary=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/awaygame1999_salary.csv').drop('Unnamed: 0',axis=1)

#reading SPM dataframe
df_SPM=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/SPMplayers1999.csv').drop('Unnamed: 0',axis=1)
# the SPM rows are numbered by position, i.e. the file is taken to be
# in game order -- TODO confirm against how the CSV was produced
df_SPM['match number']=np.arange(1,df_SPM.shape[0]+1)

#joining home and road game salaries and creating a combined dataframe
#(pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
df_salary=pd.concat([df_HG_salary,df_RG_salary]).sort_values(by='match number')
#temporary dataframe joining gamelog with salary combined dataframe
temp_df=pd.merge(df_gamelog, df_salary, on='match number', how = "inner")
#creating a final dataframe by merging temporary dataframe with SPM dataframe
season_df=pd.merge(temp_df,df_SPM,on='match number', how = "inner")

print(season_df.columns)

obj=Regression_GradientBoost(season_df,random_seed=1728,test_split=0.20)
# uncomment to draw the home / road regression plots
#obj.regression('Home')
#obj.regression('Road')
# last expression of the cell: the prediction table is displayed
obj.gradient_boost()

Index(['Date', 'Sep', 'Opp', 'Result', 'TmScore', 'OppScore', 'Attendance',
       'match number', 'Portland defensive', 'Portland offensive',
       'Opponent defensive', 'Opponent offensive', 'Home_cluster1',
       'Home_cluster2', 'Home_cluster3', 'Home_cluster4', 'Home_cluster5',
       'Home_cluster6', 'Home_cluster7', 'Home_cluster8', 'opponent_cluster1',
       'opponent_cluster2', 'opponent_cluster3', 'opponent_cluster4',
       'opponent_cluster5', 'opponent_cluster6', 'opponent_cluster7',
       'opponent_cluster8'],
      dtype='object')
['W' 'W' 'W' 'W' 'L' 'W' 'W' 'W' 'W' 'W' 'W' 'L' 'W' 'W' 'W' 'L' 'W' 'L'
 'W' 'L' 'W' 'W' 'W' 'L' 'L' 'W' 'W' 'W' 'W' 'W' 'W' 'L' 'W' 'W' 'W' 'W'
 'L' 'L' 'W' 'W' 'W' 'W' 'W' 'W' 'L' 'W' 'W' 'W' 'W' 'W' 'W' 'W' 'W' 'W'
 'W' 'W' 'L' 'W' 'L' 'L' 'L' 'W' 'W' 'W' 'L' 'W' 'L' 'W' 'L' 'L' 'W' 'W'
 'W' 'L' 'L' 'W' 'W' 'L' 'W' 'W' 'W' 'L']
Confusion Matrix
 [[2 2]
 [0 7]]
              precision    recall  f1-score   support

           L       1.0

Unnamed: 0,Game,Actual,Predicted
0,72,W,[W]
1,73,W,[W]
2,74,L,[L]
3,75,L,[W]
4,76,W,[W]
5,77,W,[W]
6,78,L,[L]
7,79,W,[W]
8,80,W,[W]
9,81,W,[W]


# Season 2003 - 2004

In [None]:
## Season 2003 - 2004 gamelog
# Builds the per-game feature table for the season (game log + salaries
# + SPM cluster columns) and runs the gradient-boosting evaluation.
df_gamelog=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/Portland_gamelog_2003_2004.csv')
# 1-based running game number, used as the join key below
df_gamelog['match number']=np.arange(1,df_gamelog.shape[0]+1)

#reading Home and Road game salaries
df_HG_salary=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/homegame2003_salary.csv').drop('Unnamed: 0',axis=1)
df_RG_salary=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/awaygame2003_salary.csv').drop('Unnamed: 0',axis=1)

#reading SPM dataframe
df_SPM=pd.read_csv('/content/drive/MyDrive/SportAnalytics/gradientBoost/SPMplayers2003alt.csv').drop('Unnamed: 0',axis=1)
# the SPM rows are numbered by position, i.e. the file is taken to be
# in game order -- TODO confirm against how the CSV was produced
df_SPM['match number']=np.arange(1,df_SPM.shape[0]+1)

#joining home and road game salaries and creating a combined dataframe
#(pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
df_salary=pd.concat([df_HG_salary,df_RG_salary]).sort_values(by='match number')
#temporary dataframe joining gamelog with salary combined dataframe
temp_df=pd.merge(df_gamelog, df_salary, on='match number', how = "inner")
#creating a final dataframe by merging temporary dataframe with SPM dataframe
season_df=pd.merge(temp_df,df_SPM,on='match number', how = "inner")


print(season_df.columns)

obj=Regression_GradientBoost(season_df,random_seed=1728,test_split=0.20)
# uncomment to draw the home / road regression plots
#obj.regression('Home')
#obj.regression('Road')
# last expression of the cell: the prediction table is displayed
obj.gradient_boost()

Index(['Date', 'Sep', 'Opp', 'Result', 'TmScore', 'OppScore', 'Attendance',
       'match number', 'Portland defensive', 'Portland offensive',
       'Opponent defensive', 'Opponent offensive', 'Home_cluster1',
       'Home_cluster2', 'Home_cluster3', 'Home_cluster4', 'Home_cluster5',
       'Home_cluster6', 'Home_cluster7', 'Home_cluster8', 'Home_cluster9',
       'opponent_cluster1', 'opponent_cluster2', 'opponent_cluster3',
       'opponent_cluster4', 'opponent_cluster5', 'opponent_cluster6',
       'opponent_cluster7', 'opponent_cluster8', 'opponent_cluster9'],
      dtype='object')
['L' 'W' 'L' 'W' 'L' 'W' 'W' 'W' 'L' 'L' 'W' 'W' 'L' 'L' 'W' 'W' 'W' 'L'
 'L' 'L' 'W' 'L' 'W' 'L' 'L' 'W' 'W' 'L' 'W' 'L' 'L' 'L' 'L' 'W' 'L' 'L'
 'L' 'L' 'L' 'W' 'L' 'W' 'W' 'W' 'W' 'L' 'W' 'W' 'W' 'L' 'L' 'L' 'W' 'W'
 'W' 'W' 'W' 'L' 'L' 'L' 'L' 'W' 'L' 'W' 'W' 'W' 'W' 'L' 'W' 'L' 'W' 'W'
 'L' 'W' 'W' 'L' 'W' 'W' 'L' 'L' 'L' 'L']
Confusion Matrix
 [[3 3]
 [2 3]]
              precision    recall  f1-s

Unnamed: 0,Game,Actual,Predicted
0,72,W,[L]
1,73,L,[W]
2,74,W,[W]
3,75,W,[L]
4,76,L,[W]
5,77,W,[W]
6,78,W,[W]
7,79,L,[L]
8,80,L,[W]
9,81,L,[L]
