In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import sklearn.metrics as skm
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv(r"IPL_2013.csv")
df

Unnamed: 0,Sl.NO.,PLAYER NAME,AGE,COUNTRY,TEAM,PLAYING ROLE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,...,SR-B,SIXERS,RUNS-C,WKTS,AVE-BL,ECON,SR-BL,AUCTION YEAR,BASE PRICE,SOLD PRICE
0,1,"Abdulla, YA",2,SA,KXIP,Allrounder,0,0,0,0.00,...,0.00,0,307,15,20.47,8.90,13.93,2009,50000,50000
1,2,Abdur Razzak,2,BAN,RCB,Bowler,214,18,657,71.41,...,0.00,0,29,0,0.00,14.50,0.00,2008,50000,50000
2,3,"Agarkar, AB",2,IND,KKR,Bowler,571,58,1269,80.62,...,121.01,5,1059,29,36.52,8.81,24.90,2008,200000,350000
3,4,"Ashwin, R",1,IND,CSK,Bowler,284,31,241,84.56,...,76.32,0,1125,49,22.96,6.23,22.14,2011,100000,850000
4,5,"Badrinath, S",2,IND,CSK,Batsman,63,0,79,45.93,...,120.71,28,0,0,0.00,0.00,0.00,2011,100000,800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,126,"Yadav, AS",2,IND,DC,Batsman,0,0,0,0.00,...,125.64,2,0,0,0.00,0.00,0.00,2010,50000,750000
126,127,Younis Khan,2,PAK,RR,Batsman,6398,7,6814,75.78,...,42.85,0,0,0,0.00,0.00,0.00,2008,225000,225000
127,128,Yuvraj Singh,2,IND,KXIP+,Batsman,1775,9,8051,87.58,...,131.88,67,569,23,24.74,7.02,21.13,2011,400000,1800000
128,129,Zaheer Khan,2,IND,MI+,Bowler,1114,288,790,73.55,...,91.67,1,1783,65,27.43,7.75,21.26,2008,200000,450000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   PLAYER NAME    130 non-null    object 
 2   AGE            130 non-null    int64  
 3   COUNTRY        130 non-null    object 
 4   TEAM           130 non-null    object 
 5   PLAYING ROLE   130 non-null    object 
 6   T-RUNS         130 non-null    int64  
 7   T-WKTS         130 non-null    int64  
 8   ODI-RUNS-S     130 non-null    int64  
 9   ODI-SR-B       130 non-null    float64
 10  ODI-WKTS       130 non-null    int64  
 11  ODI-SR-BL      130 non-null    float64
 12  CAPTAINCY EXP  130 non-null    int64  
 13  RUNS-S         130 non-null    int64  
 14  HS             130 non-null    int64  
 15  AVE            130 non-null    float64
 16  SR-B           130 non-null    float64
 17  SIXERS         130 non-null    int64  
 18  RUNS-C    

In [4]:
df.drop('PLAYER NAME',axis=1,inplace=True)

In [5]:
df['TEAM']=df['TEAM'].str.replace('+','')
df['TEAM']

0      KXIP
1       RCB
2       KKR
3       CSK
4       CSK
       ... 
125      DC
126      RR
127    KXIP
128      MI
129      DC
Name: TEAM, Length: 130, dtype: object

In [6]:
df['TEAM'].value_counts().sum

<bound method Series.sum of TEAM
RCB     21
CSK     19
KKR     17
DC      17
DD      16
RR      15
KXIP    12
MI      12
KXI      1
Name: count, dtype: int64>

In [7]:
df['PLAYING ROLE'].value_counts().sum

<bound method Series.sum of PLAYING ROLE
Bowler        44
Batsman       39
Allrounder    35
W. Keeper     12
Name: count, dtype: int64>

In [8]:
df['COUNTRY'].value_counts().sum

<bound method Series.sum of COUNTRY
IND    53
AUS    22
SA     16
SL     12
PAK     9
NZ      7
WI      6
ENG     3
BAN     1
ZIM     1
Name: count, dtype: int64>

In [9]:
categorical_columns=['COUNTRY','TEAM','PLAYING ROLE']

for i in categorical_columns:
 cat_df=pd.get_dummies(df[i],dtype=int,drop_first=True)
 df=pd.merge(left=df,right=cat_df,right_index=True,left_index=True)
 df.drop(columns=i,inplace=True)
df

Unnamed: 0,Sl.NO.,AGE,T-RUNS,T-WKTS,ODI-RUNS-S,ODI-SR-B,ODI-WKTS,ODI-SR-BL,CAPTAINCY EXP,RUNS-S,...,DD,KKR,KXI,KXIP,MI,RCB,RR,Batsman,Bowler,W. Keeper
0,1,2,0,0,0,0.00,0,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,2,214,18,657,71.41,185,37.6,0,0,...,0,0,0,0,0,1,0,0,1,0
2,3,2,571,58,1269,80.62,288,32.9,0,167,...,0,1,0,0,0,0,0,0,1,0
3,4,1,284,31,241,84.56,51,36.8,0,58,...,0,0,0,0,0,0,0,0,1,0
4,5,2,63,0,79,45.93,0,0.0,0,1317,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,126,2,0,0,0,0.00,0,0.0,0,49,...,0,0,0,0,0,0,0,1,0,0
126,127,2,6398,7,6814,75.78,3,86.6,1,3,...,0,0,0,0,0,0,1,1,0,0
127,128,2,1775,9,8051,87.58,109,44.3,1,1237,...,0,0,0,1,0,0,0,1,0,0
128,129,2,1114,288,790,73.55,278,35.4,0,99,...,0,0,0,0,1,0,0,0,1,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 42 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sl.NO.         130 non-null    int64  
 1   AGE            130 non-null    int64  
 2   T-RUNS         130 non-null    int64  
 3   T-WKTS         130 non-null    int64  
 4   ODI-RUNS-S     130 non-null    int64  
 5   ODI-SR-B       130 non-null    float64
 6   ODI-WKTS       130 non-null    int64  
 7   ODI-SR-BL      130 non-null    float64
 8   CAPTAINCY EXP  130 non-null    int64  
 9   RUNS-S         130 non-null    int64  
 10  HS             130 non-null    int64  
 11  AVE            130 non-null    float64
 12  SR-B           130 non-null    float64
 13  SIXERS         130 non-null    int64  
 14  RUNS-C         130 non-null    int64  
 15  WKTS           130 non-null    int64  
 16  AVE-BL         130 non-null    float64
 17  ECON           130 non-null    float64
 18  SR-BL     

In [11]:
df.columns

Index(['Sl.NO.', 'AGE', 'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B',
       'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B',
       'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR',
       'BASE PRICE', 'SOLD PRICE', 'BAN', 'ENG', 'IND', 'NZ', 'PAK', 'SA',
       'SL', 'WI', 'ZIM', 'DC', 'DD', 'KKR', 'KXI', 'KXIP', 'MI', 'RCB', 'RR',
       'Batsman', 'Bowler', 'W. Keeper'],
      dtype='object')

In [12]:
X_df=df.drop(['SOLD PRICE'],axis=1)
X_features=X_df.columns
X_features

Index(['Sl.NO.', 'AGE', 'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B',
       'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S', 'HS', 'AVE', 'SR-B',
       'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL', 'ECON', 'SR-BL', 'AUCTION YEAR',
       'BASE PRICE', 'BAN', 'ENG', 'IND', 'NZ', 'PAK', 'SA', 'SL', 'WI', 'ZIM',
       'DC', 'DD', 'KKR', 'KXI', 'KXIP', 'MI', 'RCB', 'RR', 'Batsman',
       'Bowler', 'W. Keeper'],
      dtype='object')

In [13]:
x=sm.add_constant(df[X_features])
y=df[['SOLD PRICE']]

In [14]:
# NORMALISING THE DATA VALUES
x_scaled=preprocessing.scale(x)
x_scaled=pd.DataFrame(x_scaled,columns=x.columns)
y_scaled=preprocessing.scale(y)
y_scaled=pd.DataFrame(y_scaled,columns=y.columns)

In [15]:
X_train,X_test,Y_train,Y_test=train_test_split(x,y,train_size=0.70)

In [16]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('Y_train',Y_train.shape)
print('Y_test',Y_test.shape)

X_train (91, 42)
X_test (39, 42)
Y_train (91, 1)
Y_test (39, 1)


# Linear Model

In [17]:
LR=LinearRegression()
LR.fit(X_train,Y_train)

In [20]:
ytest_pred=LR.predict(X_test)
skm.r2_score(y_true=Y_test,y_pred=ytest_pred)

-0.6439023422236512

In [19]:
y_pred=LR.predict(X_train)
skm.r2_score(y_true=Y_train,y_pred=y_pred)

0.6454097477222482

# Ridge Reg L2 Regularization


In [23]:
from sklearn.linear_model import RidgeCV
ridge_cv=RidgeCV(alphas=[0.01,0.1,10,1,25,5,0.5,15,50,0.051]).fit(X_train,Y_train)
print('The train r2 score for ridge model is {}'.format(ridge_cv.score(X_train,Y_train))),
print('The test r2 score for ridge model is {}'.format(ridge_cv.score(X_test,Y_test))) 

The train r2 score for ridge model is 0.6874677865714958
The test r2 score for ridge model is -0.5222408410354953


# Lasso Reg L1 Regularization


In [24]:
lasso_cv=LassoCV(alphas=[600,900,200,1100,0.1,10,1500,1800,2000,1700]).fit(X_train,Y_train)
print('The train score for lasso model is {}'.format(lasso_cv.score(X_train,Y_train)))
print('The test score for ridge model is {}'.format(lasso_cv.score(X_test,Y_test)))

The train score for lasso model is 0.7442235298299242
The test score for ridge model is -0.9312561758758338


# Elastic Net Reg 


In [25]:
enet_cv=ElasticNetCV(alphas=[600,1100,0.1,10,30,1,0,10,0.001,2000]).fit(X_train,Y_train)
print('The train score for enet model is {}'.format(enet_cv.score(X_train,Y_train)))
print('The test score for enet model is {}'.format(enet_cv.score(X_test,Y_test)))

The train score for enet model is 0.5859992662105487
The test score for enet model is -0.11652730016054713


### Getting coefficients of the features from each of the models and comparing

In [26]:
coefdf=pd.DataFrame()
coefdf['Columns']=x.columns
a=np.array(LR.coef_)
a.resize((42,1),refcheck=False)
coefdf['LR coefficient']=a
b=np.array(ridge_cv.coef_)
b.resize((42,1),refcheck=False)
coefdf['Ridge coefficient']=b
c=np.array(lasso_cv.coef_)
c.resize((42,1),refcheck=False)
coefdf['Lasso coefficient']=c
coefdf['Elastic Net coefficient']=enet_cv.coef_
coefdf

Unnamed: 0,Columns,LR coefficient,Ridge coefficient,Lasso coefficient,Elastic Net coefficient
0,const,-240520300000000.0,0.0,0.0,0.0
1,Sl.NO.,1914.144,2191.957571,2159.786528,954.496472
2,AGE,57924.12,-6125.770555,-4304.075698,-16.210571
3,T-RUNS,-62.07504,-17.527615,-16.444377,-42.870295
4,T-WKTS,-453.2174,-443.702725,-559.493055,-144.145608
5,ODI-RUNS-S,20.55964,3.000591,-11.113152,26.004757
6,ODI-SR-B,-717.1773,-1163.200135,-576.590926,-425.699171
7,ODI-WKTS,46.23093,385.043115,711.374564,-61.579857
8,ODI-SR-BL,-1789.719,-794.156464,-1064.323448,-23.150032
9,CAPTAINCY EXP,249842.4,20000.719323,180488.264468,11.090503
