In [2]:
import pandas as pd
import numpy as np
import importlib
import warnings
import zipfile

import make_data

importlib.reload(make_data)
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [3]:
zf = zipfile.ZipFile('../Data/all_season.zip')

bat = pd.read_csv(zf.open('all_season/all_season_batting_card.csv'))
bowl = pd.read_csv(zf.open("all_season/all_season_bowling_card.csv"))
full = pd.read_csv(zf.open("all_season/all_season_details.csv"))

# bat = pd.read_csv("../Data/all_season_batting_card.csv")
# bowl = pd.read_csv("../Data/all_season_bowling_card.csv")
# full = pd.read_csv("../Data/all_season_details.csv")

In [4]:
df = make_data.make_data(bat, bowl, full)
df = df.dropna()
df.head()

Unnamed: 0,season,match_id,batsman1_name,bowler1_name,home_team,away_team,current_innings,runs,ball,venue,...,avg_noballs,total_overs,total_maidens,total_conceded,total_wickets,total_dots,total_fours_c,total_sixes_c,total_wides,total_noballs
0,2008,335982,Ashley Noffke,Ajit Agarkar,RCB,KKR,RCB,2,6,"M.Chinnaswamy Stadium, Bengaluru",...,0.047619,130.2,0,1151,29,273,113,41,36,2
16,2008,335982,Balachandra Akhil,Ajit Agarkar,RCB,KKR,RCB,0,2,"M.Chinnaswamy Stadium, Bengaluru",...,0.047619,130.2,0,1151,29,273,113,41,36,2
32,2008,335982,Cameron White,Ajit Agarkar,RCB,KKR,RCB,3,6,"M.Chinnaswamy Stadium, Bengaluru",...,0.047619,130.2,0,1151,29,273,113,41,36,2
48,2011,501223,David Hussey,Ajit Agarkar,DC,KXIP,KXIP,4,2,"Arun Jaitley Stadium, Delhi",...,0.047619,130.2,0,1151,29,273,113,41,36,2
62,2008,335982,Jacques Kallis,Ajit Agarkar,RCB,KKR,RCB,7,4,"M.Chinnaswamy Stadium, Bengaluru",...,0.047619,130.2,0,1151,29,273,113,41,36,2


In [5]:
x = df.drop(columns = ["runs"])
y = df["runs"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [6]:
x_train.shape, x_test.shape

((29920, 40), (12824, 40))

In [7]:
categorical = x.dtypes[x.dtypes == "object"].index.tolist()
numerical = x.dtypes[x.dtypes != "object"].index.tolist()

In [8]:
num_pl = StandardScaler()
cat_pl = OneHotEncoder(handle_unknown = "ignore")

col_tr = ColumnTransformer([("categorical", cat_pl, categorical),
                            ("numerical", num_pl, numerical)])

x_train_tr = col_tr.fit_transform(x_train)
x_test_tr = col_tr.transform(x_test)

In [9]:
x_train_tr.shape, x_test_tr.shape

((29920, 2117), (12824, 2117))

In [9]:
lin_reg = LinearRegression()
lin_reg = lin_reg.fit(x_train_tr, y_train)

dtr = DecisionTreeRegressor()
dtr = dtr.fit(x_train_tr, y_train)

svr = SVR()
svr = svr.fit(x_train_tr, y_train)

In [None]:
lr_params = {"fit_intercept": [True, False], "normalize": [True, False]}
dtr_params = {
    "max_depth": [2, 3, 4, 5, 10, 20],
    "min_samples_split": [2, 3, 5, 10, 20],
    "min_samples_leaf": [2, 3, 5, 10, 20],
}

svr_params = {
    'kernel' : ('linear', 'poly', 'sigmoid'),
    'degree' : [3,8],
    'gamma' : ('auto','scale')}

opt_lr = GridSearchCV(LinearRegression(), lr_params)
opt_lr = opt_lr.fit(x_train_tr, y_train)

opt_dtr = GridSearchCV(DecisionTreeRegressor(), dtr_params)
opt_dtr = opt_dtr.fit(x_train_tr, y_train)

opt_svr = GridSearchCV(SVR(), svr_params, verbose = 2)
opt_svr = opt_svr.fit(x_train_tr, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ....coef0=0.01, degree=3, gamma=auto, kernel=linear; total time= 2.7min
[CV] END ....coef0=0.01, degree=3, gamma=auto, kernel=linear; total time= 2.8min
[CV] END ....coef0=0.01, degree=3, gamma=auto, kernel=linear; total time= 2.6min
[CV] END ....coef0=0.01, degree=3, gamma=auto, kernel=linear; total time= 2.7min
[CV] END ....coef0=0.01, degree=3, gamma=auto, kernel=linear; total time= 2.6min
[CV] END ......coef0=0.01, degree=3, gamma=auto, kernel=poly; total time= 1.1min
[CV] END ......coef0=0.01, degree=3, gamma=auto, kernel=poly; total time= 1.2min
[CV] END ......coef0=0.01, degree=3, gamma=auto, kernel=poly; total time= 1.2min
[CV] END ......coef0=0.01, degree=3, gamma=auto, kernel=poly; total time= 1.1min
[CV] END ......coef0=0.01, degree=3, gamma=auto, kernel=poly; total time= 1.2min
[CV] END ...coef0=0.01, degree=3, gamma=auto, kernel=sigmoid; total time= 1.1min
[CV] END ...coef0=0.01, degree=3, gamma=auto, k

In [11]:
opt_svr = SVR(kernel = "linear", gamma = "scale")
opt_svr = opt_svr.fit(x_train_tr, y_train)
opt_svr.score(x_test_tr, y_test)

0.5647152158220522

In [12]:
lr_pred = opt_lr.predict(x_test_tr)
dtr_pred = opt_dtr.predict(x_test_tr)
svr_pred = opt_svr.predict(x_test_tr)

data = {
    "mean_sq": [
        mean_squared_error(y_test, lin_reg.predict(x_test_tr)),
        mean_squared_error(y_test, dtr.predict(x_test_tr)),
        mean_squared_error(y_test, svr.predict(x_test_tr))],
    "r2": [
        lin_reg.score(x_test_tr, y_test),
        dtr.score(x_test_tr, y_test),
        svr.score(x_test_tr, y_test)],
    "opt_mean_sq": [
        mean_squared_error(y_test, lr_pred),
        mean_squared_error(y_test, dtr_pred),
        mean_squared_error(y_test, svr_pred)],
    "opt_r2": [
        opt_lr.score(x_test_tr, y_test),
        opt_dtr.score(x_test_tr, y_test),
        opt_svr.score(x_test_tr, y_test)]
    
    }

evaluate_df = pd.DataFrame(data, index = ["linear regression", "decision tree", "svr"])

NameError: name 'opt_lr' is not defined

In [None]:
evaluate_df