In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

In [3]:
import pickle

In [4]:
# Load the prepared dataset; the first CSV column is used as the index.
final_df = pd.read_csv(
    filepath_or_buffer='final.csv',
    index_col=[0],
)

In [5]:
# Shuffle every row (frac=1 draws all rows without replacement).
# A fixed seed keeps the row order repeatable across kernel restarts; without
# it, the seeded train/test split later in the notebook would still select
# different rows on every run, so the reported metrics could not be reproduced.
final_df = final_df.sample(frac=1, random_state=42)

In [6]:
# Keep only the cities that occur in more than 600 rows.
eligible_cities = (
    final_df['city']
    .value_counts()
    .loc[lambda counts: counts > 600]
    .index
    .tolist()
)

final_df = final_df.loc[final_df['city'].isin(eligible_cities)]

In [7]:
# The target is the 'runs_y' column; the features are every remaining column
# except 'match_id' and 'runs_x'.
y = final_df['runs_y']
X = final_df.drop(['match_id', 'runs_x', 'runs_y'], axis=1)

# Hold out 20% of the rows for evaluation, using a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [46]:
# Candidate hyperparameter values for the search. The 'step3__' prefix
# addresses the pipeline step registered under the name 'step3'.
parameters = dict(
    step3__max_depth=[14, 15, 16],
    step3__n_estimators=[1400, 1500, 1600],
    step3__learning_rate=[0.09, 0.1, 0.11],
)

In [26]:
# One-hot encode the team and city columns (dropping the first level of each)
# and pass every other column through unchanged.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name entirely (TypeError). Try the
# current spelling first and fall back only on releases that predate it.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:  # scikit-learn < 1.2
    encoder = OneHotEncoder(sparse=False, drop='first')

trf = ColumnTransformer(
    [('trf', encoder, ['batting_team', 'bowling_team', 'city'])],
    remainder='passthrough',
)

In [54]:
# Modelling pipeline: column transformer -> standard scaling -> XGBoost regressor.
# NOTE(review): learning_rate here is 0.1, while the search output recorded in
# this notebook reported 0.11 -- confirm which value is intended.
pipe = Pipeline(
    steps=[
        ('step1', trf),
        ('step2', StandardScaler()),
        (
            'step3',
            XGBRegressor(
                n_estimators=1600,
                learning_rate=0.1,
                max_depth=16,
                random_state=42,
            ),
        ),
    ]
)

In [47]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyperparameter search: 10 sampled candidates from `parameters`,
# each evaluated with 5-fold cross-validation.
#
# The estimator is a regressor, so candidates are ranked by R^2. The previous
# scoring='accuracy' is a classification metric and cannot meaningfully score
# real-valued predictions, which made the candidate ranking meaningless.
# random_state fixes which candidates are sampled so the search is repeatable.
#
# NOTE(review): the name `random` shadows the stdlib module of the same name;
# it is kept here because later cells reference it.
random = RandomizedSearchCV(
    pipe,
    param_distributions=parameters,
    n_iter=10,
    scoring='r2',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

In [48]:
# Run the cross-validated search on the training split only.
random.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [49]:
# Display the hyperparameter combination the fitted search ranked best.
random.best_params_

{'step3__n_estimators': 1600,
 'step3__max_depth': 16,
 'step3__learning_rate': 0.11}

In [55]:
# Fit the pipeline on the training split and score it on the held-out split.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# First line printed: R^2. Second line printed: mean absolute error.
print(
    r2_score(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    sep='\n',
)



0.9589984896868727
4.354025750489678


In [27]:
# Persist the pipeline object to model.pkl. The context manager guarantees the
# file is flushed and closed even if pickling raises, instead of leaving the
# handle open until garbage collection.
# Only unpickle model.pkl from a trusted source: loading a pickle can execute
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)