In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Load the prepared dataset that every later cell slices from.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
prepared_csv_path = r'D:\Data Science\Projects\Cricket t20\Data\Prepared\prepared.csv'
delivery_df = pd.read_csv(prepared_csv_path)

In [None]:
# Keep only the model features plus the label column.
# The explicit .copy() detaches final_df from delivery_df, so the column
# assignments below write to an independent frame instead of a slice
# (avoids SettingWithCopyWarning and any ambiguity about what gets modified).
final_df = delivery_df[['batting_team','bowling_team','city','runs_left','balls_left','wickets','total_run_x','crr','rrr','result']].copy()

# Round both rate columns to two decimals.
final_df['rrr'] = final_df['rrr'].round(2)
final_df['crr'] = final_df['crr'].round(2)

In [None]:
# Independent copy of final_df; the feature matrix and the train/test split
# in the following cells are read from final_df2.
final_df2 = final_df.copy()

In [None]:
# Preview the first three rows of the modelling frame.
final_df2.head(3)

In [None]:
# Feature columns, in the order they are passed to the encoder and the model.
cols = [
    'batting_team',
    'bowling_team',
    'city',
    'runs_left',
    'balls_left',
    'wickets',
    'total_run_x',
    'crr',
    'rrr',
]

# Label column, kept as a one-element list.
target = ['result']

In [None]:
# Feature matrix over the full dataset (no train/test split applied);
# the encoding cell inspects it to decide which columns are categorical.
X = final_df2[cols]

In [None]:
# Chronological hold-out split: rows dated before SPLIT_DATE train the model,
# rows dated on/after it are used for evaluation.
# The masks are built from delivery_df and applied to final_df2, so both
# frames must share the same index.
SPLIT_DATE = '2022-08-01'
is_train = delivery_df['date'] < SPLIT_DATE
# Computed separately rather than as ~is_train: rows with a missing date fail
# both comparisons and therefore stay out of both splits, as before.
is_test = delivery_df['date'] >= SPLIT_DATE

# Single-step .loc[rows, columns] replaces the chained .loc[rows][columns];
# .copy() makes the feature frames independent objects because the encoding
# cell later assigns columns on X_train / X_test in place.
X_train = final_df2.loc[is_train, cols].copy()
X_test = final_df2.loc[is_test, cols].copy()

y_train = final_df2.loc[is_train, 'result']
y_test = final_df2.loc[is_test, 'result']


In [None]:
# Sanity check: report the shapes of the four split objects.
shape_report = 'Records in: \n X_train: {0} \n y_train: {1} \n X_test: {2} \n y_test: {3}'
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

## ENCODING
# Two encoders are instantiated; only ordinal_encoder_train_test is fitted in
# this cell. It maps categories unseen during fit to NaN, which is patched
# further down.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder_train_test = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

# A column is categorical when its non-null values have object dtype.
cat_vars = [c for c in X.columns if X.loc[X[c].notnull(), c].dtype == 'object']

# Missing categorical values become an explicit "UNKNOWN" level
# (this modifies X_train / X_test in place).
X_train[cat_vars] = X_train[cat_vars].fillna("UNKNOWN")
X_test[cat_vars] = X_test[cat_vars].fillna("UNKNOWN")

### Train test encoding
# Fit the mapping on the training rows only, then reuse it for the test rows.
X_train_encoded = X_train.copy()
X_train_encoded[cat_vars] = ordinal_encoder_train_test.fit_transform(X_train_encoded[cat_vars])

X_test_encoded = X_test.copy()
X_test_encoded[cat_vars] = ordinal_encoder_train_test.transform(X_test_encoded[cat_vars])

print('Filling missing values')
# Only categorical columns are patched: a NaN there is filled with
# len(categories), i.e. one past the largest code learnt from the training
# data. NaNs in non-categorical columns are left untouched.
nan_columns = set(X_test_encoded.columns[X_test_encoded.isna().any()].tolist())
if nan_columns:
    unseen_slots = [(pos, name) for pos, name in enumerate(cat_vars) if name in nan_columns]
    for pos, name in tqdm(unseen_slots):
        fallback_code = len(ordinal_encoder_train_test.categories_[pos])
        X_test_encoded[name] = X_test_encoded[name].fillna(fallback_code)

# Keep only the labels whose index is present in the encoded test features.
y_test = y_test.loc[y_test.index.isin(X_test_encoded.index)]


In [None]:
# Show one row of the (un-encoded) training features as a quick check.
X_train.head(1)

In [None]:
# Estimators to train, paired by position with their short display names.
# Only logistic regression is enabled; add an estimator and its name at the
# same position in both lists to train further models.
model_names = ['LR']
models = [LogisticRegression(solver='liblinear')]

In [None]:
# Fit every configured model on the training split, score it on the hold-out
# split, and collect one metrics row and one prediction column per model.
#
# Outputs used by later cells:
#   results   - one row per model: model_name, accuracy, precision, recall, f1_score
#   all_preds - one column of hold-out predictions per model (names must be unique)
#   model1    - the last fitted estimator
#   pred      - hold-out predictions of that last estimator
metric_rows = []
prediction_columns = {}

for model_name, estimator in zip(model_names, models):
    model1 = estimator.fit(X_train_encoded, y_train)
    pred = model1.predict(X_test_encoded)

    metric_rows.append({
        'model_name': model_name,
        'accuracy': metrics.accuracy_score(y_test, pred),
        'precision': metrics.precision_score(y_test, pred),
        # Plain recall: the previous np.sqrt(...) wrapper (an RMSE leftover)
        # inflated the reported value.
        'recall': metrics.recall_score(y_test, pred),
        # Built-in round() works whether the metric comes back as a Python
        # float or a NumPy scalar; .round() exists only on the latter.
        'f1_score': round(metrics.f1_score(y_test, pred), 2),
    })
    prediction_columns[model_name] = pred

# Build both frames once instead of growing them inside the loop
# (DataFrame.append was removed in pandas 2.0).
results = pd.DataFrame(
    metric_rows,
    columns=['model_name', 'accuracy', 'precision', 'recall', 'f1_score'],
)
all_preds = pd.DataFrame(prediction_columns)
    


In [None]:
# Display the per-model metrics table.
results

In [None]:

# File name for a pickled model; only the commented-out pickle.dump below
# refers to it (that line also needs trial_model, which is set in a later cell).
filename = 'LR_prob.sav'
#pickle.dump(model1, open('D:/Data Science/Projects/Cricket t20/Data/Analysis/Predictions/Models/Trial ' + str(trial_model) + filename, 'wb'))



In [None]:
# Experiment number: used in the "Trial <n>" output folder and in the names of
# the exported CSV files.
trial_model = 3

In [None]:
##### SAVE TRAINED MODEL AND RELATED FILES
import os
import pickle
import joblib

# Every artefact of one experiment is written into a single "Trial <n>" folder.
trail_path = r'D:\Data Science\Projects\Cricket t20\Data\Analysis\Predictions\Models/Trial {0}'.format(trial_model)

# Derived from trail_path so the folder literal is spelled out only once;
# the resulting strings are unchanged.
save_trained_column_names_path = trail_path + '/trained_column_names.npy'
save_class_names_path = trail_path + '/classes_names.npy'
save_classes_encoding_path = trail_path + '/classes_encoding.npy'

if os.path.exists(trail_path):
    # Never overwrite an existing trial; bump trial_model instead.
    print('File Exists!! Change File Name.')
else:
    os.makedirs(trail_path)

    # Persist each estimator under its own name. Previously every file was
    # written from model1 (the last fitted model), so with several models all
    # files held the same estimator. fit() is called on the entries of
    # `models` in the training cell, so these are the fitted objects.
    for m, fitted_model in zip(model_names, models):
        print(m)
        joblib.dump(fitted_model, trail_path + '/model_{0}.pkl'.format(m), compress=1)

    np.save(save_trained_column_names_path, cols)
    np.save(save_class_names_path, cat_vars)

    # categories_ holds one array per categorical column, usually of different
    # lengths. Pack them into an explicit 1-D object array: handing the ragged
    # list straight to np.save raises ValueError on NumPy >= 1.24.
    # Reading the file back requires np.load(..., allow_pickle=True).
    fitted_categories = ordinal_encoder_train_test.categories_
    classes_encoding = np.empty(len(fitted_categories), dtype=object)
    for position, column_categories in enumerate(fitted_categories):
        classes_encoding[position] = column_categories
    np.save(save_classes_encoding_path, classes_encoding)

In [None]:
# get importance
# Takes the first row of the fitted model's coefficient matrix.
# NOTE(review): the features are neither scaled nor one-hot encoded, so the raw
# coefficients are not directly comparable as importances -- confirm intent.
importance = model1.coef_[0]

In [None]:
# Labels for the coefficient table; must match the training column order
# (same names and order as `cols`).
features = [
    'batting_team',
    'bowling_team',
    'city',
    'runs_left',
    'balls_left',
    'wickets',
    'total_run_x',
    'crr',
    'rrr',
]

In [None]:
# One row per feature with its coefficient scaled by 100, largest first.
var_imp_df = (
    pd.DataFrame({'var_name': features, 'importance': importance * 100})
    .sort_values(by='importance', ascending=False)
)

var_imp_df.head(10)

In [None]:
# Show the metrics table again next to the coefficient table.
results

In [None]:
# Hold-out rows (dated on/after 2022-08-01) from both frames. reset_index()
# keeps the original row label as an 'index' column and renumbers from 0, so
# the rows line up positionally with the hold-out predictions.
holdout_mask = delivery_df['date'] >= '2022-08-01'
test_all = delivery_df.loc[holdout_mask].reset_index()
test = final_df2.loc[holdout_mask].reset_index()

In [None]:
# Spot check: show one hold-out row followed by its predicted class probabilities.
num = 919  # arbitrary position within the hold-out set
sample_row = test.iloc[num]
print(sample_row)

print('################################################################ \n')

sample_probabilities = model1.predict_proba(X_test_encoded)[num]
print(sample_probabilities)

In [None]:
# Class probabilities for every hold-out row.
# NOTE(review): the column labels assume the first probability column is
# "bowling team wins" and the second "batting team wins" -- confirm against
# model1.classes_.
pred_prob = model1.predict_proba(X_test_encoded)
probability_columns = ['bowling_team_pred', 'batting_team_pred']
pred_prob_df = pd.DataFrame(data=pred_prob, columns=probability_columns)

In [None]:
# Attach the hard predictions and class probabilities to the hold-out rows.
# All three frames carry a fresh 0..n-1 index, so concat aligns them by position.
#
# The predictions frame no longer goes through reset_index(): that added a
# second 'index' column and left test_all with a duplicated column label.
predictions = pd.DataFrame({'predictions': pred})

# Drop any columns appended by a previous run of this cell first, so
# re-running it replaces them instead of duplicating them.
appended_columns = list(predictions.columns) + list(pred_prob_df.columns)
test_all = pd.concat(
    [test_all.drop(columns=appended_columns, errors='ignore'), predictions, pred_prob_df],
    axis=1,
)

In [None]:
# Copy the full hold-out frame to the system clipboard (e.g. to paste into a spreadsheet).
# NOTE(review): side effect outside the notebook; needs a clipboard backend, so it may fail on headless machines.
test_all.to_clipboard()

In [None]:
# Preview the hold-out frame with the prediction columns attached.
test_all.head(2)

In [None]:
# Number of distinct mergeid values in the hold-out set
# (presumably one per match -- confirm against the data preparation step).
distinct_mergeids = test_all['mergeid'].unique()
len(distinct_mergeids)

In [None]:
#accuracy of last 5 overs
# Keep the final 30 rows of every mergeid in a single vectorised pass instead
# of concatenating onto a growing frame inside a Python loop (quadratic copying).
# sort=False keeps the groups in order of first appearance; rows keep their
# original order within test_all.
# NOTE(review): 30 rows equal 5 overs only if each over has exactly 6 rows -- confirm.
# NOTE(review): this rebinds final_df, which earlier held the modelling frame.
final_df = test_all.groupby('mergeid', sort=False).tail(30)

metrics.accuracy_score(final_df['result'], final_df['predictions'])

In [None]:
# Accuracy of each match: one printed line per mergeid, computed over all of
# that mergeid's hold-out rows.
final_df = pd.DataFrame()

for match_id in test_all['mergeid'].unique():
    match_rows = test_all.loc[test_all['mergeid'].eq(match_id)]
    batting_side = match_rows['batting_team'].unique()[0]
    bowling_side = match_rows['bowling_team'].unique()[0]
    match_accuracy = metrics.accuracy_score(match_rows['result'], match_rows['predictions'])
    print(batting_side, ' vs ', bowling_side, ": ", match_accuracy)

    


In [None]:
# Accuracy of each match, restricted to the last 30 hold-out rows of each mergeid.
final_df = pd.DataFrame()

for match_id in test_all['mergeid'].unique():
    closing_rows = test_all.loc[test_all['mergeid'].eq(match_id)].tail(30)
    batting_side = closing_rows['batting_team'].unique()[0]
    bowling_side = closing_rows['bowling_team'].unique()[0]
    closing_accuracy = metrics.accuracy_score(closing_rows['result'], closing_rows['predictions'])
    print(batting_side, ' vs ', bowling_side, ": ", closing_accuracy)

    


In [None]:
# Overall accuracy across every hold-out row.
#test_all[['result','predictions']]
metrics.accuracy_score(test_all['result'], test_all['predictions'])

In [None]:
# Tag the metrics table with the experiment number and a free-text note
# before it is exported.
# NOTE(review): the summary text is hard-coded -- update it for each trial.
results['Trial_number'] = trial_model
results['summary'] = 'columns changed'



In [None]:
# Export the metrics table, the scored hold-out rows and the coefficient table,
# each into its own sub-folder and suffixed with the trial number.
# NOTE(review): absolute, machine-specific output location.
predictions_root = r'D:\Data Science\Projects\Cricket t20\Data\Analysis\Predictions'
trial_suffix = str(trial_model) + '.csv'

results.to_csv(predictions_root + r'\summary\summary_' + trial_suffix, index=False)
test_all.to_csv(predictions_root + r'\Test_set\pred1_' + trial_suffix, index=False)
var_imp_df.to_csv(predictions_root + r'\Var_imp\varimp_' + trial_suffix, index=False)