## ============   Scikit Learn: MLP    ===============

In [1]:
use_one_hot_encoding = 1     #  makes sense with neural network

# accuracy:  train, valid, test ~ 0.5       ===== BEST

# IMP POINT: seems like only decision trees are good for categorical data; but why nn is not good at it?
#                                                                              even with one hot encoding?

need_label_encoding = ['team','host','opp','month', 'day_match']

### Loading Data

In [2]:
import pandas as pd
df = pd.read_csv('dataset_cricket_match/train.csv')
df = df.drop("Unnamed: 0", axis = 1)
df.head(5)
# fow: fall of wicket;   rpo: run per over or run rate

Unnamed: 0,team,opp,host,year,month,toss,day_match,bat_first,format,fow,score,rpo,result
0,australia,south_africa,sri_lanka,2012,sep,1,0,0,1,5,146,7.3,1
1,india,australia,india,2020,jan,0,0,1,0,6,340,6.8,1
2,canada,scotland,scotland,2009,jul,1,1,0,0,4,286,5.72,1
3,australia,england,australia,1987,jan,1,1,1,0,6,225,4.5,1
4,new_zealand,pakistan,uae,2009,nov,0,0,0,1,5,153,7.65,0


### Convert Categorical Data to One Hot Encoding

In [3]:
def convert_df_to_one_hot(df):
    for attr_name in need_label_encoding:
        unique_attr = df[attr_name].unique()
        for attr_val in unique_attr:
            new_attr_name = attr_name + "_" + str(attr_val)
            df[new_attr_name] = 0
            df.loc[df[attr_name] == attr_val, new_attr_name] = 1
        df = df.drop(attr_name, axis = 1) # drops columns
    return df
    
if use_one_hot_encoding == 1:
   df = convert_df_to_one_hot(df) 
df.head()

Unnamed: 0,year,toss,bat_first,format,fow,score,rpo,result,team_australia,team_india,...,month_mar,month_jun,month_oct,month_feb,month_apr,month_may,month_dec,day_match_0,day_match_1,day_match_2
0,2012,1,0,1,5,146,7.3,1,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2020,0,1,0,6,340,6.8,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2,2009,1,0,0,4,286,5.72,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1987,1,1,0,6,225,4.5,1,1,0,...,0,0,0,0,0,0,0,0,1,0
4,2009,0,0,1,5,153,7.65,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### STATS

In [4]:
print ("number of rows         : ", df.shape[0])
print ("number of features     : ", df.shape[1]-1)  # excluding the class labels
print ("number of class labels : ", 2)
#df.info()

number of rows         :  7827
number of features     :  79
number of class labels :  2


### Identify Attributes as Categorical or Continuous

In [5]:
attr_list = df.axes[1].tolist()
attr_list.remove('result')
cont_attr = ['fow', 'score', 'rpo']

### Convert to Numpy Arrays

In [6]:
import numpy as np
y_train = df['result'].to_numpy()
df      = df.drop("result", axis = 1)
x_train = df.to_numpy()
print ("data and features shape: ", x_train.shape)
print ("class lables shape     : ", y_train.shape)

data and features shape:  (7827, 79)
class lables shape     :  (7827,)


### MLP Classifier Train

In [7]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(random_state=1, max_iter=500, hidden_layer_sizes = [100,50,10,10,10,10]).fit(x_train, y_train)

# number of iter = 300 (epochs), relu, adam, learning rate = 0.001, batch size = 'auto', last activation: softmax, 
# hidden layer = [100], alpha = 0.0001
# setting random state ensures reproducibility

### Scikit Predict

In [8]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

train_df = pd.read_csv('dataset_cricket_match/train.csv')
valid_df = pd.read_csv('dataset_cricket_match/val.csv')
test_df  = pd.read_csv('dataset_cricket_match/test.csv')
df_list  = [train_df, valid_df, test_df] 
x_list   = []
y_list   = []

index = 0
for df in df_list:
    df = df.drop("Unnamed: 0", axis = 1)
    if use_one_hot_encoding == 1:           # converting categorical attributes to one hot encoded vector
        df = convert_df_to_one_hot(df) 
    print ("number of rows         : ", df.shape[0])
    
    y_data = df['result'].to_numpy()
    df     = df.drop("result", axis = 1)
    x_data = df.to_numpy()

    x_list.append(x_data)
    y_list.append(y_data)
    

number of rows         :  7827
number of rows         :  870
number of rows         :  967


In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
def getAccuracy(index, enable_prints):
    y_pred = clf.predict(x_list[index])
    y_pred = y_pred.tolist()
    y_true = y_list[index].tolist()
    classes = [0,1]
    if enable_prints == 1:
        print("PR Report         : \n", classification_report(y_true, y_pred, labels=classes, zero_division=0))
        print("Confusion Matrix  : \n", confusion_matrix(y_true, y_pred))
        #print("\nAccuracy        : ", accuracy_score(y_true, y_pred))
    return accuracy_score(y_true, y_pred)

df_list = [df]
for index in range(0, len(x_list)):
    print ("\naccuracy: ", getAccuracy(index,0))


accuracy:  0.5033857161108982

accuracy:  0.4735632183908046

accuracy:  0.4963805584281282
