In [3]:
import pandas as pd
import numpy as np


In [4]:
df = pd.read_csv("cleaned.csv")

# Categorise Columns

In [5]:
num_cols = list(df.select_dtypes(include=['int', 'float']).columns)
cat_cols = list(df.select_dtypes(exclude=['int', 'float']).columns)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Custom IQR Removal Transformer

In [7]:
class Outlier_Remover(BaseEstimator, TransformerMixin):
    
    def __init__(self, action='keep'): 
        self.action = action
        
    def fit(self, X, y=None):
        self.median_ = np.median(X, axis=0)
        return self
    
    def transform(self, X):
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0) 
        
        IQR = Q3 - Q1

        lower = Q1 - 1.5*IQR
        upper = Q3 + 1.5*IQR
        
        outlier_mask = (X < lower) | (X > upper)
        
        if self.action == 'drop':
            return X[~outlier_mask]
        else:
            for i in range(X.shape[1]):
                X[:, i][outlier_mask[:, i]] = self.median_[i]
            return X

# Define Pipeline's for both type of columns

In [9]:
cat_preprocessor = Pipeline(steps=[
    ("cat_null_handler",SimpleImputer(missing_values=np.nan,strategy="most_frequent")),
    ("cat_enocder",OrdinalEncoder()),
])


num_preprocessor = Pipeline(steps=[
    ("num_null_handler",SimpleImputer(missing_values=np.nan,strategy='median')),
    ("num_outlier_remover",Outlier_Remover(action="keep")),
    ("num_scaler",StandardScaler()),
])


# Preprocess data with Pipeline

In [10]:
df[num_cols] = num_preprocessor.fit_transform(df[num_cols])
df[cat_cols] = cat_preprocessor.fit_transform(df[cat_cols])
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,venue_x,innings,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,...,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,umpire1,umpire2,umpire3,ball_number
0,-1.730897,1.415477,23.0,-0.966513,4.0,0.0,617.0,554.0,124.0,-1.121212,...,0.0,1.0,0.0,-0.605949,0.524566,61.0,37.0,42.0,33.0,-1.68436
1,-1.730883,1.415477,23.0,-0.966513,4.0,0.0,617.0,554.0,124.0,-1.121212,...,0.0,1.0,0.0,-0.605949,0.524566,61.0,37.0,42.0,33.0,-1.655015
2,-1.730869,1.415477,23.0,-0.966513,4.0,0.0,617.0,554.0,124.0,0.570349,...,0.0,1.0,0.0,-0.605949,0.524566,61.0,37.0,42.0,33.0,-1.62567
3,-1.730855,1.415477,23.0,-0.966513,4.0,0.0,561.0,608.0,124.0,0.570349,...,0.0,1.0,0.0,-0.605949,0.524566,61.0,37.0,42.0,33.0,-1.596324
4,-1.73084,1.415477,23.0,-0.966513,4.0,0.0,617.0,554.0,124.0,0.570349,...,0.0,1.0,0.0,-0.605949,0.524566,61.0,37.0,42.0,33.0,-1.566979


# Train Test Split

In [11]:
X = df.drop(columns=["winner"])
y = df.winner

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Define the input shape
input_shape = X_train.shape[1]

# Create the model
model = Sequential()

# Add the input layer
model.add(Dense(512, activation='relu', input_shape=(input_shape,)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Add hidden layers
model.add(Dense(1024, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(2048, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Add the output layer
output_units = len(np.unique(y_train))
model.add(Dense(output_units, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stop])

2024-03-20 10:29:07.257098: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-20 10:29:07.261897: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/100
