## Import Packages

In [1]:
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler

## Set file paths for train and predict datasets

In [2]:
# CSV locations: the labelled training data and the rows to predict on
train_dataset, predict_dataset = "Dataset/Train.csv", "Dataset/Test.csv"

## Data Preprocessing

### Read train and predict datasets

In [3]:
# Load both CSV files, then report the (rows, columns) shape of each frame
train_df, predict_df = (pd.read_csv(csv_path) for csv_path in (train_dataset, predict_dataset))
for shape_message, frame in (("train_df: {}", train_df), ("predict_df: {}", predict_df)):
    print(shape_message.format(frame.shape))

train_df: (2508, 10)
predict_df: (1075, 9)


### Extract "MatchWinner" field from train_df into NumPy array

In [4]:
# Pop the label column out of train_df (this removes it from the frame in place)
# and keep an independent copy of it as an (n, 1) column vector.
train_y = train_df.pop('MatchWinner').to_numpy(copy=True).reshape(-1, 1)
print("train_y: {}".format(train_y.shape))

train_y: (2508, 1)


### Combine train and predict dataframes

In [5]:
# Stack the train rows and the predict rows into one frame so that both receive
# exactly the same feature encoding. `DataFrame.append` was removed in pandas 2.0;
# `pd.concat` is the supported equivalent and gives the same result here
# (train rows first, then predict rows, with a fresh 0..n-1 index).
combined_df = pd.concat([train_df, predict_df], sort=False, ignore_index=True)
combined_df.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch
0,5,4,37,4,Home,Away,Second,First,Dec
1,1,14,84,7,Neutral,Neutral,First,Second,Sep
2,9,15,47,9,Home,Away,First,Second,Feb
3,7,2,102,6,Home,Away,First,Second,Aug
4,6,8,46,5,Home,Away,First,Second,Aug


### Feature Engineering

In [6]:
def _venue_code(venue):
    """Encode a venue label: 'Home' -> 1, 'Away' -> -1, anything else -> 0."""
    if venue == 'Home':
        return 1
    if venue == 'Away':
        return -1
    return 0


def _batted_first(innings):
    """Return 1 when the innings label is 'First', otherwise 0."""
    return int(innings == 'First')


# Calendar number of each three-letter month abbreviation
month_numbers = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

for venue_column in ('Team1_Venue', 'Team2_Venue'):
    combined_df[venue_column] = combined_df[venue_column].apply(_venue_code)

for innings_column in ('Team1_Innings', 'Team2_Innings'):
    combined_df[innings_column] = combined_df[innings_column].apply(_batted_first)

# Plain indexing (not .get) so that an unexpected month label raises KeyError
combined_df['MonthOfMatch'] = combined_df['MonthOfMatch'].apply(lambda month: month_numbers[month])

In [7]:
# Preview the first five rows after the categorical columns were encoded as numbers
combined_df.head(5)

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch
0,5,4,37,4,1,-1,0,1,12
1,1,14,84,7,0,0,1,0,9
2,9,15,47,9,1,-1,1,0,2
3,7,2,102,6,1,-1,1,0,8
4,6,8,46,5,1,-1,1,0,8


### Create train and predict NumPy arrays

In [8]:
# Segregate combined_df back into train/predict arrays.
# The training rows were stacked first, so the boundary is simply the number of
# training rows; deriving it from train_df (instead of hard-coding 2508) keeps the
# split correct if the input files change size.
n_train_rows = len(train_df)
train_x = combined_df.iloc[:n_train_rows].values
predict_x = combined_df.iloc[n_train_rows:].values

print("train_x: {}".format(train_x.shape))
print("predict_x: {}".format(predict_x.shape))

train_x: (2508, 9)
predict_x: (1075, 9)


In [9]:
# Min-max scale every feature column. The per-column ranges are learned from the
# training rows only and then re-used unchanged for the prediction rows.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
predict_x = scaler.transform(predict_x)

In [10]:
# Keep independent copies of the complete (not yet split) training arrays
train_x_full, train_y_full = train_x.copy(), train_y.copy()

### Split training data into train/test datasets

In [11]:
# Hold out 7.5% of the labelled rows as a stratified test set.
# The split is always taken from the untouched train_x_full / train_y_full copies,
# so re-running this cell reproduces the same partition instead of re-splitting an
# already reduced train_x / train_y (which would silently shrink the training set).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.075, random_state=1)
train_index, test_index = next(sss.split(train_x_full, train_y_full))
train_x, test_x = train_x_full[train_index], train_x_full[test_index]
train_y, test_y = train_y_full[train_index], train_y_full[test_index]

print("------------------------- Training Dataset -------------------------")
print("train_x shape: {}".format(train_x.shape))
print("train_y shape: {}".format(train_y.shape))

print("\n------------------------- Test Dataset -------------------------")
print("test_x shape: {}".format(test_x.shape))
print("test_y shape: {}".format(test_y.shape))

------------------------- Training Dataset -------------------------
train_x shape: (2319, 9)
train_y shape: (2319, 1)

------------------------- Test Dataset -------------------------
test_x shape: (189, 9)
test_y shape: (189, 1)


In [12]:
# Freeze independent copies of every array under the names used by the modelling cells
Xtrain_full = train_x_full.copy()
Ytrain_full = train_y_full.copy()
Xtrain = train_x.copy()
Ytrain = train_y.copy()
Xtest = test_x.copy()
Ytest = test_y.copy()
Xpredict = predict_x.copy()

# (banner, ((line format, array), ...)) for each section of the shape report
shape_report = (
    ("------------------------- Training Dataset -------------------------", (
        ("Xtrain_full shape: {}", Xtrain_full),
        ("Ytrain_full shape: {}", Ytrain_full),
        ("Xtrain shape: {}", Xtrain),
        ("Ytrain shape: {}", Ytrain),
    )),
    ("\n------------------------- Test Dataset -------------------------", (
        ("Xtest shape: {}", Xtest),
        ("Ytest shape: {}", Ytest),
    )),
    ("\n------------------------- Prediction Dataset -------------------------", (
        ("Xpredict shape: {}", Xpredict),
    )),
)
for banner, entries in shape_report:
    print(banner)
    for line_format, array in entries:
        print(line_format.format(array.shape))

------------------------- Training Dataset -------------------------
Xtrain_full shape: (2508, 9)
Ytrain_full shape: (2508, 1)
Xtrain shape: (2319, 9)
Ytrain shape: (2319, 1)

------------------------- Test Dataset -------------------------
Xtest shape: (189, 9)
Ytest shape: (189, 1)

------------------------- Prediction Dataset -------------------------
Xpredict shape: (1075, 9)


## Hyperparameter search using Optuna

In [13]:
# Convert Ytest to one-hot encoding.
# get_dummies only creates a column for labels that actually occur in Ytest, so the
# result is re-indexed to all 16 classes; otherwise a class that is missing from the
# small hold-out set would leave test_y narrower than the 16-column probability
# matrix it is scored against with log_loss.
# NOTE(review): assumes the labels are the integer class ids 0..15 (consistent with
# num_class = 16 used for the LightGBM models) - confirm against the data.
df = pd.DataFrame(Ytest, columns=["target"])
test_y = (
    pd.get_dummies(df['target'])
    .reindex(columns=range(16), fill_value=0)
    .values
    .astype(int)
)

In [14]:
# Stratified 5-fold splitter (shuffled, fixed seed) used by the tuning objective
kfold = StratifiedKFold(
    n_splits=5,
    random_state=10,
    shuffle=True,
)

In [15]:
def objective(trial):
    """Optuna objective: hold-out log-loss of one sampled LightGBM configuration.

    One booster is trained per `kfold` split of (Xtrain, Ytrain), with early
    stopping monitored on that split's validation part. Every booster predicts
    class probabilities for Xtest; the predictions are averaged over the folds
    and scored against `test_y` with `log_loss`.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The log-loss of the fold-averaged Xtest predictions.

    Note:
        Reads the notebook-level names kfold, Xtrain, Ytrain, Xtest and test_y.
        Because the returned score is measured on Xtest, the search is tuned
        against that hold-out set.
    """
    params = {
        # Fixed settings
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 16,
        "verbosity": -1,
        "boosting_type": "gbdt",
        "is_unbalance": True,
        # Search space (sampled per trial, in this order)
        "max_bin": trial.suggest_int("max_bin", 256, 512),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 1e-1),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-4, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 35, 200),
        "max_depth": trial.suggest_int("max_depth", 8, 25),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 5, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 20),
    }

    fold_predictions = []
    for fit_idx, val_idx in kfold.split(Xtrain, Ytrain):
        fit_set = lgb.Dataset(Xtrain[fit_idx], label=Ytrain[fit_idx].ravel())
        val_set = lgb.Dataset(Xtrain[val_idx], label=Ytrain[val_idx].ravel())

        booster = lgb.train(
            params,
            fit_set,
            valid_sets=[val_set],
            num_boost_round=5000,
            early_stopping_rounds=200,
            verbose_eval=False,
        )
        # Predict with the iteration that scored best on this fold's validation part
        fold_predictions.append(
            booster.predict(Xtest, num_iteration=booster.best_iteration)
        )

    # Plain left-to-right sum, then one division: the mean over the folds
    mean_prediction = sum(fold_predictions) / float(len(fold_predictions))
    return log_loss(test_y, mean_prediction)

In [16]:
# Run the hyperparameter search; "minimize" is Optuna's default direction, spelled
# out here because the objective returns a loss.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=500)

[I 2020-06-20 00:30:48,141] Finished trial#0 with value: 0.8880055772335186 with parameters: {'max_bin': 304, 'learning_rate': 0.020493613738234266, 'lambda_l2': 0.02111274952348149, 'num_leaves': 50, 'max_depth': 11, 'feature_fraction': 0.7189085574832109, 'bagging_fraction': 0.9387269619942173, 'bagging_freq': 16, 'min_child_samples': 7}. Best is trial#0 with value: 0.8880055772335186.
[I 2020-06-20 00:31:14,613] Finished trial#1 with value: 0.8125526870780237 with parameters: {'max_bin': 445, 'learning_rate': 0.02917560784435104, 'lambda_l2': 0.2689026873865235, 'num_leaves': 105, 'max_depth': 8, 'feature_fraction': 0.7317006255850408, 'bagging_fraction': 0.6075296725596281, 'bagging_freq': 20, 'min_child_samples': 20}. Best is trial#1 with value: 0.8125526870780237.
[I 2020-06-20 00:32:19,083] Finished trial#2 with value: 0.8152039384475309 with parameters: {'max_bin': 485, 'learning_rate': 0.010200631892092277, 'lambda_l2': 0.380641839757444, 'num_leaves': 132, 'max_depth': 24, 'f

KeyboardInterrupt: 

In [None]:
# Summarise the search: number of trials, then the best trial's score and parameters
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("Value: {}".format(trial.value))

print("Params: ")
best_params = trial.params
for param_name in best_params:
    print(" {}: {}".format(param_name, best_params[param_name]))

## Build the model

In [None]:
# Define model hyperparameters (passed unchanged to every lgb.train call below)
params = {
    "objective": 'multiclass',
    "metric": 'multi_logloss',
    "num_class": 16,
    "is_unbalance": True,
    "boosting": 'gbdt',
    "max_bin": 396,
    "learning_rate": 0.017,
    "lambda_l2": 0.053,
    "num_leaves": 51,
    "max_depth": 12,
    "feature_fraction": 0.95,
    "bagging_fraction": 0.94,
    "bagging_freq": 17,
    "min_data_in_leaf": 1,
    "verbosity": -1,
}

# Number of boosting rounds handed to lgb.train
num_rounds = 5000

In [None]:
# Reset the prediction total to 0 and build the stratified 5-fold splitter
# (shuffled, fixed seed) for the hold-out evaluation run
y_pred = 0
kfold = StratifiedKFold(
    n_splits=5,
    random_state=10,
    shuffle=True,
)

In [None]:
# Train the model using K-fold: one booster per fold, each predicting the hold-out set.
# The fold predictions are gathered in a list created inside this cell and averaged at
# the end, so re-running the cell never adds onto a y_pred left over from a previous run.
fold_predictions = []

for fit_idx, val_idx in kfold.split(Xtrain, Ytrain):
    # The fold subsets are used inline so the notebook-level train_x / train_y
    # arrays are not overwritten by the loop.
    lgtrain = lgb.Dataset(Xtrain[fit_idx], label=Ytrain[fit_idx].ravel())
    lgvalidation = lgb.Dataset(Xtrain[val_idx], label=Ytrain[val_idx].ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgvalidation], early_stopping_rounds=200, verbose_eval=1000)
    # Predict with the iteration that scored best on this fold's validation part
    pred = model.predict(Xtest, num_iteration=model.best_iteration)

    fold_predictions.append(pred)

# Mean of the per-fold class-probability matrices
counter = len(fold_predictions)
y_pred = sum(fold_predictions) / float(counter)

## Validate the model

#### Print log_loss and accuracy

In [None]:
# Score the fold-averaged hold-out predictions
loss = log_loss(test_y, y_pred)

# Most probable class per row, shaped (n, 1)
y_pred_label = np.argmax(y_pred, axis=1).reshape(-1, 1)
acc_score = accuracy_score(Ytest, y_pred_label)

print('Overall log_loss:', loss)
print('Overall accuracy:', acc_score)

#### Print classification report

In [None]:
# Per-class precision / recall / F1 on the hold-out set.
# - `labels` are the integer class ids 0..15: y_pred_label comes from np.argmax and is
#   therefore integer-valued, so string labels such as '0' could never match it.
# - classification_report returns a pre-formatted multi-line string, so it is printed
#   instead of being displayed as a raw repr full of "\n".
# - Both inputs are flattened from (n, 1) column vectors to 1-D label arrays.
print(classification_report(Ytest.ravel(), y_pred_label.ravel(), labels=list(range(16))))

## Train model on entire data

In [None]:
# Reset the prediction total to 0 and build the stratified 5-fold splitter
# (shuffled, fixed seed) for the run on the entire training data
y_pred = 0
kfold = StratifiedKFold(
    n_splits=5,
    random_state=10,
    shuffle=True,
)

In [None]:
# Train the model using K-fold on the entire labelled data: one booster per fold, each
# predicting the unlabelled prediction set.
# The fold predictions are gathered in a list created inside this cell and averaged at
# the end, so re-running the cell never adds onto a y_pred left over from a previous run.
fold_predictions = []

for fit_idx, val_idx in kfold.split(Xtrain_full, Ytrain_full):
    # The fold subsets are used inline so the notebook-level train_x / train_y
    # arrays are not overwritten by the loop.
    lgtrain = lgb.Dataset(Xtrain_full[fit_idx], label=Ytrain_full[fit_idx].ravel())
    lgvalidation = lgb.Dataset(Xtrain_full[val_idx], label=Ytrain_full[val_idx].ravel())

    model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgvalidation], early_stopping_rounds=200, verbose_eval=1000)
    # Predict with the iteration that scored best on this fold's validation part
    pred = model.predict(Xpredict, num_iteration=model.best_iteration)

    fold_predictions.append(pred)

# Mean of the per-fold class-probability matrices
counter = len(fold_predictions)
y_pred = sum(fold_predictions) / float(counter)

## Create submission file

In [None]:
# One probability column per class, named "0" .. "15"
class_columns = [str(class_id) for class_id in range(16)]
submit_df = pd.DataFrame(y_pred, columns=class_columns)
submit_df.head()

In [None]:
# Write the submission file. The output folder is created first (no-op when it
# already exists) so that a missing "Predictions/" directory cannot fail the export
# after the long training run.
submission_file = "Predictions/predictions_v1.xlsx"
Path(submission_file).parent.mkdir(parents=True, exist_ok=True)
submit_df.to_excel(submission_file, index=False)