In [None]:
# %pip install "fastai<2"   # this notebook uses the fastai v1 API (fastai.tabular.TabularList, fastai.train.ShowGraph)

In [None]:
from fastai.tabular import tabular_learner, TabularList, Categorify, Learner, accuracy
from fastai.train import ShowGraph
import pandas as pd
import numpy as np
import torch
import random
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: default figure size and base font size.
plt.rc('figure', figsize=(12.5,5))
plt.rc('font', size=16)

**Set random seed**

In [None]:
def random_seed(seed_value, use_cuda=True):
    """Seed every random number generator the notebook relies on.

    Parameters:
    seed_value: seed handed to NumPy, PyTorch and Python's `random` module
    use_cuda: when True, also seed the CUDA generators and set the cuDNN
        flags `deterministic=True` / `benchmark=False`
    """
    # CPU-side generators: NumPy, PyTorch, then the Python stdlib.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# Seed NumPy, PyTorch and Python's `random` once up front (CUDA seeding included by default).
random_seed(123)

**Defining the Model Parameters**

In [None]:
# Hyperparameters read by fit_model_cv and fit_model_test.
params = dict(
    epochs=800,                # number of epochs passed to fit_one_cycle
    bs=128,                    # batch size of the DataBunch
    layers=[20, 20, 20, 20],   # `layers` argument of tabular_learner
    wd=0.0,                    # `wd` argument of fit_one_cycle
    ps=0.0,                    # `ps` argument of tabular_learner
    emb_drop=0.0,              # `emb_drop` argument of tabular_learner
    lr=0.1,                    # `max_lr` argument of fit_one_cycle
    use_GPU=False,             # when True the learner is converted with to_fp16()
    n_folds=5,                 # number of StratifiedKFold splits
)

# Directory prefix (trailing slash required) prepended to the data file names.
data_dir = 'poker_hand_data/'

In [None]:
def load_dataset():
    """Read the training and testing CSV files found under `data_dir`.

    Both files are read without a header row; their columns are then named
    S1, C1, ..., S5, C5, Hand and the shape of each frame is printed.

    Returns a tuple (train, test) of pandas DataFrames
    """
    column_names = ['S1','C1','S2','C2','S3','C3','S4','C4','S5','C5','Hand']
    file_names = ('poker-hand-training.data', 'poker-hand-testing.data')

    # Read both files first, then label the columns (train before test).
    train, test = (pd.read_csv(data_dir + file_name, header=None) for file_name in file_names)
    for frame in (train, test):
        frame.columns = list(column_names)

    for label, frame in (('Train', train), ('Test', test)):
        print(f'{label} shape:{frame.shape}')

    return (train, test)

**Helper Functions**

In [None]:
def add_training_to_list(learn, train_losses:list, valid_losses:list, metrics:list):
    """Collect per-epoch values from `learn.recorder` into the given lists.

    Parameters:
    learn: object exposing `recorder.losses` (one entry per iteration),
        `recorder.val_losses` and `recorder.metrics` (one entry per epoch)
    train_losses: list extended with the loss of the last iteration of each epoch
    valid_losses: list extended with the recorded validation losses
    metrics: list extended with the first metric of each epoch

    Returns the three (mutated) lists as a tuple (train_losses, valid_losses, metrics)
    """
    recorder = learn.recorder
    n_epochs = len(recorder.val_losses)
    steps_per_epoch = len(recorder.losses) // n_epochs

    # Every `steps_per_epoch`-th entry, starting at the end of the first epoch.
    last_of_each_epoch = recorder.losses[steps_per_epoch - 1 : steps_per_epoch * n_epochs : steps_per_epoch]
    train_losses.extend(loss.numpy() for loss in last_of_each_epoch)

    metrics.extend(recorder.metrics[epoch][0].numpy() for epoch in range(n_epochs))
    valid_losses.extend(recorder.val_losses)
    return train_losses, valid_losses, metrics

def add_fold_to_dataframe(train_losses:list, valid_losses:list, metrics:list, train_losses_df, valid_losses_df, metrics_df):
    """Append one fold's per-epoch values as a new row to each tracking DataFrame.

    Parameters:
    train_losses, valid_losses, metrics: per-epoch values of the current fold
    train_losses_df, valid_losses_df, metrics_df: DataFrames with one row per
        fold processed so far and one column per epoch (labelled 0..n-1)

    Returns the three extended DataFrames as a tuple
    (train_losses_df, valid_losses_df, metrics_df); the inputs are not modified.
    """
    def _with_row(frame, values):
        # One-row frame whose columns are the positions 0..n-1 of `values`.
        row = pd.Series(values).to_frame().T
        if len(frame.index) == 0 and len(frame.columns) == 0:
            # Still-empty accumulator: the new row is the whole result.
            return row.reset_index(drop=True)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        return pd.concat([frame, row], ignore_index=True)

    train_losses_df = _with_row(train_losses_df, train_losses)
    valid_losses_df = _with_row(valid_losses_df, valid_losses)
    metrics_df = _with_row(metrics_df, metrics)
    return train_losses_df, valid_losses_df, metrics_df

def get_avg_column_val(df):
    """Return the mean of every column of `df` as a list, in column order.

    Columns are accessed by position, so the frame does not need the integer
    labels 0..n-1 (for such frames in ascending order the result is the same
    as a lookup by label).

    Parameters:
    df: pandas DataFrame

    Returns a list with one mean per column (empty for a frame without columns)
    """
    return [df.iloc[:, position].mean() for position in range(df.shape[1])]

def kfold_results(train_losses_df, valid_losses_df, metrics_df):
    """Average the per-fold tracking frames column-wise into one results table.

    Parameters:
    train_losses_df, valid_losses_df, metrics_df: DataFrames whose columns are
        averaged with get_avg_column_val

    Returns a DataFrame with the columns 'epochs', 'train_loss', 'valid_loss'
    and 'accuracy', where 'epochs' counts from 0
    """
    train_loss = get_avg_column_val(train_losses_df)
    valid_loss = get_avg_column_val(valid_losses_df)
    accuracy = get_avg_column_val(metrics_df)

    return pd.DataFrame({
        'epochs': np.arange(len(train_loss)),
        'train_loss': train_loss,
        'valid_loss': valid_loss,
        'accuracy': accuracy,
    })

def print_kfold_results(results):
    """Print a summary of the cross-validation results.

    Reports the number of rows (epochs) and the 'accuracy', 'train_loss' and
    'valid_loss' values of the last row, followed by the full table.

    Parameters:
    results: DataFrame with the columns 'accuracy', 'train_loss' and 'valid_loss'
    """
    total_epochs = results.shape[0]
    final_row = results.iloc[-1]
    # Look up all three values before anything is printed.
    last_accuracy, last_train_loss, last_valid_loss = [
        final_row[key] for key in ('accuracy', 'train_loss', 'valid_loss')
    ]

    rule = '-'*20
    print(rule, '\n', 'Overall results (averaged over folds)')
    print(rule, '\n')
    for line in (
        f'Number of epochs: {total_epochs}',
        f'Accuracy: {last_accuracy}',
        f'Train loss: {last_train_loss}',
        f'Valid loss: {last_valid_loss}',
    ):
        print(line)
    print(results)

def plot_kfold_results(results, bs, max_ticks=20):
    """Plot the fold-averaged loss curves and accuracy per epoch.

    Draws a 2x1 figure: training and validation loss on top, accuracy below.

    Parameters:
    results: DataFrame with the columns 'train_loss', 'valid_loss' and
        'accuracy', one row per epoch
    bs: batch size; accepted for interface compatibility, not used by the plot
    max_ticks: upper bound on the number of x-axis ticks. The original placed
        one tick per epoch, which is unreadable for long runs; runs with at
        most `max_ticks` epochs still get one tick per epoch.
    """
    nb_epochs = results.shape[0]
    epochs = list(range(nb_epochs))

    # ceil(nb_epochs / max_ticks), but never below 1
    tick_step = max(1, -(-nb_epochs // max(1, max_ticks)))
    ticks = np.arange(0, nb_epochs, tick_step)

    fig, ax = plt.subplots(2, 1, figsize=(8, 12))
    loss_ax, accuracy_ax = ax[0], ax[1]
    fig.suptitle('Results - averaged over folds')

    loss_ax.plot(epochs, results['train_loss'], label='Training loss')
    loss_ax.plot(epochs, results['valid_loss'], label='Validation loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.xaxis.set_ticks(ticks)
    loss_ax.set_ylabel('Loss')
    loss_ax.legend(loc='upper right')

    accuracy_ax.plot(epochs, results['accuracy'])
    accuracy_ax.xaxis.set_ticks(ticks)
    accuracy_ax.set_xlabel('Epoch')
    accuracy_ax.set_ylabel('Accuracy')

In [None]:
def test_metrics(learner:Learner, dep_var, test_df, procs=None, bs=64):
    '''Validate a trained learner on a separate test DataFrame.

    Parameters:
    learner: trained fastai Learner; its `data.valid_dl` is replaced by the
        test loader as a side effect
    dep_var: label column(s) of `test_df`, passed to label_from_df
    test_df: testing data as pandas DataFrame
    procs: fastai preprocessing steps applied to `test_df`; when None the
        notebook-level `preProcc` list is used (the behaviour before `procs`
        was honoured)
    bs: batch size of the test loader

    Returns the result of `learner.validate()` (the caller in this notebook
    reads index 0 as the loss and index 1 as the accuracy)
    '''
    # The original ignored `procs` and always used the global `preProcc`.
    if procs is None:
        procs = preProcc

    #get continuous and categorical variable names from the learner
    cat_names=learner.data.cat_names
    cont_names=learner.data.cont_names

    #create fastai Databunch that holds the data in batches for the algorithm
    data = (TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names, procs=procs)
                    .split_none()
                    .label_from_df(dep_var)
                    .databunch(bs=bs))
    # split_none() puts every row into the training split, so that loader is
    # reused as the validation loader.
    # NOTE(review): a training loader may shuffle and drop the last partial
    # batch - confirm that all test rows are actually evaluated.
    # NOTE(review): `procs` are fitted on test_df here rather than reused from
    # the training data - confirm the category encodings match.
    data.valid_dl = data.train_dl

    #validate algorithm
    learner.data.valid_dl = data.valid_dl
    return learner.validate()

In [None]:
def fit_model_cv(df):
    '''
    Train the tabular network with stratified k-fold cross-validation

    Uses the notebook-level `params`, `dep_var`, `cat_vars`, `preProcc` and
    `emb_szs`. Prints a header per fold, then prints and plots the results
    averaged over all folds.

    Parameters:
    df: training data as pandas DataFrame (copied, the caller's frame is untouched)

    Returns the cross-validation results as a DataFrame (one row per epoch)
    '''
    df = df.copy()
    use_GPU = params['use_GPU']

    #one row per fold is added to each of these frames
    train_losses_df, valid_losses_df, metrics_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    #prepare cross validation
    random_seed(123)
    splitter = StratifiedKFold(params['n_folds'], shuffle=True, random_state=1)
    features = df.loc[:, df.columns != dep_var[0]]
    labels = df[dep_var]

    #iterate over folds (numbered from 1); only the validation indices are needed
    for fold_idx, (_, valid_idxs) in enumerate(splitter.split(features, labels), start=1):
        print('-'*20, '\n', f'> Fold: {fold_idx}')
        print('-'*20)

        #create fastai Databunch with this fold's rows held out for validation
        random_seed(123)
        fold_items = TabularList.from_df(df, cat_names=cat_vars, procs=preProcc)
        data = (fold_items.split_by_idx(valid_idxs)
                          .label_from_df(dep_var)
                          .databunch(bs=params['bs']))

        #create fastai learner that handles the training and prediction of the neural network
        learn = tabular_learner(
            data,
            layers=params['layers'],
            metrics=accuracy,
            ps=params['ps'],
            emb_szs=emb_szs,
            emb_drop=params['emb_drop'],
            callback_fns=[ShowGraph],
        )
        if use_GPU:
            learn = learn.to_fp16()

        #fit neural network
        random_seed(123)
        learn.fit_one_cycle(params['epochs'], max_lr=params['lr'], wd=params['wd'])

        #collect this fold's per-epoch values into fresh lists
        fold_train, fold_valid, fold_metrics = add_training_to_list(learn, list(), list(), list())

        #add fold to dataframe
        train_losses_df, valid_losses_df, metrics_df = add_fold_to_dataframe(
            fold_train, fold_valid, fold_metrics,
            train_losses_df, valid_losses_df, metrics_df,
        )

    #show results
    results = kfold_results(train_losses_df, valid_losses_df, metrics_df)
    print_kfold_results(results)
    plot_kfold_results(results, params['bs'])

    return results

def fit_model_test(train, test):
    '''
    Train the tabular network on the complete training data and report
    loss and accuracy on the test data

    Uses the notebook-level `params`, `dep_var`, `cat_vars`, `preProcc` and
    `emb_szs`.

    Parameters:
    train: training data as pandas DataFrame
    test: testing data as pandas DataFrame

    Returns trained fastai Learner
    '''
    #work on copies so the caller's frames stay untouched
    train = train.copy()
    test = test.copy()
    use_GPU = params['use_GPU']

    #create fastai Databunch from all training rows (no validation split)
    random_seed(123)
    train_items = TabularList.from_df(train, cat_names=cat_vars, procs=preProcc)
    data = (train_items.split_none()
                       .label_from_df(dep_var)
                       .databunch(bs=params['bs']))
    print(data)

    #create fastai learner that handles the training and prediction of the neural network
    learn = tabular_learner(
        data,
        layers=params['layers'],
        metrics=accuracy,
        ps=params['ps'],
        emb_szs=emb_szs,
        emb_drop=params['emb_drop'],
        callback_fns=[ShowGraph],
    )
    if use_GPU:
        learn = learn.to_fp16()

    #optional learning rate range test (disabled):
    #    random_seed(123)
    #    learn.lr_find()
    #    fig = learn.recorder.plot(suggestion=False, return_fig=True)

    #fit neural network on training set
    random_seed(123)
    learn.fit_one_cycle(params['epochs'], max_lr=params['lr'], wd=params['wd'])

    #evaluate neural network with test set
    test_scores = np.asarray(
        test_metrics(learner=learn, dep_var=dep_var, test_df=test, procs=preProcc, bs=params['bs'])
    )
    print('-'*15)
    print('Results:')
    print(f'Test loss: {test_scores[0]}')
    print(f'Test Accuracy: {test_scores[1]}')
    return learn

**Create & Train Neural Network**

In [None]:
#categorical input columns, in file order: S1, C1, S2, C2, ..., S5, C5
cat_vars = [f'{prefix}{card}' for card in range(1, 6) for prefix in ('S', 'C')]

#dependent variable (label column), kept as a one-element list
dep_var = ['Hand']

#fastai data preparation processes applied when building the data
preProcc = [Categorify]

In [None]:
#load dataset
train,test = load_dataset()

#define the embedding dimension of every categorical column with the fastai rule
#min(600, round(1.6 * n_cat**0.56)); to give all variables the same dimension
#instead, replace the computed value with a constant (e.g. emb_sz = 7).
#The cardinalities are counted on the TRAINING data (the original counted them
#on the test set) so that no model setting is derived from the held-out data.
emb_szs = {}
print('Entity Embedding Dimensions:')
for column in cat_vars:
    n_cat = train[column].nunique()
    emb_sz = min(600, round(1.6 * n_cat**0.56))
    emb_szs[column] = emb_sz
    print(f'{column}: {n_cat} -> {emb_sz}')
#keep the chosen dimensions next to the other hyperparameters
params['emb_szs'] = emb_szs

In [None]:
#cross validation on the training set; the returned per-epoch results
#DataFrame is the last expression of the cell and is shown as its output
fit_model_cv(train)

In [None]:
#fit model on complete training data and test with test data;
#the trained learner is kept in `learn`
learn = fit_model_test(train, test)