# Kaggle Competition

In [1]:
from typing import Iterable, Tuple
import joblib
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


## Feature design

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data


Unnamed: 0,Id,Band Name,Band Genre,Band Country of Origin,Band Debut,Concert ID,Concert Attendance,Inside Venue,Rain,Seated,Personnality Trait 1,Personnality Trait 2,Personnality Trait 3,Personnality Trait 4,Concert Goer Age,Concert Goer ID,Height (cm),Concert Goer Country of Origin,Concert Enjoyment
0,ConcertExperience_180106,Teenage Crazy Blue Knickers,Indie/Alt Rock,United States of America (USA),1976.0,900.0,2980.0,False,False,,0.330843,-0.958408,-0.943548,-1.636806,29.0,concert_goer_1985,140.0,Paraguay,Did Not Enjoy
1,ConcertExperience_146268,Beyond Devon,Pop Music,United States of America (USA),1968.0,731.0,54.0,True,False,True,-2.069449,0.017777,-1.910675,0.610265,43.0,concert_goer_1874,158.0,United Kingdom (UK),Enjoyed
2,ConcertExperience_128743,Ron Talent,Rock n Roll,Canada,1955.0,,162754.0,False,False,True,-0.484268,1.968772,-0.064167,-1.260871,68.0,concert_goer_442,159.0,United States of America (USA),Did Not Enjoy
3,ConcertExperience_140839,Devon Revival,RnB,United States of America (USA),1992.0,704.0,8103.0,False,True,False,-0.858054,1.022827,-0.348389,-1.147251,17.0,concert_goer_1149,150.0,Canada,Worst Concert Ever
4,ConcertExperience_19149,Beyond Devon,Pop Music,United States of America (USA),1968.0,95.0,54.0,False,False,False,-0.793029,-1.166528,-0.043766,0.969661,59.0,concert_goer_930,166.0,United Kingdom (UK),Did Not Enjoy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169995,ConcertExperience_14055,Crazy Joystick Cult,RnB,Canada,1985.0,70.0,162754.0,True,False,False,-0.095021,0.175175,0.914245,0.357359,50.0,concert_goer_707,180.0,United States of America (USA),Did Not Enjoy
169996,ConcertExperience_192792,Crazy Joystick Cult,RnB,Canada,1985.0,963.0,54.0,False,False,False,-0.733719,-0.285776,-0.323312,0.641180,71.0,concert_goer_1373,143.0,Bulgaria,Worst Concert Ever
169997,ConcertExperience_152942,"Why Frogs, Why?",Heavy Metal,Canada,2005.0,764.0,54.0,False,False,False,0.744969,-0.965547,1.020598,1.027389,27.0,concert_goer_1286,176.0,Canada,Did Not Enjoy
169998,ConcertExperience_138957,Twilight of the Joystick Gods,Hip Hop/Rap,United States of America (USA),1995.0,694.0,22026.0,False,True,True,0.821976,0.351411,0.175762,1.455654,39.0,concert_goer_1845,176.0,Canada,Did Not Enjoy


### Complete missing values

In [3]:
# train data statistics
value = train_data.mode().loc[0] # most frequent strategy

# fill nan values
train_data.fillna(value=value, inplace=True)
test_data.fillna(value=value, inplace=True)

# convert Concert ID to string
train_data['Concert ID'] = train_data['Concert ID'].map(lambda x: str(x))
test_data['Concert ID'] = test_data['Concert ID'].map(lambda x: str(x))


### Preprocessing

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, StandardScaler, LabelEncoder


# instantiate transformers
mlb = MultiLabelBinarizer()
mms = MinMaxScaler()
ss = StandardScaler()
le = LabelEncoder()


# fit transformers on train data
mlb.fit( train_data[['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert ID', 'Concert Goer ID', 'Concert Goer Country of Origin']].to_numpy() )
mms.fit( train_data[['Band Debut', 'Concert Attendance', 'Concert Goer Age', 'Height (cm)']].to_numpy() )
ss.fit( train_data[['Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3', 'Personnality Trait 4']].to_numpy() )
le.fit( train_data['Concert Enjoyment'].to_numpy() )


# apply transformers on train data
X_train = np.hstack((
    mlb.transform( train_data[['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert ID', 'Concert Goer ID', 'Concert Goer Country of Origin']].to_numpy() ),
    mms.transform( train_data[['Band Debut', 'Concert Attendance', 'Concert Goer Age', 'Height (cm)']].to_numpy() ),
    ss.transform( train_data[['Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3', 'Personnality Trait 4']].to_numpy() ),
    train_data[['Inside Venue', 'Rain', 'Seated']].to_numpy(dtype=np.float32)
))

y_train = le.transform( train_data['Concert Enjoyment'].to_numpy() )


# apply transformers on test data
X_test = np.hstack((
    mlb.transform( test_data[['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert ID', 'Concert Goer ID', 'Concert Goer Country of Origin']].to_numpy() ),
    mms.transform( test_data[['Band Debut', 'Concert Attendance', 'Concert Goer Age', 'Height (cm)']].to_numpy() ),
    ss.transform( test_data[['Personnality Trait 1', 'Personnality Trait 2', 'Personnality Trait 3', 'Personnality Trait 4']].to_numpy() ),
    test_data[['Inside Venue', 'Rain', 'Seated']].to_numpy(dtype=np.float32)
))


In [5]:
from sklearn.decomposition import SparsePCA


spca = SparsePCA(n_components=500, random_state=42)

spca.fit( X_train )


### Train / validation split

In [5]:
from sklearn.model_selection import train_test_split


# 0.9:0.1 (train:validation) split for model validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


### Torch Dataset

In [6]:
class ConcertDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = X
        self.y = y

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor]:
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        label = torch.tensor(self.y[idx], dtype=torch.long) # type long required for CrossEntropyLoss
        return features, label


train_dataset = ConcertDataset(X_train, y_train)
val_dataset = ConcertDataset(X_val, y_val)


## Algorithms

In [32]:
from sklearn.linear_model import LogisticRegression # discriminative
from sklearn.svm import LinearSVC # discriminant-based
from sklearn.tree import DecisionTreeClassifier # decision tree


class FNN3Classifier(nn.Module):
    def __init__(self, in_features, num_nodes, out_features) -> None:
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, num_nodes),
            nn.ReLU(),
            nn.Linear(num_nodes, num_nodes),
            nn.ReLU(),
            nn.Linear(num_nodes, out_features)
        )
    
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits


class FNN8Classifier(nn.Module):
    def __init__(self, in_features, out_features) -> None:
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, out_features)
        )
    
    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits


## Methodology

### Logistic Regression

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score


logistic_classifier = LogisticRegression(penalty='l2', random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 100000]
}

# k-fold cross-validator
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# hyper-parameter optimizer
logistic_classifier_grid_search = GridSearchCV(
    estimator=logistic_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=k_fold
)

# fit grid search
logistic_classifier_grid_search.fit(X_train, y_train)

# prediction
y_pred_logistic_classifier = logistic_classifier_grid_search.predict(X_val)

# compute validation accuracy
accuracy_logistic_classifier = accuracy_score(y_val, y_pred_logistic_classifier)

# save grid search
joblib.dump(logistic_classifier_grid_search, f'storage/logistic_classifier_acc{accuracy_logistic_classifier:.5f}.joblib')


### Linear Support Vector Classification

In [None]:

linear_svc = LinearSVC(penalty='l2', loss='squared_hinge', random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000, 100000]
}

# k-fold cross-validator
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# hyper-parameter optimizer
linear_svc_grid_search = GridSearchCV(
    estimator=linear_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=k_fold
)

# fit grid search
linear_svc_grid_search.fit(X_train, y_train)

# prediction
y_pred_linear_svc = linear_svc_grid_search.predict(X_val)

# compute validation accuracy
accuracy_linear_svc = accuracy_score(y_val, y_pred_linear_svc)

# save grid search
joblib.dump(linear_svc_grid_search, f'storage/linear_svc_acc{accuracy_linear_svc:.5f}.joblib')


### Decision Tree Classifier

In [None]:

decision_tree_classifier = DecisionTreeClassifier(criterion='gini', random_state=42)

param_grid = {
    'max_depth': [10, 20, 30, 40, 50]
}

# k-fold cross-validator
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# hyper-parameter optimizer
decision_tree_classifier_grid_search = GridSearchCV(
    estimator=decision_tree_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=k_fold
)

# fit grid search
decision_tree_classifier_grid_search.fit(X_train, y_train)

# prediction
y_pred_decision_tree_classifier = decision_tree_classifier_grid_search.predict(X_val)

# compute validation accuracy
accuracy_decision_tree_classifier = accuracy_score(y_val, y_pred_decision_tree_classifier)

# save grid search
joblib.dump(decision_tree_classifier_grid_search, f'storage/decision_tree_classifier_acc{accuracy_decision_tree_classifier:.5f}.joblib')


### Neural Networks Pipeline

In [35]:

def validation(val_dataloader: DataLoader, model: nn.Module, loss_fn: nn.CrossEntropyLoss) -> float:
    """Validation loop

    Args:
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.CrossEntropyLoss)

    Returns:
        float: val loss
    """
    size = len(val_dataloader.dataset)
    num_batches = len(val_dataloader)
    val_loss = 0.
    correct = 0.

    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            val_loss += loss_fn(outputs, labels).item()
            correct += (outputs.argmax(1) == labels).type(torch.float).sum().item()
    
    val_loss /= num_batches
    correct /= size
    print(f"Validation Error:\n    Accuracy: {(100*correct):>.2f}%\n    Loss: {val_loss:>.8f}\n")
    return val_loss, correct


def epoch(train_dataloader: DataLoader, val_dataloader: DataLoader, 
            model: nn.Module, loss_fn: nn.CrossEntropyLoss, optimizer: torch.optim.Adam) -> tuple:
    """Train one epoch

    Args:
        train_dataloader (DataLoader)
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.CrossEntropyLoss)
        optimizer (torch.optim.Adam)

    Returns:
        tuple
    """
    epoch_train_losses = []
    epoch_train_steps = []

    size = len(train_dataloader.dataset)

    for step, (features, labels) in enumerate(train_dataloader):

        # forward pass
        outputs = model(features)
        loss = loss_fn(outputs, labels)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.item()

        epoch_train_losses.append(loss)
        epoch_train_steps.append(step)

        if step % 100 == 0:
            current = step * len(features)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        
        if step == 0:

            # forward pass
            val_loss, correct = validation(val_dataloader, model, loss_fn)
        
            epoch_val_loss = val_loss
            epoch_val_accuracy = correct
            epoch_val_step= step
    
    return epoch_train_losses, epoch_train_steps, epoch_val_loss, epoch_val_accuracy, epoch_val_step


def train(num_epochs: int, train_dataloader: DataLoader, val_dataloader: DataLoader, 
            model: nn.Module, loss_fn: nn.MSELoss, optimizer: torch.optim.Adam) -> tuple:
    """Train loop

    Args:
        num_epochs (int)
        train_dataloader (DataLoader)
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.MSELoss)
        optimizer (torch.optim.Adam)

    Returns:
        tuple
    """
    train_losses = []
    train_steps = []
    val_losses = []
    val_accuracies = []
    val_steps = []
    epoch_last_step = 0

    best_val_accuracy = 0

    try:
        for t in range(num_epochs):
            print(f"Epoch {t+1}\n-------------------------------")

            epoch_train_losses, epoch_train_steps, \
                epoch_val_loss, epoch_val_accuracy, epoch_val_step = \
                    epoch(train_dataloader, val_dataloader, model, loss_fn, optimizer)
            
            if epoch_val_accuracy > best_val_accuracy:
                best_val_accuracy = epoch_val_accuracy
                best_state = model.state_dict()

            train_losses += epoch_train_losses
            train_steps += [step + epoch_last_step for step in epoch_train_steps]

            val_losses.append( epoch_val_loss )
            val_accuracies.append( epoch_val_accuracy )
            val_steps.append( epoch_val_step + epoch_last_step )

            epoch_last_step = train_steps[-1]

    # early stopping
    except KeyboardInterrupt:
        pass

    return train_losses, train_steps, val_losses, val_accuracies, val_steps, best_val_accuracy, best_state


### Neural Networks Training

In [37]:

# FNN hyper-parameters
N_NODES = 1024

# training hyper-parameters
BATCH_SIZE = 256
LEARNING_RATE = 1e-4
N_EPOCHS = 10

# DataLoader wraps an iterable around the Dataset
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)


model = FNN3Classifier(in_features=X_train.shape[1], num_nodes=N_NODES, out_features=4).to('cpu')


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


# train model
train_losses, train_steps, val_losses, val_accuracies, val_steps, best_val_accuracy, best_state = \
    train(N_EPOCHS, train_dataloader, val_dataloader, model, loss_fn, optimizer)


# save model weights
torch.save(best_state, f'storage/FNN3_{N_NODES}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_ep{N_EPOCHS}_acc{best_val_accuracy:.5f}.pth')


# save stats
stats = {   
    'train_steps': train_steps,
    'train_losses': train_losses,
    'val_steps': val_steps,
    'val_accuracies': val_accuracies,
    'val_losses': val_losses
}

with open(f'storage/FNN3_{N_NODES}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_ep{N_EPOCHS}_acc{best_val_accuracy:.5f}.pkl', 'wb') as handle:
    pickle.dump(stats, handle)


Epoch 1
-------------------------------
loss: 1.392023  [    0/153000]
Validation Error:
    Accuracy: 40.27%
    Loss: 1.38555557



UnboundLocalError: local variable 'best_state' referenced before assignment

## Results

In [38]:
from sklearn.metrics import classification_report, confusion_matrix


### Neural Networks

In [42]:
N_NODES = 1024

model = FNN3Classifier(in_features=X_train.shape[1], num_nodes=N_NODES, out_features=4)
model.load_state_dict( torch.load('storage/FNN3_1024_bs256_lr0.0001_ep10_acc0.68035.pth') )
model.eval()

outputs = model( torch.tensor(X_test, dtype=torch.float32) )
predictions = le.inverse_transform( outputs.argmax(axis=1) )

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Predicted': predictions
})


In [45]:
submission.to_csv('submissions/FNN3_1024_bs256_lr0.0001_ep10_acc0.68035.csv', index=False)


In [11]:

def test(test_dataloader: DataLoader, model: nn.Module) -> float:
    """Validation loop

    Args:
        test_dataloader (DataLoader)
        model (nn.Module)

    Returns:
        float: val loss
    """
    predictions = []

    with torch.no_grad():
        for feature_tensors, _ in test_dataloader:
            outputs = model(feature_tensors)
            predictions.append(outputs.argmax(1))
    
    predictions = torch.cat(predictions, dim=0)

    return predictions.numpy()


In [20]:
label_inverse_map = {
    0: 'Worst Concert Ever',
    1: 'Did Not Enjoy',
    2: 'Enjoyed',
    3: 'Best Concert Ever'
}

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

predictions = test(test_dataloader, model)
predictions = pd.Series(predictions).apply( lambda x: label_inverse_map[x] )

submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Predicted': predictions
})

submission


Unnamed: 0,Id,Predicted
0,ConcertExperience_70055,Best Concert Ever
1,ConcertExperience_34799,Enjoyed
2,ConcertExperience_100410,Enjoyed
3,ConcertExperience_106446,Did Not Enjoy
4,ConcertExperience_127249,Did Not Enjoy
...,...,...
29995,ConcertExperience_82288,Did Not Enjoy
29996,ConcertExperience_27139,Enjoyed
29997,ConcertExperience_197434,Enjoyed
29998,ConcertExperience_166029,Worst Concert Ever


In [21]:
submission.to_csv('submissions/FNN_100_bs32_lr0.001_ep12.csv', index=False)