# Kaggle Competition

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.transforms import LabelToIndex, ToTensor


  from .autonotebook import tqdm as notebook_tqdm


## Feature design

In [2]:
# Read the labelled data once, then split it by row position: the first
# 140000 rows are used for fitting and every remaining row for validation.
train = pd.read_csv('train.csv')
train_data = train.iloc[:140000].copy()  # independent copy, safe to modify later
val_data = train.iloc[140000:].copy()
train  # last expression of the cell: displays the loaded frame

Unnamed: 0,Id,Band Name,Band Genre,Band Country of Origin,Band Debut,Concert ID,Concert Attendance,Inside Venue,Rain,Seated,Personnality Trait 1,Personnality Trait 2,Personnality Trait 3,Personnality Trait 4,Concert Goer Age,Concert Goer ID,Height (cm),Concert Goer Country of Origin,Concert Enjoyment
0,ConcertExperience_180106,Teenage Crazy Blue Knickers,Indie/Alt Rock,United States of America (USA),1976.0,900.0,2980.0,False,False,,0.330843,-0.958408,-0.943548,-1.636806,29.0,concert_goer_1985,140.0,Paraguay,Did Not Enjoy
1,ConcertExperience_146268,Beyond Devon,Pop Music,United States of America (USA),1968.0,731.0,54.0,True,False,True,-2.069449,0.017777,-1.910675,0.610265,43.0,concert_goer_1874,158.0,United Kingdom (UK),Enjoyed
2,ConcertExperience_128743,Ron Talent,Rock n Roll,Canada,1955.0,,162754.0,False,False,True,-0.484268,1.968772,-0.064167,-1.260871,68.0,concert_goer_442,159.0,United States of America (USA),Did Not Enjoy
3,ConcertExperience_140839,Devon Revival,RnB,United States of America (USA),1992.0,704.0,8103.0,False,True,False,-0.858054,1.022827,-0.348389,-1.147251,17.0,concert_goer_1149,150.0,Canada,Worst Concert Ever
4,ConcertExperience_19149,Beyond Devon,Pop Music,United States of America (USA),1968.0,95.0,54.0,False,False,False,-0.793029,-1.166528,-0.043766,0.969661,59.0,concert_goer_930,166.0,United Kingdom (UK),Did Not Enjoy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169995,ConcertExperience_14055,Crazy Joystick Cult,RnB,Canada,1985.0,70.0,162754.0,True,False,False,-0.095021,0.175175,0.914245,0.357359,50.0,concert_goer_707,180.0,United States of America (USA),Did Not Enjoy
169996,ConcertExperience_192792,Crazy Joystick Cult,RnB,Canada,1985.0,963.0,54.0,False,False,False,-0.733719,-0.285776,-0.323312,0.641180,71.0,concert_goer_1373,143.0,Bulgaria,Worst Concert Ever
169997,ConcertExperience_152942,"Why Frogs, Why?",Heavy Metal,Canada,2005.0,764.0,54.0,False,False,False,0.744969,-0.965547,1.020598,1.027389,27.0,concert_goer_1286,176.0,Canada,Did Not Enjoy
169998,ConcertExperience_138957,Twilight of the Joystick Gods,Hip Hop/Rap,United States of America (USA),1995.0,694.0,22026.0,False,True,True,0.821976,0.351411,0.175762,1.455654,39.0,concert_goer_1845,176.0,Canada,Did Not Enjoy


In [21]:
    
class ConcertDataset(Dataset):
    """Map-style dataset yielding ``(feature_tensor, label_tensor)`` per row.

    Categorical columns are one-hot encoded against the vocabularies passed in
    ``train_stats``, four numeric columns are min-max scaled with the ranges
    passed in ``train_stats``, and the boolean and personality-trait columns are
    appended as plain floats.
    """

    def __init__(self, data: pd.DataFrame, train_stats: dict):
        """Constructor

        Args:
            data (pd.DataFrame): Rows to serve; must contain the feature columns
                read by ``transform`` and a 'Concert Enjoyment' column.
            train_stats (dict): Encoding statistics. Reads the array-valued keys
                'unique_band_name', 'unique_band_genre', 'unique_band_country',
                'unique_goer_country', 'unique_concert_enjoyment' (``.size`` and
                ``.tolist()`` are called on them) and the ``(min, max)`` pairs
                'minmax_band_debut', 'minmax_concert_attendance',
                'minmax_goer_age', 'minmax_height'.
        """
        self.data = data
        # Class index of each enjoyment label.
        self.label_map = {
            'Worst Concert Ever': 0,
            'Did Not Enjoy': 1,
            'Enjoyed': 2,
            'Best Concert Ever': 3
        }

        # text transforms
        self.band_name_size = train_stats['unique_band_name'].size
        self.band_name_transform = LabelToIndex( train_stats['unique_band_name'].tolist() )

        self.band_genre_size = train_stats['unique_band_genre'].size
        self.band_genre_transform = LabelToIndex( train_stats['unique_band_genre'].tolist() )

        self.band_country_size = train_stats['unique_band_country'].size
        self.band_country_transform = LabelToIndex( train_stats['unique_band_country'].tolist() )

        self.goer_country_size = train_stats['unique_goer_country'].size
        self.goer_country_transform = LabelToIndex( train_stats['unique_goer_country'].tolist() )

        # Not used by __getitem__ (labels go through label_map); kept as attributes.
        self.concert_enjoyment_size = train_stats['unique_concert_enjoyment'].size
        self.concert_enjoyment_transform = LabelToIndex( train_stats['unique_concert_enjoyment'].tolist() )

        # number transforms
        self.minmax_band_debut = train_stats['minmax_band_debut']
        self.minmax_concert_attendance = train_stats['minmax_concert_attendance']
        self.minmax_goer_age = train_stats['minmax_goer_age']
        self.minmax_height = train_stats['minmax_height']

    def min_max_scaler(self, x: float, min: float, max: float) -> torch.Tensor:
        """Scale ``x`` linearly so that ``min`` maps to 0 and ``max`` maps to 1.

        Args:
            x (float): Value to scale.
            min (float): Lower bound of the reference range.
            max (float): Upper bound of the reference range.

        Returns:
            torch.Tensor: Float tensor of shape ``(1,)``. When ``max == min`` the
            range is degenerate and 0 is returned instead of dividing by zero.
        """
        span = max - min
        if span == 0:
            # Constant column: every value sits at the (only) bound.
            return torch.zeros(1, dtype=torch.float)
        return torch.tensor([(x - min) / span], dtype=torch.float)

    def one_hot_encoder(self, x: int, size: int) -> torch.Tensor:
        """Return a float vector of length ``size`` with a 1 at position ``x``.

        Args:
            x (int): Index to set.
            size (int): Length of the output vector.

        Returns:
            torch.Tensor: One-hot float tensor of shape ``(size,)``.
        """
        return torch.zeros(size, dtype=torch.float).scatter_(0, torch.tensor(x), value=1)

    def transform(self, sample: pd.Series) -> torch.Tensor:
        """Encode one row as a flat feature vector.

        The layout is: one-hot band name, band genre, band country, goer
        country; scaled band debut, attendance, goer age, height; the three
        venue flags; the four personality traits.

        Args:
            sample (pd.Series): One row of the data frame.

        Returns:
            torch.Tensor: 1-D float feature tensor.
        """
        # (column, label -> index transform, vocabulary size)
        categorical = (
            ('Band Name', self.band_name_transform, self.band_name_size),
            ('Band Genre', self.band_genre_transform, self.band_genre_size),
            ('Band Country of Origin', self.band_country_transform, self.band_country_size),
            ('Concert Goer Country of Origin', self.goer_country_transform, self.goer_country_size),
        )
        # (column, (min, max) taken from the training statistics)
        scaled = (
            ('Band Debut', self.minmax_band_debut),
            ('Concert Attendance', self.minmax_concert_attendance),
            ('Concert Goer Age', self.minmax_goer_age),
            ('Height (cm)', self.minmax_height),
        )
        flags = ('Inside Venue', 'Rain', 'Seated')
        traits = (
            'Personnality Trait 1',
            'Personnality Trait 2',
            'Personnality Trait 3',
            'Personnality Trait 4',
        )

        parts = [
            self.one_hot_encoder(to_index(sample[column]), size)
            for column, to_index, size in categorical
        ]
        parts += [
            self.min_max_scaler(sample[column], *bounds)
            for column, bounds in scaled
        ]
        parts.append(torch.tensor([float(sample[column]) for column in flags], dtype=torch.float))
        parts.append(torch.tensor([sample[column] for column in traits], dtype=torch.float))

        return torch.cat(parts, dim=0)

    def __len__(self) -> int:
        """Number of rows in the wrapped data frame."""
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple:
        """Return ``(feature_tensor, label_tensor)`` for the row at position ``idx``.

        The label is the ``label_map`` index of the row's 'Concert Enjoyment'
        value, as a ``torch.long`` scalar tensor.
        """
        sample = self.data.iloc[idx]
        label_tensor = torch.tensor(self.label_map[sample['Concert Enjoyment']], dtype=torch.long)
        feature_tensor = self.transform(sample)
        return feature_tensor, label_tensor

In [22]:
# Replacement for missing cells, per column: text columns get the empty string,
# every other column gets its mean over the training rows.
value = {
    column: ('' if is_text else train_data[column].mean())
    for column, is_text in (
        ('Band Name', True),
        ('Band Genre', True),
        ('Band Country of Origin', True),
        ('Band Debut', False),
        ('Concert Attendance', False),
        ('Inside Venue', False),
        ('Rain', False),
        ('Seated', False),
        ('Personnality Trait 1', False),
        ('Personnality Trait 2', False),
        ('Personnality Trait 3', False),
        ('Personnality Trait 4', False),
        ('Concert Goer Age', False),
        ('Height (cm)', False),
        ('Concert Goer Country of Origin', True),
    )
}

# The same training-derived replacements are applied to both splits.
train_data = train_data.fillna(value=value)
val_data = val_data.fillna(value=value)

# Statistics the dataset needs for encoding, all computed on the (filled)
# training rows: category vocabularies first, then (min, max) ranges.
train_stats = {
    **{
        key: train_data[column].unique()
        for key, column in (
            ('unique_band_name', 'Band Name'),
            ('unique_band_genre', 'Band Genre'),
            ('unique_band_country', 'Band Country of Origin'),
            ('unique_goer_country', 'Concert Goer Country of Origin'),
            ('unique_concert_enjoyment', 'Concert Enjoyment'),
        )
    },
    **{
        key: (train_data[column].min(), train_data[column].max())
        for key, column in (
            ('minmax_band_debut', 'Band Debut'),
            ('minmax_concert_attendance', 'Concert Attendance'),
            ('minmax_goer_age', 'Concert Goer Age'),
            ('minmax_height', 'Height (cm)'),
        )
    },
}


In [23]:
# Both splits are wrapped with the statistics computed on the training rows,
# so validation samples are encoded and scaled the same way as training ones.
train_dataset, val_dataset = (
    ConcertDataset(data=split, train_stats=train_stats)
    for split in (train_data, val_data)
)


## Algorithms

In [24]:
class FNNClassificationModel(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers of equal width.

    Args:
        num_nodes: Width of both hidden layers.
        in_features (int): Length of the input feature vector. Defaults to 233,
            the value previously hard-coded here.
        num_classes (int): Number of output logits. Defaults to 4.
    """

    def __init__(self, num_nodes, in_features: int = 233, num_classes: int = 4) -> None:
        super().__init__()
        # Attribute name and layer order are unchanged, so state dicts saved by
        # the earlier definition still load.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, num_nodes),
            nn.ReLU(),
            nn.Linear(num_nodes, num_nodes),
            nn.ReLU(),
            nn.Linear(num_nodes, num_classes)
        )

    def forward(self, x):
        """Return the raw (unnormalised) class logits for ``x``."""
        logits = self.linear_relu_stack(x)
        return logits


## Methodology

In [25]:
def validation(val_dataloader: DataLoader, model: nn.Module, loss_fn: nn.CrossEntropyLoss) -> tuple:
    """Validation loop

    Runs the model over every batch of ``val_dataloader`` without gradient
    tracking and prints the resulting accuracy and loss.

    Args:
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.CrossEntropyLoss)

    Returns:
        tuple: ``(val_loss, accuracy)`` where ``val_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the fraction of correctly
        classified samples. Both are ``nan`` when the loader yields no data.
    """
    size = len(val_dataloader.dataset)
    num_batches = len(val_dataloader)
    val_loss = 0.
    correct = 0.

    # Evaluate in eval mode (matters for layers such as dropout / batch norm)
    # and put the model back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for feature_tensors, label_tensors in val_dataloader:
                outputs = model(feature_tensors)
                val_loss += loss_fn(outputs, label_tensors).item()
                correct += (outputs.argmax(1) == label_tensors).type(torch.float).sum().item()
    finally:
        model.train(was_training)

    # An empty loader has nothing to average over; report nan instead of
    # failing with a ZeroDivisionError.
    val_loss = val_loss / num_batches if num_batches else float('nan')
    correct = correct / size if size else float('nan')
    print(f"Validation Error:\n    Accuracy: {(100*correct):>.2f}%\n    Loss: {val_loss:>.8f}\n")
    return val_loss, correct


def epoch(train_dataloader: DataLoader, val_dataloader: DataLoader, 
            model: nn.Module, loss_fn: nn.CrossEntropyLoss, optimizer: torch.optim.Adam,
            val_every_x_step: int) -> tuple:
    """Train one epoch

    Performs one optimisation step per batch of ``train_dataloader`` and runs
    ``validation`` on step 0 and then every ``val_every_x_step`` steps.

    Args:
        train_dataloader (DataLoader)
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.CrossEntropyLoss)
        optimizer (torch.optim.Adam)
        val_every_x_step (int)

    Returns:
        tuple: ``(train_losses, train_steps, val_losses, val_accuracies,
        val_steps)``. The train lists have one entry per batch; the val lists
        have one entry per validation run. Steps are counted from 0 within
        this epoch.
    """
    epoch_train_losses = []
    epoch_train_steps = []
    epoch_val_losses = []
    epoch_val_accuracies = []
    epoch_val_steps = []

    size = len(train_dataloader.dataset)

    # Make sure training-only behaviour is active even if the caller (or an
    # earlier evaluation) left the model in eval mode.
    model.train()

    for step, (feature_tensors, label_tensors) in enumerate(train_dataloader):

        # forward pass
        outputs = model(feature_tensors)
        loss = loss_fn(outputs, label_tensors)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()

        epoch_train_losses.append(batch_loss)
        epoch_train_steps.append(step)

        if step % 100 == 0:
            # Samples consumed so far, estimated from the current batch size.
            current = step * len(feature_tensors)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

        # step 0 always satisfies the modulo test, so the first batch is
        # always followed by a validation run.
        if step % val_every_x_step == 0:
            val_loss, correct = validation(val_dataloader, model, loss_fn)
            # Resume training mode regardless of how validation left the model.
            model.train()

            epoch_val_losses.append(val_loss)
            epoch_val_accuracies.append(correct)
            epoch_val_steps.append(step)

    return epoch_train_losses, epoch_train_steps, epoch_val_losses, epoch_val_accuracies, epoch_val_steps


def train(num_epochs: int, train_dataloader: DataLoader, val_dataloader: DataLoader, 
            model: nn.Module, loss_fn: nn.Module, optimizer: torch.optim.Optimizer,
            val_every_x_step: int) -> tuple:
    """Train loop

    Calls ``epoch`` ``num_epochs`` times and concatenates the per-epoch
    histories, shifting the step counters so they keep increasing across
    epochs.

    Args:
        num_epochs (int)
        train_dataloader (DataLoader)
        val_dataloader (DataLoader)
        model (nn.Module)
        loss_fn (nn.Module): loss forwarded unchanged to ``epoch``
        optimizer (torch.optim.Optimizer)
        val_every_x_step (int)

    Returns:
        tuple: ``(train_losses, train_steps, val_losses, val_accuracies,
        val_steps)`` accumulated over all epochs.
    """
    train_losses = []
    train_steps = []
    val_losses = []
    val_accuracies = []
    val_steps = []
    # Number of training steps completed in earlier epochs.
    step_offset = 0

    for t in range(num_epochs):
        print(f"Epoch {t+1}\n-------------------------------")

        epoch_train_losses, epoch_train_steps, \
            epoch_val_losses, epoch_val_accuracies, epoch_val_steps = \
                epoch(train_dataloader, val_dataloader, model, loss_fn, optimizer,
                        val_every_x_step)

        train_losses += epoch_train_losses
        train_steps += [step + step_offset for step in epoch_train_steps]

        val_losses += epoch_val_losses
        val_accuracies += epoch_val_accuracies
        val_steps += [step + step_offset for step in epoch_val_steps]

        # Advance by the number of steps just run. Using the last recorded step
        # index here would make the next epoch's step 0 collide with it, and
        # would raise IndexError for an epoch that ran no steps.
        step_offset += len(epoch_train_steps)

    return train_losses, train_steps, val_losses, val_accuracies, val_steps


In [26]:

# FNN hyperparameters
N_NODES = 100

# training hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
N_EPOCHS = 100
VAL_EVERY_X_STEP = 1000  # run validation every this many training steps

# Output location. The directory is created before training starts so that a
# missing folder cannot make the save fail after the whole run has finished.
STORAGE_DIR = Path('storage')
STORAGE_DIR.mkdir(parents=True, exist_ok=True)
RUN_NAME = f'FNN_{N_NODES}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_ep{N_EPOCHS}'

# DataLoader wraps an iterable around the Dataset
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Validation is not shuffled: the reported loss is a mean of per-batch means,
# so a fixed batch composition keeps it comparable between validation runs.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
# test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)


model = FNNClassificationModel(N_NODES).to('cpu')


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


# train model
train_losses, train_steps, val_losses, val_accuracies, val_steps = \
    train(N_EPOCHS, train_dataloader, val_dataloader, model, loss_fn, optimizer, VAL_EVERY_X_STEP)
val_loss = val_losses[-1]  # loss of the last validation run, used in the stats file name


# save model weights
torch.save(model.state_dict(), STORAGE_DIR / f'{RUN_NAME}.pth')


# save stats
# The train and val series have different lengths; wrapping them in pd.Series
# lets the frame pad the shorter ones with NaN.
stats = pd.DataFrame(
    {   
        'train_steps': pd.Series(train_steps),
        'train_losses': pd.Series(train_losses),
        'val_steps': pd.Series(val_steps),
        'val_accuracies': pd.Series(val_accuracies),
        'val_losses': pd.Series(val_losses)
    }
)
stats.to_pickle(STORAGE_DIR / f'{RUN_NAME}_tl{val_loss:.8f}.pkl')


Epoch 1
-------------------------------
loss: 1.384612  [    0/140000]
Validation Error:
    Accuracy: 39.22%
    Loss: 1.77426156

loss: 1.107848  [ 3200/140000]
loss: 0.878663  [ 6400/140000]
loss: 0.914710  [ 9600/140000]
loss: 0.957555  [12800/140000]
loss: 0.799328  [16000/140000]
loss: 0.844644  [19200/140000]
loss: 0.803764  [22400/140000]
loss: 0.722627  [25600/140000]
loss: 0.834165  [28800/140000]
loss: 0.815363  [32000/140000]
Validation Error:
    Accuracy: 60.68%
    Loss: 1.87020846

loss: 0.957026  [35200/140000]
loss: 0.782009  [38400/140000]
loss: 0.955911  [41600/140000]
loss: 0.890720  [44800/140000]
loss: 58.250828  [48000/140000]
loss: 0.845628  [51200/140000]
loss: 0.861255  [54400/140000]
loss: 0.637175  [57600/140000]
loss: 0.885761  [60800/140000]
loss: 0.940790  [64000/140000]
Validation Error:
    Accuracy: 61.38%
    Loss: 1.12924336

loss: 0.746684  [67200/140000]
loss: 1.041898  [70400/140000]
loss: 0.882499  [73600/140000]
loss: 0.932042  [76800/140000]
l

## Results

In [None]:
# Rebuild the network with the hidden width used for training (the constructor
# requires it), then load the weights written by the training cell.
model = FNNClassificationModel(N_NODES)
model.load_state_dict(torch.load(f'storage/FNN_{N_NODES}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_ep{N_EPOCHS}.pth'))
model.eval()  # switch to inference mode; as the last expression this also displays the model