## Second model

The final soft max layer was removed after reading the article  https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab?gi=5bca0efc08b3

This increased the accuracy of the model from 25% to 65%

Like the article the outputs were converted to log soft max during training and validation.

However, after reading the documentation
https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html, it appears that the log soft max conversion has already been taken into consideration in Cross Entropy loss function and so the log softmax conversion was removed for the final model.

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from joblib import dump
import category_encoders as ce

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
df = pd.read_csv("../data/raw/beer_reviews.csv")

In [5]:
df_clean = df.copy()
df_clean = df_clean.dropna(subset=['brewery_name'])

In [6]:
df_clean.reset_index(drop=True, inplace=True)

In [7]:
df_clean["beer_abv"]=df_clean.groupby("beer_style")["beer_abv"].transform(lambda x: x.fillna(x.mean()))

In [8]:
target = df_clean['beer_style']

In [9]:
le = LabelEncoder()

In [10]:
fitted_target = le.fit_transform(target)

In [11]:
brewery_id_new = pd.DataFrame(df_clean['brewery_id'])

brewery_id_new['brewery_id_count']=brewery_id_new.groupby('brewery_id')['brewery_id'].transform('count')
brewery_id_new['id_new'] = brewery_id_new['brewery_id_count'].transform(lambda x: x if x > 100 else 0)
brewery_id_new['id_new'] = brewery_id_new.loc[brewery_id_new['id_new'] > 100, 'brewery_id'].fillna(0)


brewery_id_new.fillna(0, inplace=True)


In [12]:
ce_target = ce.TargetEncoder(cols = ['id_new'], min_samples_leaf=270, smoothing = 0.5) #was 100,250 before, smoothing was 5

In [13]:
X=pd.DataFrame(brewery_id_new['id_new'])

In [14]:
Y=pd.DataFrame(fitted_target)

In [15]:
ce_target.fit(X,Y)

TargetEncoder(cols=['id_new'], drop_invariant=False, handle_missing='value',
              handle_unknown='value', min_samples_leaf=270, return_df=True,
              smoothing=0.5, verbose=0)

In [16]:
encoded_brewery_id=ce_target.transform(X,Y)

In [17]:
df_clean['encoded_brewery_id'] = encoded_brewery_id

In [18]:
num_cols=['review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'beer_abv', 'encoded_brewery_id']

In [19]:
#Create matrix of X variables
X_analysis = df_clean[num_cols]

In [20]:
#All the features are now numerical. Scale the features.
sc = StandardScaler()
X_analysis=  sc.fit_transform(X_analysis)

In [43]:
#Split model into train, validation, and test dataset
X_train, X_test, y_train, y_test = train_test_split(X_analysis, fitted_target, test_size=0.2, random_state=8)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=8) 

In [44]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [45]:
class PytorchMultiClass(nn.Module):
    def __init__(self, num_features):
        super(PytorchMultiClass, self).__init__()
        self.layer_1 = nn.Linear(num_features, 512)
        self.layer_2 = nn.Linear(512, 256)
        self.layer_3 = nn.Linear(256, 128)
        self.layer_out = nn.Linear(128, 104)
        #self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.tanh(self.layer_1(x))
        x = F.tanh(self.layer_2(x))
        x = F.tanh(self.layer_3(x))
        x = self.layer_out(x)
        return x #self.softmax(x)


In [46]:
model = PytorchMultiClass(X_train.shape[1])

In [47]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=512, bias=True)
  (layer_2): Linear(in_features=512, out_features=256, bias=True)
  (layer_3): Linear(in_features=256, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
)

In [48]:
criterion = nn.CrossEntropyLoss()

In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [50]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = torch.log_softmax(model(feature),dim=1)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class.long())

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

In [51]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = torch.log_softmax(model(feature), dim=1)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class.long())

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

In [69]:
N_EPOCHS = 10
BATCH_SIZE = 1000

In [70]:
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0013	|	Acc: 64.8%
	(valid)	|	Loss: 0.0013	|	Acc: 64.4%
Epoch: 1
	(train)	|	Loss: 0.0013	|	Acc: 64.9%
	(valid)	|	Loss: 0.0013	|	Acc: 64.3%
Epoch: 2
	(train)	|	Loss: 0.0013	|	Acc: 64.9%
	(valid)	|	Loss: 0.0013	|	Acc: 64.4%
Epoch: 3
	(train)	|	Loss: 0.0013	|	Acc: 64.9%
	(valid)	|	Loss: 0.0013	|	Acc: 64.6%
Epoch: 4
	(train)	|	Loss: 0.0013	|	Acc: 65.0%
	(valid)	|	Loss: 0.0013	|	Acc: 64.6%
Epoch: 5
	(train)	|	Loss: 0.0013	|	Acc: 65.0%
	(valid)	|	Loss: 0.0013	|	Acc: 64.6%
Epoch: 6
	(train)	|	Loss: 0.0013	|	Acc: 65.0%
	(valid)	|	Loss: 0.0013	|	Acc: 64.6%
Epoch: 7
	(train)	|	Loss: 0.0013	|	Acc: 65.0%
	(valid)	|	Loss: 0.0013	|	Acc: 64.4%
Epoch: 8
	(train)	|	Loss: 0.0013	|	Acc: 65.1%
	(valid)	|	Loss: 0.0013	|	Acc: 64.5%
Epoch: 9
	(train)	|	Loss: 0.0013	|	Acc: 65.1%
	(valid)	|	Loss: 0.0013	|	Acc: 64.7%


In [71]:
torch.save(model.state_dict(), "../models/beeroracle_final_nosoftmax.pt")

In [72]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.2f}')

	Loss: 0.0013	|	Accuracy: 0.65
