# CNN Training and Evaluation
This notebook contains the code for importing and preparing the data, and for training and testing our CNN sentiment analysis model.

# Importing relevant libraries

In [None]:
%pip install autocorrect
%pip install madgrad

In [None]:
import pandas as pd
import gzip
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets
import madgrad

# Mounting Google Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive; the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Data processing code

In [None]:
def cleanup(df, binary):
    '''
    Takes the preprocessed data and puts it into the format used by the CNN model.

    Only the 'title_plus_review' and 'overall' columns are used. Ratings that are
    not floats are treated as missing, and every row with a missing rating or a
    missing review is dropped.

    @df preprocessed DataFrame with 'title_plus_review' and 'overall' columns
    @binary if True, transforms the rating into a simple 1/0 positive/negative
            label (1 when the rating is above 3, 0 otherwise)

    Returns a new DataFrame with the columns 'reviewText' and 'overall'.
    @df itself is not modified.
    '''
    ratings = df['overall'].apply(lambda x: x if isinstance(x, float) else None)
    # assign() builds a new frame, so nothing is written into a slice of @df
    cleaned = df[['title_plus_review']].assign(overall = ratings)

    # Invalid rows must be dropped BEFORE binarising: `NaN > 3` is False, so a
    # missing rating would otherwise silently become a negative (0) example.
    cleaned = cleaned.dropna()

    if binary:
        cleaned = cleaned.assign(overall = cleaned['overall'].gt(3).astype(int))
        cleaned = cleaned[['overall', 'title_plus_review']]

    return cleaned.rename(columns = {'title_plus_review': 'reviewText'})

In [None]:
'''
Tokenizes the data and removes stopwords
'''
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords 
# Download the NLTK 'stopwords' corpus that is read on the lines below
nltk.download('stopwords')

# English stop words; tokenize() filters tokens against this set
stop_words = set(stopwords.words('english')) 
# A token is a run of ASCII letters and/or digits; all other characters act as separators
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

def tokenize(sentence):
    '''
    Splits @sentence into alphanumeric tokens and removes English stop words.

    The stop-word check is case-insensitive: the stop-word list is lower-case,
    so comparing the raw token would let capitalised stop words such as "The"
    or "I" slip through. The returned tokens keep their original casing.

    @sentence the text to tokenize
    Returns the list of remaining tokens, in their original order.
    '''
    return [word for word in token.tokenize(sentence) if word.lower() not in stop_words]

In [None]:
def _write_json_lines(df, path):
    '''Writes @df to @path in JSON-lines form: one JSON object per row, one row per line.'''
    # Round-tripping through DataFrame.to_json turns the rows into plain
    # Python values that json.dump can serialise.
    records = json.loads(df.to_json(orient = 'records'))
    with open(path, 'w') as f:
        for entry in records:
            json.dump(entry, f)
            f.write('\n')


def output_to_json(train_df, test_df, out_dir = 'sample_data'):
    '''
    Outputs the processed dataframes in JSON-lines format, which will ultimately
    be read by the Torch Dataset.

    @train_df dataframe written to <out_dir>/train.json
    @test_df dataframe written to <out_dir>/test.json
    @out_dir target directory; it is created if it does not exist yet. The
             default matches the path that get_data_tokens_score reads from.
    '''
    os.makedirs(out_dir, exist_ok = True)
    _write_json_lines(train_df, os.path.join(out_dir, 'train.json'))
    _write_json_lines(test_df, os.path.join(out_dir, 'test.json'))

In [None]:
def get_data_tokens_score(binary):
    '''
    Constructs the torch datasets and imports pre-trained word embeddings.

    Reads 'train.json' and 'test.json' from the 'sample_data' directory, mapping
    the JSON key 'tokenized' to the example attribute 'tokens' and the key
    'overall' to the attribute 'score'. Both vocabularies are built from the
    training split only.

    @binary selects the label dtype: torch.float when True, torch.long otherwise

    Returns (train dataset, test dataset, token field, label field).
    '''
    label_dtype = torch.float if binary else torch.long

    text_field = data.Field(lower = True, batch_first = True)
    # NOTE(review): LabelField builds its own vocab below, so the numeric class
    # index may not equal the raw label value - confirm before interpreting
    # predictions as ratings.
    label_field = data.LabelField(dtype = label_dtype)

    train_split, test_split = data.TabularDataset.splits(
        path = 'sample_data',
        train = 'train.json',
        test = 'test.json',
        format = 'json',
        fields = {
            'tokenized': ('tokens', text_field),
            'overall': ('score', label_field),
        },
    )

    text_field.build_vocab(
        train_split,
        max_size = 10000,
        vectors = "glove.6B.100d",
        unk_init = torch.Tensor.normal_,
    )
    label_field.build_vocab(train_split)

    return train_split, test_split, text_field, label_field

In [None]:
'''
Builds Torch iterators for the loaded data
'''

import torch

def get_iters(train_data, test_data, batch_size = 64):
    '''
    Builds Torch iterators for the loaded data.

    Batches are placed on the GPU when one is available, otherwise on the CPU.

    @train_data dataset used for training
    @test_data dataset used for evaluation
    @batch_size number of examples per batch (defaults to 64)

    Returns (train iterator, test iterator).
    '''
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def build(dataset):
        # The sort key is the review LENGTH: bucketing exists to group examples
        # of similar length so that padding stays small. Keying on the token
        # list itself would order reviews lexicographically instead.
        return data.BucketIterator(dataset, sort_key = lambda x: len(x.tokens),
                                   sort = False, sort_within_batch = True,
                                   batch_size = batch_size, device = device)

    return build(train_data), build(test_data)

In [None]:
def get_iters_tokens(dataset, binary):
    '''
    Combines all of the data-processing steps above into a single call and
    provides the inputs that will be fed to the CNN.

    @dataset 'electronics' loads the electronics CSVs; any other value loads
             the all-categories CSVs
    @binary passed on to cleanup and get_data_tokens_score

    Returns (train iterator, test iterator, token field).
    '''
    path = '/content/drive/Shareddrives/519 Project/Data/preprocessed/Final Data/'

    if dataset == 'electronics':
        train_file, test_file = 'electronics_train.csv', 'electronics_test.csv'
    else:
        train_file, test_file = 'all_train.csv', 'all_test.csv'

    def load_split(file_name):
        # read -> clean -> tokenize, keeping only what the Torch dataset needs
        frame = cleanup(pd.read_csv(os.path.join(path, file_name)), binary)
        frame['tokenized'] = frame['reviewText'].apply(tokenize)
        return frame[['tokenized', 'overall']]

    train_df = load_split(train_file)
    test_df = load_split(test_file)

    output_to_json(train_df, test_df)

    train_data, test_data, TOKENS, _ = get_data_tokens_score(binary)
    train_iterator, test_iterator = get_iters(train_data, test_data)

    return train_iterator, test_iterator, TOKENS

## CNNs

In [None]:
'''
Outlines the structure for our CNN model

model architecture from: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
used as a starting point
'''
import torch.nn as nn
import torch.nn.functional as F

class CNN_Text(nn.Module):
    '''
    Convolutional text classifier: embedding -> parallel convolutions (one per
    filter size) -> max-over-time pooling -> dropout -> linear layer.

    @vocab_size number of rows in the embedding table
    @vector_size embedding dimension
    @n_filters number of output channels of every convolution
    @filter_sizes window heights (in tokens), one convolution per entry
    @output_dim size of the final linear output
    @dropout dropout probability applied to the pooled features
    @pad_idx index of the padding token
    '''
    def __init__(self, vocab_size,
                 vector_size, n_filters,
                 filter_sizes, output_dim,
                 dropout, pad_idx):

        super().__init__()
        # Kept so forward() can extend batches that are too short for the
        # widest filter (plain attributes: not part of the state dict).
        self.pad_idx = 0 if pad_idx is None else pad_idx
        self.min_len = max(filter_sizes)

        self.embedding = nn.Embedding(vocab_size, vector_size,
                                      padding_idx = pad_idx)

        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1,
                                              out_channels = n_filters,
                                              kernel_size = (fs, vector_size))
                                    for fs in filter_sizes])

        self.linear = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        '''
        @text batch of token indices, one row per example (batch first)
        Returns the output of the linear layer, one row per example.
        '''
        # A convolution cannot run on an input shorter than its kernel, so a
        # batch of very short reviews would raise. Right-pad such batches with
        # the padding index up to the largest filter size.
        missing = self.min_len - text.shape[1]
        if missing > 0:
            filler = text.new_full((text.shape[0], missing), self.pad_idx)
            text = torch.cat([text, filler], dim = 1)

        embedded = self.embedding(text).unsqueeze(1)
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
        return self.linear(cat)

In [None]:
def get_cnn_model(num_classes, TOKENS):
    '''
    Returns a new base CNN model whose embedding layer is initialised from the
    vectors stored in the vocabulary of @TOKENS.

    @num_classes how many classes (used as the model's output dimension)
    @TOKENS contains the pre-trained embeddings

    The embedding rows of the unknown token and the padding token are zeroed.
    '''
    vocab = TOKENS.vocab
    embedding_dim = 100
    padding_index = vocab.stoi[TOKENS.pad_token]

    model = CNN_Text(len(vocab), embedding_dim, 100, [1, 2, 3, 4],
                     num_classes, .3, padding_index)

    # Load the pre-trained vectors into the embedding table
    weights = model.embedding.weight.data
    weights.copy_(vocab.vectors)

    unknown_index = vocab.stoi[TOKENS.unk_token]
    for special_index in (unknown_index, padding_index):
        weights[special_index] = torch.zeros(embedding_dim)

    return model

The functions below calculate the relevant evaluation metrics.

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

def get_f1_score(preds, y):
    '''
    Macro-averaged F1 score for binary predictions.

    @preds raw model outputs; they are passed through a sigmoid and rounded
    @y true labels
    '''
    hard_preds = torch.sigmoid(preds).round()
    y_true = y.cpu().detach().numpy()
    y_pred = hard_preds.cpu().detach().numpy()
    return f1_score(y_true, y_pred, average = 'macro')

In [None]:
def get_precision_recall(preds, y):
    '''
    Macro-averaged precision and recall for binary predictions.

    @preds raw model outputs; they are passed through a sigmoid and rounded
    @y true labels
    Returns (precision, recall).
    '''
    y_true = y.cpu().detach().numpy()
    y_pred = torch.sigmoid(preds).round().cpu().detach().numpy()
    return (precision_score(y_true, y_pred, average = 'macro'),
            recall_score(y_true, y_pred, average = 'macro'))

In [None]:
def binary_accuracy(preds, y):
    '''
    Fraction of binary predictions that match the labels.

    @preds raw model outputs; they are passed through a sigmoid and rounded
    @y true labels
    Returns the accuracy as a tensor.
    '''
    hard_preds = torch.sigmoid(preds).round()
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

In [None]:
def five_f1_score(preds, y):
    '''
    Macro-averaged F1 score for multi-class predictions.

    @preds per-class scores; the predicted class is the argmax along dim 1
    @y true class indices
    '''
    predicted = torch.max(preds, 1)[1]
    y_true = y.cpu().detach().numpy()
    y_pred = predicted.cpu().detach().numpy()
    return f1_score(y_true, y_pred, average = 'macro')

In [None]:
def five_precision_recall(preds, y):
    '''
    Macro-averaged precision and recall for multi-class predictions.

    @preds per-class scores; the predicted class is the argmax along dim 1
    @y true class indices
    Returns (precision, recall).
    '''
    y_true = y.cpu().detach().numpy()
    y_pred = torch.max(preds, 1)[1].cpu().detach().numpy()
    return (precision_score(y_true, y_pred, average = 'macro'),
            recall_score(y_true, y_pred, average = 'macro'))

In [None]:
def five_accuracy(preds, y):
    '''
    Fraction of multi-class predictions that match the labels.

    @preds per-class scores; the predicted class is the argmax along dim 1
    @y true class indices
    Returns the accuracy as a Python float.
    '''
    predicted = torch.max(preds, 1)[1]
    n_hits = (predicted == y).sum().item()
    return n_hits / len(y)

In [None]:
def train_cnn(model, iterator, optimizer, criterion, epochs=10, print_intermediate = False, five_class = False):
    """
    Basic train loop for cnn

    Every layer that defines reset_parameters() is re-initialised before
    training, EXCEPT embedding layers: resetting an nn.Embedding re-draws its
    whole weight matrix, which would throw away the pre-trained vectors loaded
    by get_cnn_model. Nested layers (the convolutions inside the ModuleList)
    are reached through model.modules().

    @model the CNN to train (moved to the GPU when one is available)
    @iterator iterator over training batches exposing .tokens and .score
    @optimizer optimizer already bound to the model's parameters
    @criterion loss function
    @epochs number of passes over @iterator
    @print_intermediate if True, prints progress roughly every 10% of an epoch
    @five_class if True uses the multi-class metrics, otherwise the binary ones

    Per-batch metrics are averaged over the batches of each epoch.

    Returns final accuracy, loss, f1 score, precision, and recall
    (i.e. the averages of the last epoch).
    """
    # Resolved locally, with the same rule get_iters uses for the batches, so
    # this function does not depend on a global defined in a later cell.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for module in model.modules():
        if isinstance(module, nn.Embedding):
            continue
        if hasattr(module, 'reset_parameters'):
            module.reset_parameters()

    model = model.to(device)
    model.train()

    num_batches = len(iterator)
    history = []  # one (acc, loss, f1, precision, recall) tuple per epoch
    print('Starting Training\n')
    for epoch in range(epochs):
        epoch_acc = 0
        epoch_loss = 0
        epoch_f1 = 0
        epoch_precision = 0
        epoch_recall = 0
        seen_since_last_print = 0
        for i, batch in enumerate(iterator, start = 1):
            seen_since_last_print += 1

            inputs = batch.tokens
            labels = batch.score
            optimizer.zero_grad()
            # squeeze(1) drops the class dimension when the model has a single output
            outputs = model(inputs).squeeze(1)

            loss = criterion(outputs, labels)
            if five_class:
                epoch_f1 += five_f1_score(outputs, labels)
                epoch_acc += five_accuracy(outputs, labels)
                precision, recall = five_precision_recall(outputs, labels)
            else:
                epoch_f1 += get_f1_score(outputs, labels)
                epoch_acc += binary_accuracy(outputs, labels).item()
                precision, recall = get_precision_recall(outputs, labels)
            epoch_precision += precision
            epoch_recall += recall

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            if print_intermediate:
                if (seen_since_last_print / num_batches) > .1:
                    percent = 100 * i / num_batches
                    print('Epoch %d is %d%% done' % (epoch + 1, percent))
                    seen_since_last_print = 0

        epoch_acc = epoch_acc / num_batches
        epoch_loss = epoch_loss / num_batches
        epoch_f1 = epoch_f1 / num_batches
        epoch_recall = epoch_recall / num_batches
        epoch_precision = epoch_precision / num_batches

        history.append((epoch_acc, epoch_loss, epoch_f1, epoch_precision, epoch_recall))
        print('\nEpoch Num: %d, Accuracy: %.4f, Loss: %.4f, F1: %.4f, Precision: %.4f, Recall: %.4f\n' % (epoch + 1, epoch_acc, epoch_loss, epoch_f1, epoch_precision, epoch_recall))

    (final_training_accuracy, final_training_loss, final_training_f1,
     final_training_precision, final_training_recall) = history[-1]
    print('Done training\n')
    return final_training_accuracy, final_training_loss, final_training_f1, final_training_precision, final_training_recall

In [None]:
def test_cnn_model(model, iterator, criterion, five_class = False):
    '''
    Test loop for CNN that outputs the same metrics as the train loop.

    Runs the model in eval mode with gradients disabled and averages the
    per-batch metrics over all batches of @iterator.

    @model the trained CNN (moved to the GPU when one is available)
    @iterator iterator over evaluation batches exposing .tokens and .score
    @criterion loss function
    @five_class if True uses the multi-class metrics, otherwise the binary ones

    Returns accuracy, loss, f1 score, precision, and recall as plain numbers.
    '''
    # Resolved locally, with the same rule get_iters uses for the batches, so
    # this function does not depend on a global defined in a later cell.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    model.to(device)
    test_loss = 0
    test_acc = 0
    test_f1 = 0
    test_precision = 0
    test_recall = 0

    with torch.no_grad():
        for batch in iterator:
            outputs = model(batch.tokens).squeeze(1)
            loss = criterion(outputs, batch.score)
            test_loss += loss.item()
            if five_class:
                test_acc += five_accuracy(outputs, batch.score)
                test_f1 += five_f1_score(outputs, batch.score)
                precision, recall = five_precision_recall(outputs, batch.score)
            else:
                # .item() keeps the running total a Python float, as in train_cnn,
                # instead of accumulating (and returning) a device tensor
                test_acc += binary_accuracy(outputs, batch.score).item()
                test_f1 += get_f1_score(outputs, batch.score)
                precision, recall = get_precision_recall(outputs, batch.score)
            test_precision += precision
            test_recall += recall

    num_batches = len(iterator)
    testing_accuracy = test_acc / num_batches
    testing_loss = test_loss / num_batches
    testing_f1 = test_f1 / num_batches
    testing_precision = test_precision / num_batches
    testing_recall = test_recall / num_batches
    return testing_accuracy, testing_loss, testing_f1, testing_precision, testing_recall

# Testing / Training

In [None]:
def single_train_test_loop(summary_df, dataset_name, num_classes, optim_name, train_iter, test_iter, tokens, lr = .001):
    '''
    Executes a single train/test loop based on the parameters.

    @summary_df results table; a row describing this run is appended to a copy
    @dataset_name label stored in the 'dataset' column of the summary row
    @num_classes 1 trains a binary model (BCE-with-logits loss), any other
                 value trains a multi-class model (cross-entropy loss)
    @optim_name 'madgrad' selects MADGRAD, anything else selects Adam
    @train_iter iterator over the training batches
    @test_iter iterator over the held-out batches
    @tokens token field holding the vocabulary and pre-trained vectors
    @lr learning rate for the optimizer (defaults to .001)

    Returns (trained model, summary dataframe including this run).
    '''
    model = get_cnn_model(num_classes = num_classes, TOKENS = tokens)
    if optim_name == 'madgrad':
        optimizer = madgrad.MADGRAD(model.parameters(), lr = lr)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr = lr)

    binary = (num_classes == 1)
    criterion = nn.BCEWithLogitsLoss() if binary else nn.CrossEntropyLoss()
    # The metric family must follow the loss: everything that is not the
    # single-logit binary model is scored with the multi-class metrics.
    five_classes = not binary

    train_acc, train_loss, train_f1, train_precision, train_recall = train_cnn(model, train_iter, optimizer, criterion, print_intermediate=False, five_class=five_classes)
    # Evaluate on the held-out iterator; scoring train_iter here would just
    # report training performance under the "Test Statistics" heading.
    test_acc, test_loss, test_f1, test_precision, test_recall = test_cnn_model(model, test_iter, criterion, five_class = five_classes)

    print('-----------------------------------')
    print('Test Statistics: ')
    print(test_acc, test_loss, test_f1, test_precision, test_recall)
    print('-----------------------------------')

    if binary:
        prediction_type = 'binary'
    elif num_classes == 5:
        prediction_type = 'five class'
    else:
        prediction_type = '%d class' % num_classes

    # float() also unwraps 0-dim tensors, so the table only holds plain numbers
    new_row = {'dataset': dataset_name, 'prediction_type': prediction_type, 'optimizer': optim_name,
               'lr': lr,
               'train_acc': float(train_acc), 'train_loss': float(train_loss), 'train_f1': float(train_f1),
               'train_precision': float(train_precision), 'train_recall': float(train_recall),
               'test_acc': float(test_acc), 'test_loss': float(test_loss), 'test_f1': float(test_f1),
               'test_precision': float(test_precision), 'test_recall': float(test_recall)}
    row_df = pd.DataFrame([new_row])
    if summary_df is None or summary_df.empty:
        summary_df = row_df
    else:
        summary_df = pd.concat([summary_df, row_df], ignore_index = True)
    return model, summary_df

In [None]:
"""
Comparing different algorithms

Trains and evaluates one model per (dataset, prediction type, optimizer)
combination; each call returns the trained model and the summary table.
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One column per value reported for a run. Note the comma after 'test_f1':
# without it Python concatenates the two adjacent string literals into a
# single bogus column name 'test_f1test_precision'.
cols = ['dataset', 'prediction_type', 'optimizer', 'lr', 'train_acc', 'train_loss',
        'train_f1', 'train_precision', 'train_recall', 'test_acc', 'test_loss', 'test_f1',
        'test_precision', 'test_recall']

summary_df = pd.DataFrame(columns = cols)

#compare binary electronics vs. binary allcats
bin_elec_train_iter, bin_elec_test_iter, bin_elec_tokens = get_iters_tokens(dataset = 'electronics', binary = True)
bin_elec_model, summary_df = single_train_test_loop(summary_df, 'electronics', 1, 'madgrad',
                                        bin_elec_train_iter, bin_elec_test_iter,
                                        bin_elec_tokens)

print(summary_df)

bin_all_train_iter, bin_all_test_iter, bin_all_tokens = get_iters_tokens(dataset = 'all', binary = True)
bin_all_madgrad_model, summary_df = single_train_test_loop(summary_df, 'allcats', 1, 'madgrad',
                                        bin_all_train_iter, bin_all_test_iter,
                                        bin_all_tokens)


#compare binary allcats vs. five allcats
five_all_train, five_all_test, five_all_tokens = get_iters_tokens(dataset = 'all', binary = False)
five_all_madgrad_model, summary_df = single_train_test_loop(summary_df, 'allcats', 5, 'madgrad',
                                        five_all_train, five_all_test,
                                        five_all_tokens)

#compare adam to madgrad
bin_all_adam_model, summary_df = single_train_test_loop(summary_df, 'allcats', 1, 'adam',
                                        bin_all_train_iter, bin_all_test_iter,
                                        bin_all_tokens)


five_all_adam_model, summary_df = single_train_test_loop(summary_df, 'allcats', 5, 'adam',
                                        five_all_train, five_all_test,
                                        five_all_tokens)

#summary_df.to_csv('/content/drive/Shareddrives/519 Project/Data/Final Results/CNN_comparison_results.csv')