In [4]:
DATA_DOWNLOADED = True
DFS_PICKLED = True

GET_SIM = False # get domain similarity

# set up testing (as in not running with the full dataset)
TEST = True # to test set up
# set size for the sets
TRAIN_SIZE = 3000
DEV_SIZE = 500
TEST_SIZE = 1000

# train settings
MODEL_TRAINED = True
EPOCHS = 4
SAVE_MODEL_AS = 'baseline_3000_music'

# BERT_MODEL = 'distilbert-base-uncased'

In [5]:
import torch

# check if cuda is available
if torch.cuda.is_available():       
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


# Sentiment analysis
Using sample selection based on similarity to train a neural net to predict sentiment.

## Data
The data used for this project come from Amazon reviews ([source](https://nijianmo.github.io/amazon/index.html))

In [6]:
if not DATA_DOWNLOADED:
    !wget -P ../data http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz
    !wget -P ../data http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
    !wget -P ../data http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz

### Data pre processing
- read json.gz file
- Remove 3 star reviews
- map sentiment to star reviews (1, 2 = negative and 4,5 = positive)
- concatenate review title and review text
- select only relevant columns (concat review and sentiment)
- remove duplicates
- pickle dataframe 

In [7]:
import utils

if not DFS_PICKLED:
    utils.pre_process('../data/Digital_Music_5.json.gz', 'music')
    utils.pre_process('../data/Video_Games_5.json.gz', 'games')
    utils.pre_process('../data/Arts_Crafts_and_Sewing_5.json.gz', 'art')

### Load dataframes from pickle

In [8]:
import pandas as pd

df_music = pd.read_pickle('../data/pickled_dfs/df_music.pkl')  
df_games = pd.read_pickle('../data/pickled_dfs/df_games.pkl')  
df_art = pd.read_pickle('../data/pickled_dfs/df_art.pkl')  

### Compare domains
Concat all reviews into a big string, and compare the strings to find the cosine similarity.

tfidf score of a word w is `tf(w)*idf(w)`  
Where, tf(w) = Number of times the word appears in a document/Total number of words in the document
and idf(w) = Number of documents/Number of documents that contains word w ([source](https://kanoki.org/2018/12/27/text-matching-cosine-similarity/))

In [9]:
def make_big_string(df, n):
    '''
    input: dataframe with reviews and number of reviews to compare
    output: all values in the column concatenated
    '''
    # n is the number of reviews we want to compare
    big_string = ' '.join(df.iloc[:n,0].astype(str))
    return big_string.lower()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_df(str1, str2):
    '''
    input: 2 strings
    output: cosine similarity and dataframe with tfidf score for each word
    '''
    corpus = [str1, str2]

    # tokanise -> remove strop words, select only words (ignore punctuation, digits, etc)
    vectorizer = TfidfVectorizer(stop_words='english', token_pattern='[a-z]\w+')
    trsfm = vectorizer.fit_transform(corpus)

    return cosine_similarity(trsfm[0:1], trsfm)[0][1], pd.DataFrame(trsfm.toarray(),columns=vectorizer.get_feature_names_out(),index=['str1','str2'])

In [10]:
len(df_music), len(df_games), len(df_art)

(95621, 364933, 339610)

In [11]:
if GET_SIM:
    cs_music_games, df_music_games = cos_sim_df(make_big_string(df_music, 95621), make_big_string(df_games, 364933))
    cs_music_art, df_music_art = cos_sim_df(make_big_string(df_music, 95621), make_big_string(df_art, 339610))
    print(cs_music_games, cs_music_art)

## Split datasets
Into train, val and test sets.

In [12]:
from sklearn.model_selection import train_test_split

def split_dataset(df, random_state=42):
    X, y = df['rev_sum'], df['sentiment']

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, train_size = 0.8, stratify=y, random_state=42)
    X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, train_size = 0.5, stratify=y_temp, random_state=42)

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [13]:
X_train, y_train, X_dev, y_dev, X_test, y_test = split_dataset(df_music)

In [14]:
len(X_train)

76496

In [15]:
if TEST:
    X_train = X_train[:TRAIN_SIZE]
    y_train = y_train[:TRAIN_SIZE]
    
    X_dev = X_dev[:DEV_SIZE]
    y_dev = y_dev[:DEV_SIZE]

    X_test = X_test[:TEST_SIZE]
    y_test = y_test[:TEST_SIZE]

### Check label distribution

In [16]:
def get_value_count(X, y):
    frame = { 'review': X, 'ground_truth': y }  
    df_train = pd.DataFrame(frame)
    print(df_train.ground_truth.value_counts())

In [17]:
get_value_count(X_train, y_train)
get_value_count(X_dev, y_dev)
get_value_count(X_test, y_test)

1    2906
0      94
Name: ground_truth, dtype: int64
1    483
0     17
Name: ground_truth, dtype: int64
1    968
0     32
Name: ground_truth, dtype: int64


# BERT
Imports and function definitions.

In [18]:
import numpy as np
import torch
import random
import time
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup

# create a function to tokenize a set of texts
def preprocessing_for_bert(data, MAX_LEN=512):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            padding='max_length',           # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,     # Return attention mask
            truncation = True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

# trainer
# create the BertClassifier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a DistilBertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate BERT model
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

def initialize_model(step, epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    if torch.cuda.is_available():       
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = step * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, scheduler, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """
    Train the BertClassifier model.
    """
    loss_fn = nn.CrossEntropyLoss()

    if torch.cuda.is_available():       
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        
    # Start training loop
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()
        
        # For each batch of training data...
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                #print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

def evaluate(model, val_dataloader):
    """
    After the completion of each training epoch, measure the model's performance
    on our validation set.
    """

    # Tell PyTorch to run the model on GPU
    if torch.cuda.is_available():       
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    loss_fn = nn.CrossEntropyLoss()

    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy


## BERT pre-processing
Getting inputs for the model and masks for train and validation sets.

In [19]:
# # use to reload if making changes to the imported script 
# # without needing to restart the kernel
# import importlib
# importlib.reload(bert)

In [20]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pickle

MODEL_DIR = '../artifacts/models/'
FILE_PATH = MODEL_DIR + SAVE_MODEL_AS

if not MODEL_TRAINED:
    train_inputs, train_masks = preprocessing_for_bert(X_train)
    val_inputs, val_masks = preprocessing_for_bert(X_dev)
    print('Pre-processing for BERT completed.')
    
    # Convert to torch.tensor
    train_labels = torch.tensor(y_train.to_numpy())
    val_labels = torch.tensor(y_dev.to_numpy())

    # Set batch size. 2 is about the highest that will run on a laptop for testing. 16 or 32 might work on HPC?
    BATCH_SIZE = 2

    # create the DataLoader for training set
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
    print('DataLoader for training set created.')

    # create the DataLoader for validation set
    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)
    print('DataLoader for validation set created.')

    # set seed for reproducibility
    set_seed(42)

    # initialise model
    bert_classifier, optimizer, scheduler = initialize_model(int(len(train_dataloader)), epochs=2)
    print('Model initialised.')
    # train model
    train(bert_classifier, optimizer, scheduler, train_dataloader, val_dataloader, epochs=EPOCHS, evaluation=True)
    print('Training finalised.')
    # pickle trained model
    pickle.dump(bert_classifier, open('{}.pkl'.format(FILE_PATH), 'wb'))


## Predict

In [21]:
bert_classifier = pickle.load(open('../../code/models/model_base.pkl', 'rb'))

In [22]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in tqdm(test_dataloader):
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [23]:
# just checking
len(X_test)

1000

In [24]:
test_inputs, test_masks = preprocessing_for_bert(X_test)

BATCH_SIZE = 2

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [25]:
# compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader)

100%|██████████| 500/500 [00:41<00:00, 11.94it/s]


In [26]:
df_results = pd.DataFrame(probs, columns = ['prob_neg','pron_pos'])
df_results['review'] = np.array(X_test)
df_results['ground_truth'] = np.array(y_test)

In [27]:
df_results['prediction'] = df_results['prob_neg'] < df_results['pron_pos']
df_results['prediction'] = df_results['prediction'].apply(lambda x: int(x))

In [30]:
df_results.ground_truth.value_counts()

1    968
0     32
Name: ground_truth, dtype: int64

In [31]:
df_results.prediction.value_counts()

1    898
0    102
Name: prediction, dtype: int64

In [32]:
df_results

Unnamed: 0,prob_neg,pron_pos,review,ground_truth,prediction
0,0.003507,0.996493,"Five Stars Good tune, and very cool",1,1
1,0.003507,0.996493,Five Stars It is amazing!!!,1,1
2,0.996729,0.003271,"okay but not great album, yet so much better l...",1,0
3,0.957647,0.042353,"Heard this song before, but... ...oh, Gob. To...",1,0
4,0.003506,0.996494,Vintage Beasties. Heavy guitars and Beastie l...,1,1
...,...,...,...,...,...
995,0.996270,0.003730,"i bought this song i bought, purchased, aquire...",1,0
996,0.003507,0.996493,Saturday In The Park This song is a classic! L...,1,1
997,0.003507,0.996493,Five Stars Great song to dance to.,1,1
998,0.003507,0.996493,"Five Stars Great music,,,",1,1


In [35]:
from sklearn.metrics import classification_report
target_names = ['negative', 'positive']
print(classification_report(df_results.ground_truth, df_results.prediction, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.28      0.91      0.43        32
    positive       1.00      0.92      0.96       968

    accuracy                           0.92      1000
   macro avg       0.64      0.92      0.70      1000
weighted avg       0.97      0.92      0.94      1000

