In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from torch.optim import AdamW

In [19]:
# Load the review dataset from a CSV in the working directory into a DataFrame.
starfield_reviews_df = pd.read_csv('starfield_reviews.csv')

In [20]:
# Drop every row that contains a missing value and renumber the index 0..n-1.
# Written as a chained rebind instead of inplace=True so the cell yields a
# fresh frame and re-running it is harmless.
starfield_reviews_df = starfield_reviews_df.dropna().reset_index(drop=True)

In [21]:
# Show column dtypes and non-null counts for the frame after the dropna step.
starfield_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42524 entries, 0 to 42523
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review_id  42524 non-null  int64 
 1   text       42524 non-null  object
 2   target     42524 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 996.8+ KB


In [22]:
# Preview the first rows of the frame.
starfield_reviews_df.head()

Unnamed: 0,review_id,text,target
0,147134704,Uninstalled until they fix and improve this ga...,0
1,147134681,badass,1
2,147134591,TL:DR Not worth the whole price. It's a basic ...,0
3,147134557,Only a few hours in- LOVE it. Yeah maybe a few...,1
4,147134157,Great overall game. Some systems are flawed or...,1


In [23]:
# Rename 'text' -> 'review' and 'target' -> 'label'; the later cells index the
# frame by these new names. Rebinding (rather than inplace=True) keeps the
# cell a plain expression of input -> output.
starfield_reviews_df = starfield_reviews_df.rename(
    columns={'text': 'review', 'target': 'label'}
)

In [17]:
# Persist the current frame (without the index column) to CSV.
# NOTE(review): this is the same path the notebook loads its data from, so
# running this cell replaces the source file with the current frame. This
# cell's execution count (17) is also lower than the load cell's (19), i.e. it
# was run out of order — confirm that overwriting the input is intended, or
# write to a separate file.
starfield_reviews_df.to_csv('starfield_reviews.csv', index=False)

In [25]:
# Preview the first rows again to check the renamed columns.
starfield_reviews_df.head()


Unnamed: 0,review_id,review,label
0,147134704,Uninstalled until they fix and improve this ga...,0
1,147134681,badass,1
2,147134591,TL:DR Not worth the whole price. It's a basic ...,0
3,147134557,Only a few hours in- LOVE it. Yeah maybe a few...,1
4,147134157,Great overall game. Some systems are flawed or...,1


In [26]:
# First cut: hold out 30% of the rows, stratified on the label column.
train_data, remaining = train_test_split(
    starfield_reviews_df,
    test_size=0.3,
    random_state=42,
    stratify=starfield_reviews_df['label'],
)

# Second cut: halve the held-out rows into validation and test sets,
# again stratified on the label column.
val_data, test_data = train_test_split(
    remaining,
    test_size=0.5,
    random_state=42,
    stratify=remaining['label'],
)

In [27]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,review_id,review,label
39931,145746371,I am never one to say that something gets bett...,1
9419,146387827,10/10 Bethesda experience\n\nI have lost many ...,1
24003,145926861,Great game. Enough said.,1
5659,146641520,I’m not a big single player gamer but after gi...,1
12233,146255526,It's good ya'll,1


In [28]:
# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [30]:
def _encode_reviews(reviews):
    """Tokenize a collection of review strings with the notebook's ``tokenizer``.

    Every split is encoded with the same settings: special tokens added,
    attention masks returned, padding and truncation enabled, a maximum
    length of 512, and PyTorch tensors as the return type.

    Args:
        reviews: the review texts to encode (the callers below pass the
            ``.values`` array of a split's 'review' column).

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns; later cells read its
        'input_ids' and 'attention_mask' entries.
    """
    return tokenizer.batch_encode_plus(
        reviews,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )


# Encode each split separately, in the order train -> test -> validation.
X_train_tokenized = _encode_reviews(train_data['review'].values)
X_test_tokenized = _encode_reviews(test_data['review'].values)
X_val_tokenized = _encode_reviews(val_data['review'].values)



In [34]:
# For each split, pull the token ids and attention mask out of the tokenizer
# output and turn the split's 'label' column into a tensor.

# Training split
input_ids_train, attention_masks_train = (
    X_train_tokenized['input_ids'],
    X_train_tokenized['attention_mask'],
)
labels_train = torch.tensor(train_data['label'].values)

# Test split
input_ids_test, attention_masks_test = (
    X_test_tokenized['input_ids'],
    X_test_tokenized['attention_mask'],
)
labels_test = torch.tensor(test_data['label'].values)

# Validation split
input_ids_val, attention_masks_val = (
    X_val_tokenized['input_ids'],
    X_val_tokenized['attention_mask'],
)
labels_val = torch.tensor(val_data['label'].values)

In [35]:
# Bundle each split as (input ids, attention mask, labels) so a DataLoader
# yields the three tensors together, in that order.
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_masks_val, labels_val)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [36]:
# Load 'bert-base-uncased' with a sequence-classification head configured for
# two labels; attention maps and hidden states are not requested in outputs.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of per-class scores against true labels.

    Args:
        preds: 2-D array of per-class scores; the predicted class for each
            row is the argmax along axis 1.
        labels: array of true class ids; flattened before scoring.

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')

In [38]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model there and report the choice.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model.to(device)
print(device)

cuda


In [39]:
# Mini-batch sizes. Validation runs without gradients (see evaluate), so it
# uses a larger batch than training.
batch_size = 4
val_batch_size = 32

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

# Validation does not need shuffling: the metrics are computed over the whole
# set. With no sampler and the default shuffle=False the loader walks the
# dataset in order, which keeps evaluation deterministic and avoids drawing
# from the random number generator between training epochs.
dataloader_val = DataLoader(
    val_dataset,
    batch_size=val_batch_size
)

In [40]:
from transformers import get_linear_schedule_with_warmup

In [41]:
# Training schedule: 5 epochs of AdamW at lr=1e-5, with a linear schedule that
# has no warmup steps and spans one scheduler step per training batch.
epochs = 5
total_training_steps = len(dataloader_train) * epochs

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [42]:
def evaluate(dataloader):
    """Score the module-level ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode and each forward pass runs under
    ``torch.no_grad()``. Batches are indexed positionally as
    (input ids, attention mask, labels) and moved to the module-level
    ``device`` before the forward pass.

    Args:
        dataloader: iterable of batches; must support ``len()``.

    Returns:
        A tuple ``(avg_loss, logits, labels)`` where ``avg_loss`` is the sum of
        per-batch losses divided by the number of batches, and ``logits`` /
        ``labels`` are NumPy arrays concatenated across batches along axis 0.
    """
    model.eval()

    running_loss = 0
    collected_logits = []
    collected_labels = []

    for batch in tqdm(dataloader):
        on_device = tuple(tensor.to(device) for tensor in batch)
        batch_labels = on_device[2]

        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=batch_labels,
            )

        # With labels supplied, the first output is the loss and the second
        # the logits.
        batch_loss, batch_logits = outputs[0], outputs[1]
        running_loss += batch_loss.item()

        # Move results to host memory as NumPy arrays for later concatenation.
        collected_logits.append(batch_logits.detach().cpu().numpy())
        collected_labels.append(batch_labels.cpu().numpy())

    mean_loss = running_loss / len(dataloader)

    stacked_logits = np.concatenate(collected_logits, axis=0)
    stacked_labels = np.concatenate(collected_labels, axis=0)

    return mean_loss, stacked_logits, stacked_labels

In [43]:

def train_model(model, dataloader_train, dataloader_val, optimizer, scheduler, num_epochs):
    """Fine-tune ``model`` for ``num_epochs`` epochs, validating after each one.

    For every training batch the gradients are zeroed, the loss returned by
    the model is back-propagated, gradients are clipped to a max norm of 1.0,
    and the optimizer and scheduler are each stepped once. After each epoch
    the average training loss, the validation loss and the weighted F1 score
    are written out.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; its first output is used as the loss.
        dataloader_train: iterable of training batches, indexed positionally
            as (input ids, attention mask, labels); must support ``len()``.
        dataloader_val: validation loader passed to ``evaluate``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        num_epochs: number of passes over ``dataloader_train``.

    Returns:
        ``(train_loss, validation_loss)``: two lists holding the per-epoch
        average training loss and validation loss.

    Note:
        Batches are moved to the module-level ``device``, and validation goes
        through ``evaluate``, which uses the module-level ``model`` rather
        than the ``model`` argument passed here.
    """
    train_loss = []
    validation_loss = []

    for epoch in tqdm(range(1, (num_epochs + 1))):
        model.train()
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train,
                            desc='Epoch {:1d}'.format(epoch),
                            leave=False,
                            disable=False,
                            colour='green')

        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }

            outputs = model(**inputs)
            loss = outputs[0]
            # Read the scalar once; it is used for both the running total
            # and the progress bar.
            batch_loss = loss.item()
            loss_train_total += batch_loss
            loss.backward()

            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Report the batch loss as returned by the model. `batch` is the
            # tuple of tensors, so len(batch) is the number of tensors (3),
            # not the number of examples; dividing by it understated the
            # displayed loss.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

        tqdm.write(f'Epoch {epoch}')

        loss_train_avg = loss_train_total / len(dataloader_train)
        train_loss.append(loss_train_avg)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_val)
        validation_loss.append(val_loss)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (weighted): {val_f1}')

    return train_loss, validation_loss

In [44]:
# Run fine-tuning for `epochs` epochs and keep the per-epoch average training
# and validation losses returned by train_model.
train_loss, validation_loss = train_model(model, dataloader_train, dataloader_val, optimizer, scheduler, epochs)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/7442 [00:00<?, ?it/s]

Epoch 1
Training loss: 0.3134714565553782


  0%|          | 0/200 [00:00<?, ?it/s]

Validation loss: 0.23272787335561587
F1 Score (weighted): 0.9392723541191501


Epoch 2:   0%|          | 0/7442 [00:00<?, ?it/s]

Epoch 2
Training loss: 0.18459153992715255


  0%|          | 0/200 [00:00<?, ?it/s]

Validation loss: 0.30867200966495145
F1 Score (weighted): 0.9399751464183129


Epoch 3:   0%|          | 0/7442 [00:00<?, ?it/s]

Epoch 3
Training loss: 0.09309864852324502


  0%|          | 0/200 [00:00<?, ?it/s]

Validation loss: 0.38326368624329915
F1 Score (weighted): 0.9373250635766052


Epoch 4:   0%|          | 0/7442 [00:00<?, ?it/s]

Epoch 4
Training loss: 0.040953713581825875


  0%|          | 0/200 [00:00<?, ?it/s]

Validation loss: 0.4487353418384555
F1 Score (weighted): 0.9392155398255458


Epoch 5:   0%|          | 0/7442 [00:00<?, ?it/s]

Epoch 5
Training loss: 0.020687431471877255


  0%|          | 0/200 [00:00<?, ?it/s]

Validation loss: 0.5009677468402515
F1 Score (weighted): 0.9367940312034984
