<a href="https://colab.research.google.com/github/faldund7/sentiment-model-pytorch/blob/master/Sentiment_Model_using_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face `transformers` release this notebook was written against
# (the install log recorded below resolves `==3` to 3.0.0).
!pip install transformers==3

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 4.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 44.6 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 33.8 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.45 sentencepiece-0.1.96 tokenizers-0.8.0rc4 transformers-3.0.0


In [None]:
import pandas as pd
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import class_weight
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
def _read_review_csv(csv_path):
    """Read a headerless CSV and name its columns 'rating' and 'review'."""
    frame = pd.read_csv(csv_path, header = None)
    frame.columns = ['rating', 'review']
    return frame


# Raw inputs: the test file is kept as-is, the training file is split below.
df_train = _read_review_csv('train.csv')
df_test_set = _read_review_csv('test.csv')

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
df_train_set, df_valid_set = train_test_split(df_train, random_state = 42, test_size = 0.2)

# sns.countplot(df.rating)

# Human-readable names for the five sentiment classes.
class_name = ['most-negative', 'negative', 'neutral', 'positive', 'most-positive']

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Checkpoint name shared by the tokenizer here and the encoder built later.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Optional exploration (disabled): token-length distribution of the training reviews.
#from tqdm import tqdm
#token_lens = []
#for txt in tqdm(df_train.review):
#    tokens = tokenizer.encode(txt, max_length = 512)
#    token_lens.append(len(tokens))
#print(max(token_lens))
# sns.distplot(token_lens)

# Fixed token length every review is padded/truncated to before batching.
MAX_LEN = 120

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [None]:
# Show the training frame; the rendered output below abbreviates the middle rows.
df_train

Unnamed: 0,rating,review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...
...,...,...
649995,5,I had a sprinkler that was gushing... pipe bro...
649996,1,Phone calls always go to voicemail and message...
649997,1,Looks like all of the good reviews have gone t...
649998,5,I was able to once again rely on Yelp to provi...


In [None]:
# a Dataset object loads training or test data into memory, and a DataLoader object
# fetches data from a Dataset and serves the data up in batches.
class YelpReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Parameters
    ----------
    reviews : indexable sequence of review texts (each item is passed through ``str``).
    targets : indexable sequence of integer labels, positionally aligned with ``reviews``.
    tokenizer : object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len : int
        Length every encoded review is padded or truncated to.

    Raises
    ------
    ValueError
        If ``reviews`` and ``targets`` differ in length.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        # Fail at construction time instead of with an IndexError deep inside a worker.
        if len(reviews) != len(targets):
            raise ValueError(
                f'reviews and targets must have the same length, got {len(reviews)} and {len(targets)}'
            )
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns a dict with the raw text (``review_text``), the flattened
        ``input_ids`` and ``attention_mask`` tensors produced by the tokenizer,
        and the label as a ``torch.long`` tensor (``targets``).
        """
        # str() also covers non-string cells (e.g. numbers or missing values).
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
                    review,
                    # Add the model's special tokens around the sequence.
                    add_special_tokens = True,
                    max_length = self.max_len,
                    # Ask for truncation explicitly; relying on `max_length` alone is a
                    # deprecated fallback that logs a warning on every single lookup.
                    truncation = True,
                    # Single-sequence input, so segment ids are not needed.
                    return_token_type_ids = False,
                    pad_to_max_length = True,
                    return_attention_mask = True,
                    # 'pt': return PyTorch tensors instead of lists of Python integers.
                    return_tensors = 'pt'
                    )
        return {
                'review_text' : review,
                # flatten() drops the leading batch axis the tokenizer adds for a single text.
                'input_ids' : encoding['input_ids'].flatten(),
                'attention_mask' : encoding['attention_mask'].flatten(),
                'targets' : torch.tensor(target, dtype=torch.long)
            }
        
# print(df_train_set.shape)
# print(df_valid_set.shape)
# print(df_test_set.shape)

# Checking the uniform distribution of a dataset
# class_weights = class_weight.compute_class_weight('balanced',
#                                                    np.unique(df_train.rating.values),
#                                                    df_train.rating.values)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle = False, num_workers = 4):
    """Wrap a review DataFrame in a batched ``DataLoader``.

    Parameters
    ----------
    df : DataFrame with ``review`` (text) and ``rating`` (label) columns.
    tokenizer : tokenizer handed to ``YelpReviewDataset``.
    max_len : int, token length each review is padded/truncated to.
    batch_size : int, number of reviews per batch.
    shuffle : bool, default False
        Reshuffle the rows at the start of every epoch (wanted for training only).
    num_workers : int, default 4
        Number of worker processes used to load batches.

    Returns
    -------
    torch.utils.data.DataLoader over a ``YelpReviewDataset``.
    """
    reviewDataset = YelpReviewDataset(
                    # NumPy arrays give purely positional indexing, independent of the
                    # (shuffled) DataFrame index left behind by train_test_split.
                    reviews = df.review.to_numpy(),
                    targets = df.rating.to_numpy(),
                    tokenizer = tokenizer,
                    max_len = max_len
                    )
    return DataLoader(reviewDataset,
                        batch_size = batch_size,
                        shuffle = shuffle,
                        num_workers = num_workers)

BATCH_SIZE = 32
# Only the training loader reshuffles each epoch; evaluation keeps a fixed order.
train_data_loader = create_data_loader(df_train_set, tokenizer, MAX_LEN, BATCH_SIZE, shuffle = True)
valid_data_loader = create_data_loader(df_valid_set, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test_set, tokenizer, MAX_LEN, BATCH_SIZE)
# The test loader was originally bound to this name only; keep it as an alias so
# cells written against either name keep working.
test_valid_loader = test_data_loader

# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

  cpuset_checked))


In [None]:
class SentimentClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a small classification head.

    Head: dropout -> linear (hidden_size -> 128) -> ReLU -> dropout -> linear (128 -> n_classes).

    Parameters
    ----------
    n_classes : int
        Size of the final linear layer's output.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p = 0.4)
        self.out1 = nn.Linear(self.bert.config.hidden_size, 128)
        self.drop1 = nn.Dropout(p = 0.4)
        self.relu = nn.ReLU()
        self.out = nn.Linear(128, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) outputs of the final linear layer.

        ``input_ids`` and ``attention_mask`` are forwarded to the encoder unchanged.
        """
        encoder_outputs = self.bert(
                                    input_ids = input_ids,
                                    attention_mask = attention_mask
                                    )
        # Take the pooled output by position instead of tuple-unpacking: unpacking
        # only works when the encoder returns exactly two items, and silently picks
        # up key names when the return value is a dict-style output object.
        pooled_output = encoder_outputs[1]
        hidden = self.drop(pooled_output)
        hidden = self.relu(self.out1(hidden))
        hidden = self.drop1(hidden)
        return self.out(hidden)
        
# One more output than there are class names -- presumably so the 1-based ratings
# can be used as class indices without shifting (TODO confirm).
model = SentimentClassifier(len(class_name)+1).to(device)

EPOCHS = 5

# Cross-entropy over the raw outputs of the classifier head.
loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(model.parameters(), lr = 2e-5, correct_bias = False)

# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader : iterable of batches (dicts with ``input_ids``, ``attention_mask``, ``targets``).
    loss_fn : callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
    optimizer, scheduler : both are stepped once per batch.
    device : device the batch tensors are moved to.
    n_examples : int, denominator used for the accuracy.

    Returns
    -------
    (accuracy, mean_loss)
        ``accuracy`` is a 0-dim float64 tensor (correct predictions / ``n_examples``);
        ``mean_loss`` is the mean of the per-batch losses (NaN when there were no batches).
    """
    # Set the module in training mode
    model = model.train()
    losses = []
    # Tensor accumulator so the final `.double()` also works when the loader is empty.
    correct_predictions = torch.zeros((), dtype = torch.long, device = device)

    # Drop gradients a previously interrupted run may have left on the parameters.
    optimizer.zero_grad()

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        targets = data['targets'].to(device)

        # Call the module instance rather than .forward() so registered hooks run.
        outputs = model(
                        input_ids = input_ids,
                        attention_mask = attention_mask
                        )

        # Index of the largest output per row is the predicted class.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return correct_predictions.double() / n_examples, mean_loss
    

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any parameters.

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader : iterable of batches (dicts with ``input_ids``, ``attention_mask``, ``targets``).
    loss_fn : callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
    device : device the batch tensors are moved to.
    n_examples : int, denominator used for the accuracy.

    Returns
    -------
    (accuracy, mean_loss)
        ``accuracy`` is a 0-dim float64 tensor (correct predictions / ``n_examples``);
        ``mean_loss`` is the mean of the per-batch losses (NaN when there were no batches).
    """
    # Evaluation mode disables dropout.
    model = model.eval()
    losses = []
    # Tensor accumulator so the final `.double()` also works when the loader is empty.
    correct_predictions = torch.zeros((), dtype = torch.long, device = device)
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask
            )
            # dim=1 is required: without it torch.max returns a single 0-d tensor
            # (the global maximum) and the two-value unpacking raises.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
# Per-epoch metrics, stored as plain Python floats so they can be plotted or
# serialised without first copying tensors off the GPU.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train_set)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_model(
        model,
        valid_data_loader,
        loss_fn,
        device,
        len(df_valid_set)
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    # float() accepts both 0-dim tensors (on any device) and NumPy/Python scalars.
    epoch_val_acc = float(val_acc)
    history['train_acc'].append(float(train_acc))
    history['train_loss'].append(float(train_loss))
    history['val_acc'].append(epoch_val_acc)
    history['val_loss'].append(float(val_loss))
    # Checkpoint only when validation accuracy improves on the best seen so far.
    if epoch_val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = epoch_val_acc

#plt.plot(history['train_acc'], label='train accuracy')
#plt.plot(history['val_acc'], label='validation accuracy')
#plt.title('Training history')
#plt.ylabel('Accuracy')
#plt.xlabel('Epoch')
#plt.legend()
#plt.ylim([0, 1]);

In [None]:
def get_predictions(model, data_loader, device = None):
    """Run ``model`` over ``data_loader`` and collect its predictions on the CPU.

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader : iterable of batches (dicts with ``review_text``, ``input_ids``,
        ``attention_mask`` and ``targets``).
    device : optional device for the batch tensors. When omitted, the device of the
        model's first parameter is used (CPU if the model has no parameters).

    Returns
    -------
    (review_texts, predictions, prediction_probs, real_values)
        ``review_texts`` is a list of the raw texts; ``predictions`` holds the arg-max
        class per row; ``prediction_probs`` holds the softmax of the model outputs
        (each row sums to 1); ``real_values`` holds the targets. All tensors are on the CPU.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []
    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            review_texts.extend(texts)
            # Move each batch to the CPU right away so results do not pile up on the
            # device, and keep whole batches instead of one tiny tensor per row.
            pred_batches.append(preds.cpu())
            # The model emits raw scores; normalise them so the values really are probabilities.
            prob_batches.append(torch.softmax(outputs, dim=1).cpu())
            target_batches.append(d["targets"].cpu())

    if not pred_batches:
        # Empty loader: return empty tensors rather than failing on an empty concatenation.
        empty_labels = torch.empty(0, dtype=torch.long)
        return review_texts, empty_labels, torch.empty(0, 0), empty_labels.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return review_texts, predictions, prediction_probs, real_values

# Evaluate on the held-out test set. The loader cell binds the test-set loader to
# `test_valid_loader`, so that is the name used here.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_valid_loader
)

# Ratings appear to be 1-based (1..5) while `class_name` is 0-based. Listing the
# labels explicitly keeps names and labels aligned even if the unused class 0 is
# ever predicted. NOTE(review): confirm the rating range of the CSV files.
rating_labels = list(range(1, len(class_name) + 1))
print(classification_report(y_test, y_pred, labels=rating_labels, target_names=class_name))

review_text = "the food was delicious but it was spicy"

encoded_review = tokenizer.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  # Explicit truncation; relying on max_length alone is a deprecated fallback.
  truncation=True,
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: disable dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
  output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

# Map the predicted 1-based rating onto the 0-based list of class names.
predicted_rating = int(prediction.item())
if 1 <= predicted_rating <= len(class_name):
  sentiment = class_name[predicted_rating - 1]
else:
  sentiment = f'unknown (class index {predicted_rating})'

print(f'Review text: {review_text}')
print(f'Sentiment  : {sentiment}')