In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data

In [3]:
# set random seed
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load data

In [4]:
def load_data(path):
  df = pd.read_csv(path, delimiter='\t', header=None)
  df = df.rename(columns={
    0:'polarity',
    1:'aspect_cat',
    2:'target_term',
    3:'char_offset',
    4:'sentence'
})
  return df

In [5]:
train = load_data('/content/drive/MyDrive/Colab Notebooks/M2_NLP/exercise2/traindata.csv')
val = load_data('/content/drive/MyDrive/Colab Notebooks/M2_NLP/exercise2/devdata.csv')

# Prepare data

## TextProcessor

In [6]:
import string
from typing import Text
import re
import nltk
import spacy 
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
import sklearn 


class TextProcessor:

    def __init__(self,text):
        self.text = text
        self.stopwords = stopwords.words('english')
        #Keeping the special characters, we re-format the punctuation
        self.special_chars = re.compile('[{}]'.format(re.escape(string.punctuation)))
        self.nlp = spacy.load("en_core_web_sm")
        self.processed_text = []

    def _iterator(self):
        for text in self.text:
            yield text

    def Preprocess_Text(self):
        for text in self._iterator():
            #Remove numbers from the string 
            cleaned_text = re.sub(r'\d+', '', text)

            #Token creation & Lemmatization
            doc = self.nlp(cleaned_text)
            tokens = [token.lemma_ for token in doc]
            tokens = [token.strip().lower() for token in tokens]

            #Stopword & Punctuation Removal 
            cleaned_tokens = [token for token in tokens if token not in self.stopwords]
            token_filters = filter(None,[self.special_chars.sub(' ', token) for token in cleaned_tokens])
            new_text = ' '.join(token_filters)

            #Whitespace removal between pre-existing punctuations & stopwords 
            new_text = " ".join(new_text.split())
            self.processed_text.append(new_text)
        return self.processed_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Preprocess data

In [7]:
def preprocess(df):
  tp = TextProcessor(df['sentence'])
  df['sentence'] = tp.Preprocess_Text()
  df['inputs'] = df['aspect_cat'] + " " + \
                  df['target_term'] + " " + \
                  df['sentence']

  return df[['polarity', 'inputs']] 


In [8]:
train_cleaned = preprocess(train)
val_cleaned = preprocess(val)

In [9]:
train_cleaned

Unnamed: 0,polarity,inputs
0,positive,AMBIENCE#GENERAL seating short sweet – seating...
1,positive,AMBIENCE#GENERAL trattoria quaint romantic tra...
2,positive,FOOD#QUALITY food different beer offer thier g...
3,negative,SERVICE#GENERAL STAFF staff fired
4,positive,FOOD#STYLE_OPTIONS menu menu look great waiter...
...,...,...
1498,positive,DRINKS#QUALITY expresso one pron actually like...
1499,negative,SERVICE#GENERAL waitress hostess waitress incr...
1500,positive,RESTAURANT#PRICES place little place cute inte...
1501,positive,RESTAURANT#GENERAL restaurant nice family trad...


## GloVe embeddings

In [11]:
TEXT = data.Field(tokenize = 'spacy', 
                  tokenizer_language = 'en_core_web_sm',
                  batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

In [12]:
class DataFrameDataset(data.Dataset):
  '''
  credit: https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
  '''

  def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
      fields = [('text', text_field), ('label', label_field)]
      examples = []
      for i, row in df.iterrows():
          label = row.polarity if not is_test else None
          text = row.inputs
          examples.append(data.Example.fromlist([text, label], fields))

      super().__init__(examples, fields, **kwargs)

  @staticmethod
  def sort_key(ex):
      return len(ex.text)

  @classmethod
  def splits(cls, text_field, label_field, train_df, val_df=None, test_df=None, **kwargs):
      train_data, val_data, test_data = (None, None, None)

      if train_df is not None:
          train_data = cls(train_df.copy(), text_field, label_field, **kwargs)
      if val_df is not None:
          val_data = cls(val_df.copy(), text_field, label_field, **kwargs)
      if test_df is not None:
          test_data = cls(test_df.copy(), text_field, label_field, True, **kwargs)

      return tuple(d for d in (train_data, val_data, test_data) if d is not None)

In [13]:
train_ds, val_ds = DataFrameDataset.splits(
  text_field=TEXT, label_field=LABEL, 
  train_df=train_cleaned, val_df=val_cleaned)

In [14]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_ds)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399999/400000 [00:18<00:00, 21901.00it/s]


In [15]:
len(TEXT.vocab)

2385

In [50]:
print(LABEL.vocab.stoi)

defaultdict(None, {'positive': 0, 'negative': 1, 'neutral': 2})


## BertTokenizer 

In [10]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 3.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 40.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

In [13]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [25]:
TEXT_FIELD = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL_FIELD = data.LabelField(dtype = torch.float)

In [16]:
train_ds, test_ds = DataFrameDataset.splits(
  text_field=TEXT_FIELD, label_field=LABEL_FIELD, 
  train_df=train_cleaned, test_df=test_cleaned)

In [17]:
print(vars(train_ds[0]))
print(vars(test_ds[0]))

{'text': [2572, 11283, 5897, 1001, 2236, 10747, 2460, 4086, 1516, 10747, 2307, 4013, 2078, 6298, 26931, 2797], 'label': 'positive'}
{'text': [3295, 1001, 2236, 5101, 2307, 2833, 2307, 4511, 2862, 2307, 2326, 2307, 5101], 'label': None}


In [18]:
print(f"Number of training examples: {len(train_ds)}")
print(f"Number of testing examples: {len(test_ds)}")

Number of training examples: 1503
Number of testing examples: 376


In [19]:
# build the vocab for the lablels 

LABEL_FIELD.build_vocab(train_ds)

In [20]:
print(LABEL_FIELD.vocab.stoi)

defaultdict(None, {'positive': 0, 'negative': 1, 'neutral': 2})


## Build iterators

In [94]:
BATCH_SIZE = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, val_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE, 
    device = device)

# Build Model

In [None]:
#@title
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [17]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [132]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 200
FILTER_SIZES = [1,2,3,4]
OUTPUT_DIM = 3 # 3 labels
DROPOUT = 0.25
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [133]:
# load pre-trained embeddings

pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [ 0.1134,  0.0624,  0.6683,  ...,  0.4773,  0.0420,  0.0243],
        ...,
        [ 0.1238,  0.0467,  0.1646,  ..., -0.1151,  0.2209, -0.4480],
        [ 0.1164,  0.1429, -0.1048,  ...,  0.4136, -0.1188,  0.0844],
        [ 0.4149, -0.3073, -0.3590,  ..., -0.8834,  0.5939,  0.1043]])

In [135]:
# zero the initial weights of the unknown and padding tokens

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Train Model

In [136]:
def accuracy(output, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # #round predictions to the closest integer
    _, pred = torch.max(output.data, 1)
    correct = (pred == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [137]:
def train_model(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label.long())
        
        acc = accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [138]:
def evaluate_model(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label.long())
            
            acc = accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [139]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [140]:
LR = 1e-4
optimizer = optim.Adam(model.parameters(), lr = LR)

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [141]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_model(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate_model(model, val_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.704 | Train Acc: 70.65%
	 Val. Loss: 0.674 |  Val. Acc: 70.00%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 0.609 | Train Acc: 74.42%
	 Val. Loss: 0.640 |  Val. Acc: 74.74%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.541 | Train Acc: 77.48%
	 Val. Loss: 0.608 |  Val. Acc: 74.74%
Epoch: 04 | Epoch Time: 0m 9s
	Train Loss: 0.480 | Train Acc: 82.19%
	 Val. Loss: 0.580 |  Val. Acc: 77.63%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 0.429 | Train Acc: 84.34%
	 Val. Loss: 0.558 |  Val. Acc: 77.11%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 0.380 | Train Acc: 86.84%
	 Val. Loss: 0.549 |  Val. Acc: 78.95%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 0.330 | Train Acc: 89.17%
	 Val. Loss: 0.546 |  Val. Acc: 78.95%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 0.295 | Train Acc: 90.37%
	 Val. Loss: 0.542 |  Val. Acc: 80.26%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.268 | Train Acc: 91.96%
	 Val. Loss: 0.541 |  Val. Acc: 80.00%
Epoch: 10 | Epoch Time: 0m 7

In [108]:
def get_pred(output, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # #round predictions to the closest integer
    _, pred = torch.max(output.data, 1)
    correct = (pred == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [102]:
model.load_state_dict(torch.load('model.pt'))


<All keys matched successfully>

In [109]:
test_loss, test_acc, preds = eval_get_pred(model, val_iterator, criterion)

In [110]:
preds

tensor([[0.0271]])