**Project NL3.14**

In [1]:
# Mount Google Drive at /content/gdrive so the project resources stored there can be read and written.
# NOTE(review): Colab-only; every hard-coded '/content/gdrive/My Drive/NL3.14/...' path below depends on this mount.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
# Imports needed throughout the project
import os
import re
import torch
import glob
import string
import math
import random
import pickle
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import zipfile
import seaborn as sbr
import matplotlib.pyplot as plt 
import pandas as pd
import functools
from datetime import datetime
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import ast

# Seed every random number generator the notebook draws from, not only torch:
#  - random: used by the dataset classes (masking choice, random fallback index for unknown words)
#  - numpy:  used by scikit-learn's train_test_split when no random_state is passed
#  - torch:  weight initialisation, dropout and DataLoader shuffling
SEED = 147
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.__version__

'1.7.0+cu101'

In [3]:
# Colab setup: upgrade pip, install Hugging Face transformers, then import the classes used below.
# NOTE(review): the install is unpinned, so a fresh run may pull a different transformers release
# than the one this notebook was developed with — consider pinning the version.
!pip install --upgrade pip
!pip install transformers
from transformers import BertTokenizer, BertModel, AutoTokenizer, AdamW, BertForMaskedLM

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/54/eb/4a3642e971f404d69d4f6fa3885559d67562801b99d7592487f1ecc4e017/pip-20.3.3-py2.py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 25.6MB/s eta 0:00:01[K     |▍                               | 20kB 31.1MB/s eta 0:00:01[K     |▋                               | 30kB 28.5MB/s eta 0:00:01[K     |▉                               | 40kB 32.5MB/s eta 0:00:01[K     |█                               | 51kB 29.3MB/s eta 0:00:01[K     |█▎                              | 61kB 32.1MB/s eta 0:00:01[K     |█▌                              | 71kB 19.8MB/s eta 0:00:01[K     |█▊                              | 81kB 21.3MB/s eta 0:00:01[K     |██                              | 92kB 20.1MB/s eta 0:00:01[K     |██▏                             | 102kB 20.8MB/s eta 0:00:01[K     |██▍                             | 112kB 20.8MB/s eta 0:00:01[K     |██▋                             | 122kB 20.8MB/

In [4]:
# Select the compute device: CUDA when available, otherwise CPU.
torch.cuda.empty_cache()  # free cached GPU memory left over from earlier work in this session
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

**LOAD ALREADY CONFIGURED DATA**

In [5]:
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile

# Load the trained word2vec model from Drive.
# The path is passed to Word2Vec.load directly. get_tmpfile() builds a path inside the system
# temp directory; it only returned this path unchanged because the argument is absolute.
fname = "/content/gdrive/My Drive/NL3.14/resources/word2vec.model"
w2v = Word2Vec.load(fname)

In [6]:
# Read the preprocessed CSV files from Drive.
# total_df: sentence corpus; its 'sentences' column feeds the BERT datasets further down.
# final_df: context / next-word pairs; its 'x' and 'y' columns feed the next-word models further down.
total_df = pd.read_csv("/content/gdrive/My Drive/NL3.14/resources/geosentences.csv")
final_df = pd.read_csv("/content/gdrive/My Drive/NL3.14/resources/train.csv")

In [7]:
# Split the data into train / validation / test subsets.
# random_state pins the shuffles: the models below are reloaded from saved checkpoints, so an
# unseeded split would put different rows in validation/test on every run and could evaluate
# a checkpoint on rows it was trained on.
bert_train, bert_validation = train_test_split(total_df, test_size=0.05, random_state=SEED)
train, validation_test = train_test_split(final_df, test_size=0.05, random_state=SEED)
# The held-out 5% is split once more: 95% of it becomes validation, the remaining 5% becomes test.
validation, test = train_test_split(validation_test, test_size=0.05, random_state=SEED)

In [8]:
train.head()

Unnamed: 0,x,y
103473,"['კი', 'არ', 'გამარტყამს']",ახლა
1042978,"['არც', 'კი', 'ამიხსნა', ',', 'რას']",გულისხმობდა
141033,"[',', 'თითქოს', 'სადაცაა']",სულს
695754,"['ისინი', 'მიმართავენ', 'ჩვეულებრივ', 'მკურნალ...",ეს
885723,"['თუ', 'რამდენად', 'მოსწონდა', 'ან', 'არ', 'მო...",სტუმრის


In [9]:
# Shared dimensions
embed_dim = 300  # embedding width handed to PredictionModel
vocab_size = w2v.wv.vectors.shape[0]  # number of rows in the word2vec vector table
max_seq_len = 10  # fixed input length: tokenizer max_length and context padding/truncation length
batch_size = 32  # batch size for the BERT loaders; reassigned in later cells for the other models

In [10]:
# Helper Class for BertModel
class DatasetTrain(torch.utils.data.Dataset):
    """Dataset of raw sentences turned into masked-language-model training examples.

    Each item is a 4-tuple ``(input_ids, token_type_ids, attention_mask, labels)``; every
    tensor keeps the tokenizer's leading batch axis of size 1 (callers ``squeeze(1)`` after
    batching) and is moved to the global ``device``.

    Relies on the module-level ``tokenizer``, ``max_seq_len`` and ``device``; ``tokenizer``
    must exist before the first item is requested.
    """

    def __init__(self, txts):
        """txts: indexable collection of sentence strings."""
        self.txts = txts

    def __len__(self):
        return len(self.txts)

    def __getitem__(self, index):
        """Tokenize sentence ``index`` and mask a random subset of its tokens.

        Masking is done on token ids AFTER tokenization. Replacing whole words with the
        literal '[MASK]' before tokenization shifts every following token whenever a word
        splits into several word pieces, so inputs and labels stop lining up position by
        position.

        Labels hold the original token id at masked positions and -100 everywhere else, so
        the loss covers only the masked tokens and skips padding, [CLS] and [SEP].
        """
        sentence = self.txts[index]
        encoded = tokenizer(sentence, padding='max_length', return_tensors='pt',
                            max_length=max_seq_len, truncation=True)
        input_ids = encoded['input_ids'].clone()
        attention_mask = encoded['attention_mask']

        # Only real tokens may be masked: skip padding (attention_mask == 0), [CLS] and [SEP].
        maskable = (
            attention_mask.bool()
            & (input_ids != tokenizer.cls_token_id)
            & (input_ids != tokenizer.sep_token_id)
        )
        candidates = maskable[0].nonzero(as_tuple=True)[0].tolist()

        labels = torch.full_like(input_ids, -100)
        if candidates:
            # Same ratio as before (one mask per started group of ten), drawn without replacement.
            n_masked = len(candidates) // 10 + 1
            for pos in random.sample(candidates, n_masked):
                labels[0, pos] = input_ids[0, pos]
                input_ids[0, pos] = tokenizer.mask_token_id
        # An empty sentence has no candidates: the item is returned unmasked with all labels at -100.

        return (input_ids.to(device),
                encoded['token_type_ids'].to(device),
                attention_mask.to(device),
                labels.to(device))

In [11]:
# DataLoader parameters: the training loader shuffles, the validation loader keeps dataset order.
train_params = {'batch_size': batch_size,
          'shuffle': True
         }
val_params = {'batch_size': batch_size
         }

# Datasets and loaders for BERT fine-tuning (train and validation only).
# .tolist() drops the shuffled DataFrame index so items can be addressed by position 0..len-1.
bert_training_set = DatasetTrain(bert_train['sentences'].tolist())
bert_training_generator = torch.utils.data.DataLoader(bert_training_set, **train_params)

bert_validation_set = DatasetTrain(bert_validation['sentences'].tolist())
bert_validation_generator = torch.utils.data.DataLoader(bert_validation_set, **val_params)

In [None]:
# load multilingual bert pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased').to(device)
# Resume from the fine-tuned checkpoint saved by the training loop.
# map_location remaps the stored tensors onto the current device, so a checkpoint written on
# a GPU runtime still loads when the notebook falls back to CPU.
model.load_state_dict(torch.load('/content/gdrive/My Drive/NL3.14/resources/bert_model', map_location=device))
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

#Calculate loss on validation data
def bert_valid_loss(model, dl, generator=None):
    """Return the mean per-batch masked-LM loss of ``model`` over a validation loader.

    Args:
        model: callable taking ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with a ``.loss`` tensor.
        dl: kept for backward compatibility; no longer used. The batch count is taken from
            the batches actually iterated, not from ``len(dl) / batch_size``, which went
            wrong whenever the global ``batch_size`` was reassigned.
        generator: iterable of ``(input_ids, token_type_ids, attention_mask, labels)``
            batches. Defaults to the module-level ``bert_validation_generator``.

    Returns:
        float: sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if the loader yields no batches.
    """
    if generator is None:
        generator = bert_validation_generator

    total_loss = 0.0
    n_batches = 0
    # Evaluation mode for the pass; train mode is restored even if a batch raises.
    model.eval()
    try:
        with torch.no_grad():  # no gradients needed, so skip storing intermediates
            for x_input_ids, x_token_type_ids, x_attention_mask, y_input_ids in generator:
                # Each tensor is (batch, 1, seq) because the dataset keeps the tokenizer's batch axis.
                output = model(input_ids=x_input_ids.squeeze(1),
                               token_type_ids=x_token_type_ids.squeeze(1),
                               attention_mask=x_attention_mask.squeeze(1),
                               labels=y_input_ids.squeeze(1),
                               )
                total_loss += output.loss.item()
                n_batches += 1
    finally:
        model.train()

    if n_batches == 0:
        raise ValueError("validation loader produced no batches")
    return total_loss / n_batches

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from datetime import datetime
# Fine-tune BERT on the masked-language-model task.
# NOTE: `model` and `optimizer` must be the BERT objects created in the loading cell; a later
# cell reassigns `optimizer` for the LSTM model, so run the cells in order.
# Switch model to train mode
model.train()

validation_loss = None  # best (lowest) validation loss seen so far
for epoch in range(1, 2):
    epoch_loss = 0
    # Take the batch count from the BERT loader itself. It was previously derived from `train`,
    # the next-word dataset, which made both the printed count and the epoch average wrong.
    batches = len(bert_training_generator)
    print(batches, "batches")
    t = datetime.now()
    for i, (x_input_ids, x_token_type_ids, x_attention_mask, y_input_ids) in enumerate(bert_training_generator):
        optimizer.zero_grad()
        #forward
        # Model calculates loss and also outputs classification scores, which need to go through softmax later
        output = model(input_ids=x_input_ids.squeeze(1),
                     token_type_ids=x_token_type_ids.squeeze(1),
                     attention_mask=x_attention_mask.squeeze(1),
                     labels=y_input_ids,
                    )

        #Back propagation
        output.loss.backward()

        #Gradient step
        optimizer.step()

        epoch_loss += output.loss.item()

        # Every 1000 batches: evaluate on the validation set and checkpoint on improvement.
        if (i + 1) % 1000 == 0:
            print(datetime.now() - t)
            dev_loss = bert_valid_loss(model, bert_validation)
            if validation_loss is None or validation_loss > dev_loss:
                validation_loss = dev_loss
                # Save best model
                torch.save(model.state_dict(), '/content/gdrive/My Drive/NL3.14/resources/bert_model')
            print(f'Epoch {epoch} batch {i} | Avg Train Loss: {epoch_loss/(i + 1):.6f} | Current Dev Loss {dev_loss:.6f} | Minimal Dev Loss {validation_loss:.6f}')
    print(f'Epoch {epoch} | Avg Train Loss: {epoch_loss/batches:.6f} ')

In [12]:
# Batch size and loader parameters for the LSTM next-word model.
# NOTE(review): this reassigns the global batch_size (32 for the BERT cells); any code that
# reads the global afterwards sees whichever value was set most recently.
batch_size = 256
train_params = {'batch_size': batch_size,
          'shuffle': True
}
# Helper Class For LstmModel
class PredictionDatasetTrain(torch.utils.data.Dataset):
    """Dataset of (context words, next word) pairs encoded as word2vec vocabulary indices.

    Relies on the module-level ``w2v``, ``max_seq_len`` and ``device``.
    """

    def __init__(self, x, y):
        """x: list of context word lists; y: list of target words, aligned with ``x``."""
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.y)

    @staticmethod
    def _word_to_index(word):
        """Map a word to its word2vec vocabulary index.

        Fallback order: exact vocabulary hit, then the nearest neighbour reported by
        ``most_similar``, then a uniformly random index. Only lookup failures trigger the
        fallback; a bare ``except`` would also hide unrelated errors and interrupts.
        """
        vocab = w2v.wv.vocab
        if word in vocab:
            return vocab[word].index
        try:
            nearest = w2v.wv.most_similar(word)[0][0]
            return vocab[nearest].index
        except (KeyError, IndexError):
            return random.randint(0, len(w2v.wv.vectors) - 1)

    def __getitem__(self, index):
        """Return ``(context_indices, target_index)`` as LongTensors on ``device``.

        The context is right-padded with 0 and cut to the first ``max_seq_len`` entries.
        The target is looked up without a fallback, so a target missing from the vocabulary
        raises ``KeyError``.
        """
        x = [self._word_to_index(word) for word in self.x[index]]
        # NOTE(review): 0 is used as the padding id but is also a valid vocabulary index;
        # confirm that row 0 of the word2vec table is meant to act as padding.
        x = (x + [0] * max_seq_len)[:max_seq_len]
        y = torch.LongTensor([w2v.wv.vocab[self.y[index]].index])
        return torch.LongTensor(x).to(device), y.to(device)

# Datasets and loaders for train, validation and test.
# The 'x' column stores each context as the string form of a Python list (see train.head()),
# so it is parsed back into a list of words with ast.literal_eval.
trainX = [ast.literal_eval(i) for i in train['x'].tolist()]
validX = [ast.literal_eval(i) for i in validation['x'].tolist()]
testX = [ast.literal_eval(i) for i in test['x'].tolist()]

next_training_set = PredictionDatasetTrain(trainX, train['y'].tolist())
next_training_generator = torch.utils.data.DataLoader(next_training_set, **train_params)

# NOTE(review): validation and test loaders reuse train_params, so they are shuffled as well.
next_validation_set = PredictionDatasetTrain(validX, validation['y'].tolist())
next_validation_generator = torch.utils.data.DataLoader(next_validation_set, **train_params)

test_set = PredictionDatasetTrain(testX, test['y'].tolist())
test_set_generator = torch.utils.data.DataLoader(test_set, **train_params)

In [13]:
class PredictionModel(nn.Module):
    """Next-word predictor: pretrained word2vec embeddings -> 2-layer bidirectional LSTM -> linear classifier.

    Relies on the module-level ``w2v`` for the embedding table.
    """

    def __init__(self, emb_dim, hid_dim, vocab_size):
        """
        Args:
            emb_dim: input feature size of the LSTM; must equal the width of the word2vec vectors.
            hid_dim: hidden size of each LSTM direction.
            vocab_size: number of output classes of the classifier.
        """
        super().__init__()

        self.hidden_dim = hid_dim
        # Use the constructor argument; this previously read the global `embed_dim`,
        # silently ignoring whatever the caller passed.
        self.emb_dim = emb_dim
        # from_pretrained freezes the table by default, so these weights are not updated in training.
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(w2v.wv.vectors), padding_idx=0)

        self.lstm = nn.LSTM(self.emb_dim, self.hidden_dim, dropout=0.2, num_layers=2, bidirectional=True, batch_first=True)

        # 2 layers x 2 directions -> four final hidden states are concatenated in forward().
        self.classifier = nn.Linear(4 * hid_dim, vocab_size)

    def forward(self, src):
        """Map a batch of index sequences ``src`` to one row of vocabulary logits per sequence."""
        embedded = self.embedding(src)
        _, (hidden1, _) = self.lstm(embedded)
        # hidden1 stacks the final hidden state of every (layer, direction) pair along dim 0.
        hidden1 = torch.cat((hidden1[0], hidden1[1], hidden1[2], hidden1[3]), dim=1)
        return self.classifier(hidden1)

    def persistEmbedWeights(self):
        """Pickle the embedding matrix (as a numpy array) to Drive."""
        # Original note, translated: "we wanted to, but could not" — presumably refers to
        # fine-tuning the embeddings before saving them; TODO confirm.
        # The context manager closes the file handle, which the bare open() never did.
        with open("/content/gdrive/My Drive/NL3.14/resources/finedEmbeds.data", "wb") as fh:
            pickle.dump(self.embedding.weight.cpu().detach().numpy(), fh)

# Build the LSTM model and resume from the checkpoint written by its training loop.
pred_model = PredictionModel(embed_dim, 128, vocab_size).to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint written on
# a GPU runtime still loads when the notebook falls back to CPU.
pred_model.load_state_dict(torch.load('/content/gdrive/My Drive/NL3.14/resources/prediction_model', map_location=device))
pred_model

PredictionModel(
  (embedding): Embedding(145271, 300, padding_idx=0)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (classifier): Linear(in_features=512, out_features=145271, bias=True)
)

In [14]:
# Loss, optimizer and LR schedule for the LSTM model.
# NOTE(review): ignore_index=0 drops every example whose target index is 0 from the loss;
# targets are word2vec vocabulary indices, so confirm index 0 is not a real target word.
criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)
# NOTE: this rebinds the global `optimizer` that the BERT cells used.
optimizer = AdamW(pred_model.parameters(), lr=1e-3, eps=1e-7)
# Halve the learning rate after 5 scheduler steps without improvement of the monitored value.
lr_scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5, min_lr=1e-9)

#Calculate loss on validation data
def valid_loss(model, dl, generator, loss_fn=None):
    """Return the mean per-batch loss of ``model`` over ``generator``.

    Args:
        model: module mapping a batch ``x`` to logits of shape (batch, classes).
        dl: kept for backward compatibility; no longer used. The batch count is taken from
            the batches actually iterated, not from ``len(dl) / batch_size``, which went
            wrong whenever the global ``batch_size`` was reassigned.
        generator: iterable of ``(x, y)`` batches, ``y`` shaped (batch, 1).
        loss_fn: loss callable ``(logits, targets) -> scalar tensor``. Defaults to the
            module-level ``criterion``.

    Returns:
        float: sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if ``generator`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = criterion

    total_loss = 0.0
    n_batches = 0
    # Evaluation mode for the pass; train mode is restored even if a batch raises.
    model.eval()
    try:
        with torch.no_grad():  # no gradients needed, so skip storing intermediates
            for x, y in generator:
                preds = model(x)
                # Logits are used as returned. The former preds.squeeze(0) did nothing for
                # batch > 1 and dropped the batch axis for a batch of exactly one example
                # (e.g. a trailing partial batch), breaking the loss call.
                total_loss += loss_fn(preds, y.squeeze(1)).item()
                n_batches += 1
    finally:
        model.train()

    if n_batches == 0:
        raise ValueError("validation loader produced no batches")
    return total_loss / n_batches

In [None]:
# Train the LSTM next-word model; evaluate on the validation set every 200 batches and
# checkpoint whenever the validation loss improves.
# Switch model to train mode
pred_model.train()

validation_loss = None  # lowest validation loss seen so far
for epoch in range(1, 10):
    batch_loss = 0  # running training loss since the last report
    # NOTE(review): epoch_acc and epoch_f_score are initialised but never read in this loop.
    epoch_acc = 0
    epoch_f_score = 0
    batches = math.ceil(len(train) / batch_size)
    print(batches, "batches")
    t = datetime.now()
    i = 0
    for x, y in next_training_generator:
        i+=1
        
        optimizer.zero_grad()
        #forward
        output = pred_model(x)
        
        # y arrives as (batch, 1); squeeze(1) yields the (batch,) class-index vector the loss expects
        loss = criterion(output.squeeze(0), y.squeeze(1))

        #Back propagation
        loss.backward()
        
        #Gradient step
        optimizer.step()

        batch_loss += loss.item()
        
        if i % 200 == 0:
            dev_loss = valid_loss(pred_model, validation, next_validation_generator)
            if validation_loss is None or validation_loss > dev_loss:
                validation_loss = dev_loss
                # Save best model
                torch.save(pred_model.state_dict(), '/content/gdrive/My Drive/NL3.14/resources/prediction_model') 

            # NOTE(review): the scheduler is stepped with the running minimum, not the current
            # dev_loss — confirm this is intended.
            lr_scheduler.step(validation_loss)
            curr_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            # Reported train loss is the mean over the last 200 batches; perplexity = exp(dev_loss).
            print(f'Epoch {epoch} batch {i} | Batch Train Loss: {batch_loss/200:.6f} | Current validation Loss {dev_loss:.6f}| Minimal validation Loss {validation_loss:.6f} | Current Validation Perplexity {torch.exp(torch.tensor(dev_loss)):.6f} | Time passed {datetime.now() - t} | Current LR {curr_lr}')
            batch_loss = 0

# Pickle the embedding matrix to Drive once training has finished.
pred_model.persistEmbedWeights()

**LSTM Perplexity on test set**

In [15]:
# Test-set perplexity of the LSTM model: exp of the mean cross-entropy returned by valid_loss.
print(f'LSTM Perplexity on test set : {torch.exp(torch.tensor(valid_loss(pred_model, test, test_set_generator)))}')

Perplexity on test set: 1907.0517578125


In [31]:
# Rebuild the next-word loaders with one example per batch for the baseline model.
# NOTE(review): this reassigns the global batch_size again (previously 256).
batch_size = 1

train_params = {'batch_size': batch_size,
          'shuffle': True
}

next_training_set = PredictionDatasetTrain(trainX, train['y'].tolist())
next_training_generator = torch.utils.data.DataLoader(next_training_set, **train_params)

next_validation_set = PredictionDatasetTrain(validX, validation['y'].tolist())
next_validation_generator = torch.utils.data.DataLoader(next_validation_set, **train_params)

test_set = PredictionDatasetTrain(testX, test['y'].tolist())
test_set_generator = torch.utils.data.DataLoader(test_set, **train_params)

# Base Line Model
class NGramLM(nn.Module):
    """Feed-forward n-gram baseline: concatenated context embeddings -> ReLU hidden layer -> vocabulary logits.

    Relies on the module-level ``w2v`` for the pretrained embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size: number of output classes.
            embedding_dim: width of one embedding vector; must match the word2vec vectors.
            context_size: number of context words per example.
        """
        super(NGramLM, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(torch.tensor(w2v.wv.vectors), padding_idx=0)
        self.linear1 = nn.Linear(context_size * embedding_dim, 500)
        self.linear2 = nn.Linear(500, vocab_size)

    def forward(self, inputs):
        """Return vocabulary logits, one row per example.

        Accepts a single context of shape (context_size,) or a batch of shape
        (batch, context_size). The former ``view((1, -1))`` merged the whole batch into one
        row, so only batch size 1 worked; flattening per example gives the same result for
        batch size 1 and also supports larger batches.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() == 1:
            embeds = embeds.view((1, -1))
        else:
            embeds = embeds.view((inputs.shape[0], -1))
        out = nn.functional.relu(self.linear1(embeds))
        out = self.linear2(out)

        return out

# Build the baseline model (300-dim embeddings, 10-word context) and resume from its checkpoint.
baseline_model = NGramLM(vocab_size, 300, 10).to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint written on
# a GPU runtime still loads when the notebook falls back to CPU.
baseline_model.load_state_dict(torch.load('/content/gdrive/My Drive/NL3.14/resources/baseline_model', map_location=device))
baseline_model

NGramLM(
  (embeddings): Embedding(145271, 300, padding_idx=0)
  (linear1): Linear(in_features=3000, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=145271, bias=True)
)

In [34]:
# Loss, optimizer and LR schedule for the baseline model.
# NOTE: this rebinds the globals `criterion`, `optimizer` and `lr_scheduler` used by the LSTM cells.
# NOTE(review): ignore_index=0 drops every example whose target index is 0 from the loss;
# confirm index 0 is not a real target word.
criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = AdamW(baseline_model.parameters(), lr=1e-3, eps=1e-7)
# Halve the learning rate after 5 scheduler steps without improvement of the monitored value.
lr_scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5, min_lr=1e-9)


#Calculate loss on validation data
def ngram_valid_loss(model, dl, generator, loss_fn=None):
    """Return the mean per-batch loss of the baseline ``model`` over ``generator``.

    Args:
        model: module mapping a batch ``x`` to logits of shape (batch, classes).
        dl: kept for backward compatibility; no longer used. The batch count is taken from
            the batches actually iterated, not from ``len(dl) / batch_size``, which went
            wrong whenever the global ``batch_size`` did not match the loader.
        generator: iterable of ``(x, y)`` batches, ``y`` shaped (batch, 1).
        loss_fn: loss callable ``(logits, targets) -> scalar tensor``. Defaults to the
            module-level ``criterion``.

    Returns:
        float: sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if ``generator`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = criterion

    total_loss = 0.0
    n_batches = 0
    # Evaluation mode for the pass; train mode is restored even if a batch raises.
    model.eval()
    try:
        with torch.no_grad():  # no gradients needed, so skip storing intermediates
            for x, y in generator:
                preds = model(x)
                total_loss += loss_fn(preds, y.squeeze(1)).item()
                n_batches += 1
    finally:
        model.train()

    if n_batches == 0:
        raise ValueError("validation loader produced no batches")
    return total_loss / n_batches

In [None]:
# Train the baseline model; evaluate on the validation set every 50 batches and checkpoint
# whenever the validation loss improves.
# Switch model to train mode
baseline_model.train()

validation_loss = None  # lowest validation loss seen so far
for epoch in range(1, 50):
    batch_loss = 0  # running training loss since the last report
    # NOTE(review): epoch_acc and epoch_f_score are initialised but never read in this loop.
    epoch_acc = 0
    epoch_f_score = 0
    batches = math.ceil(len(train) / batch_size)
    print(batches, "batches")
    t = datetime.now()
    i = 0
    for x, y in next_training_generator:
        i+=1
        optimizer.zero_grad()
        output = baseline_model(x)
        # y arrives as (batch, 1); squeeze(1) yields the (batch,) class-index vector the loss expects
        loss = criterion(output, y.squeeze(1))
        loss.backward()
        optimizer.step()
        batch_loss += loss.item()
        
        if i % 50 == 0:
            # NOTE(review): this runs a full pass over the validation loader every 50 training steps.
            dev_loss = ngram_valid_loss(baseline_model, validation, next_validation_generator)
            if validation_loss is None or validation_loss > dev_loss:
                validation_loss = dev_loss
                # Save best model
                torch.save(baseline_model.state_dict(), '/content/gdrive/My Drive/NL3.14/resources/baseline_model') 

            # NOTE(review): the scheduler is stepped with the running minimum, not the current
            # dev_loss — confirm this is intended.
            lr_scheduler.step(validation_loss)
            curr_lr = [param_group['lr'] for param_group in optimizer.param_groups][0]
            # Reported train loss is the mean over the last 50 batches; perplexity = exp(dev_loss).
            print(f'Epoch {epoch} batch {i} | Batch Train Loss: {batch_loss/50:.6f} | Current validation Loss {dev_loss:.6f}| Minimal validation Loss {validation_loss:.6f} | Current Validation Perplexity {torch.exp(torch.tensor(dev_loss)):.6f} | Time passed {datetime.now() - t} | Current LR {curr_lr}')
            batch_loss = 0

**Linear Test Set Perplexity**

In [37]:
# Test-set perplexity of the baseline model: exp of the mean cross-entropy returned by ngram_valid_loss.
print(f'Linear Perplexity on test set : {torch.exp(torch.tensor(ngram_valid_loss(baseline_model, test, test_set_generator)))}')

Linear Perplexity on test set : 211320.109375
