In [1]:
# Colab-only step: mount Google Drive so the CSV files read by later cells are
# reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [83]:
import collections
import os
import string
import timeit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
import torch.nn as nn
from IPython.display import HTML, IFrame
from textblob import TextBlob
from torchtext import data

In [84]:
import random
# Seed every random number generator used in this notebook (Python, NumPy and
# PyTorch) from one constant so repeated runs start from the same state.
# NOTE: the train/validation split cell further down re-seeds `random` with a
# different value (202).
SEED  = 2020
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [85]:
def generate_bigrams(x):
    """Append every distinct bigram of ``x`` to ``x`` and return it.

    ``x`` is a list of tokens. Each pair of neighbouring tokens is joined with
    a single space; duplicates are collapsed before appending. The list is
    modified in place and the same object is returned. The bigrams come out of
    a set, so their relative order is not defined.
    """
    unique_pairs = set(zip(x, x[1:]))
    x.extend(' '.join(pair) for pair in unique_pairs)
    return x

In [129]:
# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call, since the tokenizer runs on each CSV row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def tokenizer(x):
    """Split ``x`` into whitespace-separated tokens after removing punctuation.

    ``x`` is converted with ``str()`` first, so non-string cell values are
    accepted. Returns a list of strings; an input with no token characters
    gives an empty list. Case is left untouched.
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    return str(x).translate(_PUNCT_TABLE).split()

# Step 1: define the torchtext fields.
# ID carries the test-set row identifier so predictions can be matched to rows.
# NOTE(review): ID uses the default (sequential) Field settings; presumably the
# identifiers contain no whitespace - confirm.
ID = data.Field()
# Text is lower-cased and split with `tokenizer`; include_lengths=True makes
# `batch.text` a (tokens, lengths) pair, which the train/eval loops unpack.
TEXT = data.Field(sequential=True, lower=True, tokenize=tokenizer,include_lengths = True)
LABEL = data.LabelField()

print("loading from csv ...")
# The first CSV column is mapped to (None, None) and therefore not loaded; only
# 'text' and 'label' end up on each example (see the printed keys).
tv_datafields = [(None,None),("text", TEXT), ("label", LABEL)]

# Step 2: build the datasets from the CSV files on the mounted drive.
# NOTE(review): the paths are relative, so they depend on the working directory
# (presumably /content on Colab) - confirm.
train_data = data.TabularDataset(path='drive/MyDrive/tunguz/Train.csv',format="csv",
                                                skip_header=True, fields=tv_datafields,)
# The test file has no label column: only the ID and the text are read.
test_data = data.TabularDataset(path='drive/MyDrive/tunguz/Test.csv', format="csv",
                                                skip_header=True, fields=[('ID',ID),("text", TEXT)])
print(train_data[0].__dict__.keys())

loading from csv ...
dict_keys(['text', 'label'])


In [130]:
# Sanity check: number of examples loaded from each CSV file.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 70000
Number of testing examples: 30000


In [131]:
# Inspect one training example: its token list and its (string) label.
print(vars(train_data.examples[2]))

{'text': ['bereau', 'degage', 'nathef', 'ya', 'slim', 'walahi', 'ya7chiw', 'fih', 'jma3a', 'lem3amel', '3lihom', 'walah', 'kit', 'jib', 'messi', 'lana3mlou', 'chay', '7amlet', 'nathafa', 'fil', 'bureaux', 'ca', 'jam3iya', '3ari9a', 'mel', '3am', '94', 'bdet', 'da5la', 'fi', '7it', 'choufelna', 'hal', 'mochkla', 'belehi', 'te5na9na', 'mel', 'fada', 'tous', 'les', 'équipe', 'mergine', 'fina', 'ken', 'jit', 'kifek', 'walah', 'maye5lsouch', 'wi3adiw', '3am', 'jaych', 'bech', 'yetrabaw', 'elkoura', 'fi', 'se9ik', 'enti', 'en9eth', 'jam3iya', 'ya', 'slim', 'wna3ref', 'tnajim', 'ta3melha'], 'label': '-1'}


In [132]:
import random

# `random.seed()` returns None, so passing its result as `random_state` handed
# None to `split` and only seeded the shuffle through a side effect. Seed
# explicitly and pass the resulting generator state instead.
# NOTE: this rebinds `train_data`; re-running the cell would split the already
# reduced training set a second time.
random.seed(202)
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.getstate())

In [133]:
# Sizes after the 90/10 split of the training data; the test set is unchanged.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 63000
Number of validation examples: 7000
Number of testing examples: 30000


In [134]:
# Inspect one validation example to confirm it carries both text and label.
print(vars(valid_data.examples[2]))

{'text': ['billahi', 'bara', 'ara', 'imninn', 'itjiboo', 'fi', 'iflouskimm', 'ow', 'cof', '7ata', 'min', 'jimhorr', 'karhik', 'ow', 'ikrah', 'iflousik', 'ow', 'fadd', 'min', '7aja', 'manirb7oo', 'ila', 'mana3tiw', 'liflous', 'kima', 'ilyoma', 'il', 'pilanti', 'ili', 'ta3t', 'lil', 'ca', 'mafamach', 'minha', 'ow', 'zid', 'intom', 'dija', 'itla3too', 'lil', 'play', 'off', 'ila', 'bil', 'iflouss', 'bara', 'i9bal', 'ma', 'ta7ki', '3ala', 'jimhor', 'ca', 'aana', 'in7ib', 'css', 'lakin', 'jomhorr', 'ca', 'ow', 'howa', 'johorr', 'ikbirrr', 'ara', 'rou7ikk', 'kifach', 'tik4ibb', '3alihimm', 'winti', 'ow3ithimm', 'bil', 'boutoula', 'fil', 'lo5ir', 'ma9aw', 'chinn', 'ow', 'ara', 'rou7ik', 'bama', 'tari9a', 'racha7t', 'lifri9i', 'lil', 'play', 'off'], 'label': '-1'}


In [135]:
# Upper bound on the number of text tokens kept in the vocabulary; the size
# printed below (20002) additionally counts the '<unk>' and '<pad>' entries.
MAX_VOCAB_SIZE = 20_000

# Text and label vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
# IDs are numericalised like any other field, so they need their own vocabulary
# to be mapped back to the original strings after prediction.
ID.build_vocab(test_data)

In [136]:
# Label string -> class index mapping; the submission cells rely on it to turn
# predicted indices back into labels.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f520cd5e488>, {'1': 0, '-1': 1, '0': 2})


In [137]:
# Vocabulary size, special tokens included; used later as the embedding input size.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")


Unique tokens in TEXT vocabulary: 20002


In [138]:
# The 20 most frequent training tokens with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('w', 17581), ('ya', 8112), ('fi', 7987), ('el', 7022), ('slim', 6401), ('rabi', 5801), ('si', 3263), ('ca', 2833), ('l', 2808), ('kol', 2806), ('bech', 2695), ('bravo', 2646), ('ma', 2490), ('m3ak', 2473), ('3la', 2443), ('ken', 2408), ('la', 2350), ('allah', 2220), ('slouma', 2035), ('il', 1977)]


In [139]:
# First entries of the index -> token table; '<unk>' and '<pad>' come first.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'w', 'ya', 'fi', 'el', 'slim', 'rabi', 'si', 'ca']


In [140]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Iterators for the labelled splits.
# NOTE(review): no `sort_key` is given and sorting is disabled, so presumably
# batches are not grouped by length (the model packs with enforce_sorted=False
# to cope with that) - confirm whether length bucketing was intended.
train_iterator, valid_iterator, = data.BucketIterator.splits(
    (train_data, valid_data), sort=False,
    batch_size = BATCH_SIZE,
    device = device)
# The test iterator runs in evaluation mode (train=False) without any sorting.
test_iterator = data.Iterator(test_data,batch_size=BATCH_SIZE,
                              device=device,train=False,sort=False,
                              sort_within_batch=False)

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout that masks whole feature channels of a (N, T, K) tensor.

    The input is rearranged to (N, K, 1, T) so that ``nn.Dropout2d`` sees each
    of the K features as a channel, then restored to (N, T, K).
    """

    def forward(self, x):
        x = x.unsqueeze(2)         # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super().forward(x)     # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        return x.squeeze(2)        # (N, T, K)


class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs over frozen pre-trained embeddings.

    Args:
        embedding_matrix: 2-D array of shape (vocabulary size, embedding size)
            with the pre-trained vectors. It is copied into an embedding layer
            whose weights are not trained.
        num_aux_targets: number of auxiliary outputs produced next to the
            single main output.
        lstm_units: hidden size of each LSTM direction. This parameter replaces
            the module-level ``LSTM_UNITS`` constant the class used to read,
            which is not defined anywhere in this notebook.

    ``forward(x)`` takes token indices of shape (N, T) and returns a tensor of
    shape (N, 1 + num_aux_targets): the main output followed by the auxiliary
    ones.
    """

    def __init__(self, embedding_matrix, num_aux_targets, lstm_units=128):
        super().__init__()
        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        embed_size = weights.shape[1]
        # forward() concatenates max- and mean-pooled outputs of a bidirectional
        # LSTM (2 poolings * 2 directions * lstm_units features) and adds the
        # dense-layer outputs to that vector, so the dense width is fixed by
        # the LSTM width rather than being a free constant.
        dense_hidden_units = 4 * lstm_units

        # The vocabulary size is taken from the matrix itself; the weights are frozen.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, lstm_units, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units * 2, lstm_units, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(dense_hidden_units, dense_hidden_units)
        self.linear2 = nn.Linear(dense_hidden_units, dense_hidden_units)

        self.linear_out = nn.Linear(dense_hidden_units, 1)
        self.linear_aux_out = nn.Linear(dense_hidden_units, num_aux_targets)

    def forward(self, x):
        h_embedding = self.embedding_dropout(self.embedding(x))

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling and global max pooling over the time axis
        avg_pool = torch.mean(h_lstm2, 1)
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        # torch.relu is used so the block needs no `torch.nn.functional` alias.
        h_conc_linear1 = torch.relu(self.linear1(h_conc))
        h_conc_linear2 = torch.relu(self.linear2(h_conc))

        # residual sum of the pooled features and both dense branches
        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        return torch.cat([result, aux_result], 1)

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> stacked (optionally bidirectional) LSTM -> linear classifier.

    NOTE: a class with the same name is defined again in the following cell and
    shadows this one when the notebook is run top to bottom.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability used on the embeddings, between LSTM
            layers and on the final hidden state.
        pad_idx: index of the padding token in the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # inter-layer dropout only exists with 2+ layers
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier sees one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return ``[batch size, output dim]`` scores.

        ``text`` is ``[sent len, batch size]`` token indices and
        ``text_lengths`` holds the unpadded length of each sequence.
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # Pack so the LSTM skips padded positions; lengths must live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), enforce_sorted=False)

        # hidden = [num layers * num directions, batch size, hid dim]
        # The per-step outputs are not needed, so they are never unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # last layer: forward state is hidden[-2], backward state is hidden[-1]
            final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_hidden = hidden[-1]

        # final_hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(final_hidden))

In [141]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> stacked (optionally bidirectional) LSTM/GRU -> linear classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: recurrent hidden size per direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked recurrent layers.
        bidirectional: whether the sequence is also read backwards.
        dropout: dropout probability used on the embeddings, between recurrent
            layers and on the final hidden state.
        pad_idx: index of the padding token in the vocabulary.
        rnn_type: ``"lstm"`` (default, the previous behaviour) or ``"gru"``;
            replaces the commented-out GRU variant that had to be swapped in
            by hand.

    Raises:
        ValueError: if ``rnn_type`` is neither ``"lstm"`` nor ``"gru"``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx, rnn_type="lstm"):
        super().__init__()

        recurrent_layers = {"lstm": nn.LSTM, "gru": nn.GRU}
        kind = str(rnn_type).lower()
        if kind not in recurrent_layers:
            raise ValueError(f"rnn_type must be 'lstm' or 'gru', got {rnn_type!r}")

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = recurrent_layers[kind](embedding_dim,
                                          hidden_dim,
                                          num_layers=n_layers,
                                          bidirectional=bidirectional,
                                          # inter-layer dropout needs 2+ layers
                                          dropout=dropout if n_layers > 1 else 0.0)
        # The classifier sees one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return ``[batch size, output dim]`` scores.

        ``text`` is ``[sent len, batch size]`` token indices and
        ``text_lengths`` holds the unpadded length of each sequence.
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # Pack so the recurrent layer skips padded positions; lengths must be on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), enforce_sorted=False)

        # The per-step outputs are not needed, so they are never unpacked.
        _, state = self.rnn(packed_embedded)
        # An LSTM returns (hidden, cell); a GRU returns the hidden state alone.
        hidden = state[0] if isinstance(state, tuple) else state
        # hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # last layer: forward state is hidden[-2], backward state is hidden[-1]
            final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_hidden = hidden[-1]

        # final_hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(final_hidden))

In [142]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)      # embedding rows: one per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256                 # hidden size per direction
OUTPUT_DIM = 3                   # one score per label ('1', '-1', '0')
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token, passed to the embedding layer as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [143]:
def count_parameters(model):
    """Return the number of scalar values in ``model``'s trainable parameters.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of the model built above (thousands separator for readability).
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,311,883 trainable parameters


In [150]:
import torch.optim as optim

# Adam over all model parameters, with its default settings.
optimizer = optim.Adam(model.parameters())

# Multi-class loss applied to the raw scores returned by the model.
criterion = nn.CrossEntropyLoss()

# Move the model and the loss to the device selected earlier.
# NOTE(review): the optimizer is created before the model is moved; this is
# presumably fine because `.to()` moves parameters in place - confirm.
model = model.to(device)
criterion = criterion.to(device)

In [151]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: tensor of class scores whose dim 1 indexes the classes.
        y: tensor of true class indices, one per row of ``preds``.

    Returns:
        A one-element float tensor on the same device as the inputs. The
        previous version forced the result onto CUDA and therefore crashed on
        CPU-only machines, although the notebook falls back to the CPU.
    """
    predicted_classes = preds.argmax(dim=1)  # index of the highest score per row
    correct = predicted_classes.eq(y)
    # unsqueeze keeps the one-element shape callers have always received
    return correct.sum().float().div(y.shape[0]).unsqueeze(0)

In [152]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update ``model``.

    Each batch must expose ``text`` as a (tokens, lengths) pair and ``label``.
    Returns ``(loss, accuracy)``, each the plain mean of the per-batch values.
    """
    model.train()

    loss_sum, acc_sum = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        tokens, lengths = batch.text
        targets = batch.label

        scores = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(scores, targets)
        batch_acc = categorical_accuracy(scores, targets)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [153]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    The model is put in evaluation mode and gradients are disabled. Each batch
    must expose ``text`` as a (tokens, lengths) pair and ``label``. Returns
    ``(loss, accuracy)``, each the plain mean of the per-batch values.
    """
    model.eval()

    loss_sum, acc_sum = 0, 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label

            scores = model(tokens, lengths).squeeze(1)

            loss_sum += criterion(scores, targets).item()
            acc_sum += categorical_accuracy(scores, targets).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [154]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns ``(minutes, seconds)`` as integers; fractions of a second are dropped.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [155]:
# Number of passes over the training data for the LSTM run.
N_EPOCHS = 10 #LSTM

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch (by validation loss) on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 29s
	Train Loss: 0.620 | Train Acc: 71.89%
	 Val. Loss: 0.575 |  Val. Acc: 74.90%
Epoch: 02 | Epoch Time: 0m 29s
	Train Loss: 0.580 | Train Acc: 74.18%
	 Val. Loss: 0.540 |  Val. Acc: 76.73%
Epoch: 03 | Epoch Time: 0m 29s
	Train Loss: 0.548 | Train Acc: 76.19%
	 Val. Loss: 0.512 |  Val. Acc: 78.13%
Epoch: 04 | Epoch Time: 0m 29s
	Train Loss: 0.520 | Train Acc: 77.48%
	 Val. Loss: 0.505 |  Val. Acc: 78.53%
Epoch: 05 | Epoch Time: 0m 29s
	Train Loss: 0.500 | Train Acc: 78.59%
	 Val. Loss: 0.498 |  Val. Acc: 79.28%
Epoch: 06 | Epoch Time: 0m 29s
	Train Loss: 0.481 | Train Acc: 79.71%
	 Val. Loss: 0.497 |  Val. Acc: 79.11%
Epoch: 07 | Epoch Time: 0m 29s
	Train Loss: 0.462 | Train Acc: 80.46%
	 Val. Loss: 0.492 |  Val. Acc: 79.54%
Epoch: 08 | Epoch Time: 0m 29s
	Train Loss: 0.447 | Train Acc: 81.31%
	 Val. Loss: 0.488 |  Val. Acc: 79.74%
Epoch: 09 | Epoch Time: 0m 29s
	Train Loss: 0.429 | Train Acc: 82.03%
	 Val. Loss: 0.503 |  Val. Acc: 79.91%
Epoch: 10 | Epoch T

In [None]:
def predict(model, iterator, criterion):
    """Return the numericalised IDs and predicted class indices for ``iterator``.

    Each batch must expose ``text`` as a (tokens, lengths) pair and ``ID``.
    ``criterion`` is accepted but not used. The result is ``(ids, labels)``:
    two flat Python lists built batch by batch in iteration order.
    """
    model.eval()

    row_ids, labels = [], []

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            scores = model(tokens, lengths).squeeze(1)
            best_class = scores.argmax(1, keepdim=True)

            labels.extend(best_class.view(-1).cpu().tolist())
            row_ids.extend(batch.ID.view(-1).cpu().tolist())

    return row_ids, labels

In [51]:
# Restore the checkpoint the training loop saved at its lowest validation loss.
model.load_state_dict(torch.load('tut1-model.pt'))

# `criterion` is passed for signature compatibility; predict() does not use it.
test_,pred= predict(model,test_iterator, criterion)

In [52]:
# Map the numericalised IDs back to their original strings.
# NOTE: this rebinds `test_`; re-running the cell would index the vocabulary
# with strings instead of integers.
test_ = [ID.vocab.itos[i] for i in test_]

In [53]:
# Turn predicted class indices back into the original labels through the label
# vocabulary. The previous hand-written table ({2: 0, 0: 1, 1: -1}) was only
# right for one particular index assignment; the lookup is right for any.
decoded_labels = [int(LABEL.vocab.itos[class_idx]) for class_idx in pred]
submission = pd.DataFrame({'ID': test_, 'label': decoded_labels})
submission.to_csv('lstm.csv', index=False)

In [62]:
# Second training run; same loop as the earlier training cell.
# NOTE(review): the logged loss restarts near 0.69 and the results are saved as
# 'gru.csv', so the model was presumably re-created (as a GRU?) before this
# run - confirm. Checkpoints go to the same 'tut1-model.pt' path as the earlier
# run, so any file already there is overwritten.
N_EPOCHS = 10

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch (by validation loss) on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 27s
	Train Loss: 0.691 | Train Acc: 67.05%
	 Val. Loss: 0.597 |  Val. Acc: 73.64%
Epoch: 02 | Epoch Time: 0m 27s
	Train Loss: 0.606 | Train Acc: 73.23%
	 Val. Loss: 0.546 |  Val. Acc: 76.71%
Epoch: 03 | Epoch Time: 0m 26s
	Train Loss: 0.561 | Train Acc: 75.59%
	 Val. Loss: 0.517 |  Val. Acc: 78.06%
Epoch: 04 | Epoch Time: 0m 27s
	Train Loss: 0.522 | Train Acc: 77.79%
	 Val. Loss: 0.504 |  Val. Acc: 78.39%
Epoch: 05 | Epoch Time: 0m 27s
	Train Loss: 0.491 | Train Acc: 79.41%
	 Val. Loss: 0.489 |  Val. Acc: 79.49%
Epoch: 06 | Epoch Time: 0m 27s
	Train Loss: 0.466 | Train Acc: 80.65%
	 Val. Loss: 0.481 |  Val. Acc: 79.97%
Epoch: 07 | Epoch Time: 0m 27s
	Train Loss: 0.445 | Train Acc: 81.79%
	 Val. Loss: 0.481 |  Val. Acc: 80.06%
Epoch: 08 | Epoch Time: 0m 27s
	Train Loss: 0.425 | Train Acc: 82.63%
	 Val. Loss: 0.480 |  Val. Acc: 80.36%
Epoch: 09 | Epoch Time: 0m 27s
	Train Loss: 0.408 | Train Acc: 83.51%
	 Val. Loss: 0.484 |  Val. Acc: 80.48%
Epoch: 10 | Epoch T

In [63]:
def predict(model, iterator, criterion):
    """Collect numericalised IDs and predicted class indices over ``iterator``.

    Batches need a ``text`` attribute holding (tokens, lengths) and an ``ID``
    attribute. ``criterion`` is part of the signature but is never read.
    Returns ``(ids, labels)`` as two flat lists in iteration order.
    """
    model.eval()

    collected_ids = []
    collected_labels = []

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            scores = model(tokens, lengths).squeeze(1)
            winners = scores.argmax(1, keepdim=True).view(-1)

            collected_labels.extend(winners.cpu().tolist())
            collected_ids.extend(batch.ID.view(-1).cpu().tolist())

    return collected_ids, collected_labels

In [64]:
# Restore the checkpoint the training loop saved at its lowest validation loss.
model.load_state_dict(torch.load('tut1-model.pt'))

# `criterion` is passed for signature compatibility; predict() does not use it.
test_,pred= predict(model,test_iterator, criterion)
# Map the numericalised IDs back to their original strings.
test_ = [ID.vocab.itos[i] for i in test_]

In [65]:
# Turn predicted class indices back into the original labels through the label
# vocabulary. The previous hand-written table ({2: 0, 0: 1, 1: -1}) was only
# right for one particular index assignment; the lookup is right for any.
decoded_labels = [int(LABEL.vocab.itos[class_idx]) for class_idx in pred]
submission = pd.DataFrame({'ID': test_, 'label': decoded_labels})
submission.to_csv('gru.csv', index=False)

In [None]:
# Reload the best checkpoint and score it once more.
model.load_state_dict(torch.load('tut1-model.pt'))

# The metrics are computed on the validation iterator (the test file has no
# labels), so they are reported as validation figures; they were previously
# printed as "Test". The variable names are kept in case other cells read them.
test_loss, test_acc = evaluate(model,valid_iterator, criterion)

print(f'Val. Loss: {test_loss:.3f} | Val. Acc: {test_acc*100:.2f}%')

Test Loss: 0.496 | Test Acc: 79.06%
