# SNLI 706 project - Sequence analysis course

In [1]:
#Please make sure all imports work

#General imports
import os
import re
import json
import torch
from torch.utils import data
import pandas as pd
import datetime
from transformers import BertConfig
from torch.utils.tensorboard import SummaryWriter
import random
import numpy as np

#Project models imports
import params
import utils
from bert_model import BertTransformer
from training import train_snli
import transformer_model
import basic_rnn
import rnn_combined_model
import snli_dataset
import main_slni

In [2]:

# Data files - please make sure you have them.
TRAIN_DATA_FILE = './data/snli_1.0/snli_1.0_train.jsonl'
VAL_DATA_FILE = './data/snli_1.0/snli_1.0_dev.jsonl'
TEST_DATA_FILE = './data/snli_1.0/snli_1.0_test.jsonl'

# Needed directories - please make sure they exist.
MODELS_DIR = './saved_models/'
RUNS_DIR = './experiments/'
RESULTS_DIR = './results'

# Please make sure these files exist under MODELS_DIR.
# They are pre-computed (vocabulary and tokenized datapoints); without them the
# dataset takes a long time to process.
VOCAB_FILE = 'vocab_counter.pkl'
TRAIN_TOKENIZED_FILE = 'train_tokenized_datapoints.pickle'
VAL_TOKENIZED_FILE = 'val_tokenized_datapoints.pickle'

# Best models - saved checkpoints, one file per architecture.
# NOTE(review): all four constants originally pointed at best_basic_rnn.th
# (copy-paste). The three names below follow the same naming pattern; confirm
# they match the checkpoint files actually present on disk.
BEST_MODEL_BASIC_RNN = './saved_models/best_models/best_basic_rnn.th'
BEST_MODEL_TRANSFORMER = './saved_models/best_models/best_transformer.th'
BEST_MODEL_RNN_COMBINE = './saved_models/best_models/best_rnn_combine.th'
BEST_MODEL_BERT = './saved_models/best_models/best_bert.th'

In [3]:
# Seed every random number generator used in this notebook (PyTorch, Python's
# `random`, NumPy) with the same value so that results can be reproduced.
SEED = 42
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(SEED)



In [4]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


We first prepare the datasets for training and validation


In [5]:

# Build the training dataset from the train file and the cached vocabulary /
# tokenized datapoints stored under MODELS_DIR.
train_dataset = snli_dataset.SNLIDataset(
    data_path=TRAIN_DATA_FILE,
    saved_dir=MODELS_DIR,
    device=device,
    vocab_file=VOCAB_FILE,
    tokenized_datapoints_file=TRAIN_TOKENIZED_FILE,
)
print(f"Train dataset size: {len(train_dataset)} ")

# The validation dataset receives the training vocabulary as its external vocab.
train_vocab = train_dataset.vocab
val_dataset = snli_dataset.SNLIDataset(
    data_path=VAL_DATA_FILE,
    saved_dir=MODELS_DIR,
    device=device,
    vocab_external=train_vocab,
    tokenized_datapoints_file=VAL_TOKENIZED_FILE,
)
print(f"Validation dataset size: {len(val_dataset)} ")

# Count the number of training datapoints per label (the label is element 2 of
# each datapoint); any label other than these three is ignored.
label_tally = {"neutral": 0, "contradiction": 0, "entailment": 0}
for example in train_dataset.datapoints:
    gold_label = example[2]
    if gold_label in label_tally:
        label_tally[gold_label] += 1

# Keep the notebook-level counter names unchanged ("count_neural" holds the
# "neutral" tally).
count_neural = label_tally["neutral"]
count_entailment = label_tally["entailment"]
count_contradiction = label_tally["contradiction"]

print(f"Count neutral: {count_neural}")
print(f"Count contradiction: {count_contradiction}")
print(f"Count entailment: {count_entailment}")

Train dataset size: 549367 
Validation dataset size: 9842 
Count neutral: 182764
Count contradiction: 183187
Count entailment: 183416


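We can see that the training set is well balanced: each of the three labels (neutral, contradiction, entailment) covers roughly one third of the examples.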

In [6]:
# Show the first two training datapoints: sentence pair (elements 0 and 1) and
# label (element 2).
print("Data examples")
for example_idx in (0, 1):
    example = train_dataset.datapoints[example_idx]
    # A blank line follows the first example only, to separate it from the second.
    trailer = "\n" if example_idx == 0 else ""
    print(f"First sentence:{example[0]}")
    print(f"Second sentence:{example[1]}")
    print(f"Label:{example[2]}{trailer}")


Data examples
First sentence:A person on a horse jumps over a broken down airplane.
Second sentence:A person is training his horse for a competition.
Label:neutral

First sentence:A person on a horse jumps over a broken down airplane.
Second sentence:A person is at a diner, ordering an omelette.
Label:contradiction


In [7]:
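#One-off step, kept for reference: uncomment to (re)write the tokenized-datapoint pickles under ./saved_models/.
#utils.save_to_pickle(train_dataset.tokenized_datapoints, './saved_models/train_tokenized_datapoints.pickle')
#utils.save_to_pickle(val_dataset.tokenized_datapoints, './saved_models/val_tokenized_datapoints.pickle')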

### Train Basic RNN model


In [8]:
# Print the basic-RNN settings that stay constant across all tuning runs
# (they are passed unchanged to the training call below).
print(params.RNN_CONFIG_CONSTANT_CONFIG)

# Hyperparameter grid for the basic RNN: every combination of embedding type
# ('glove' / None) and learning rate (0.003 / 0.001), one epoch each, no
# checkpoint. The run name is derived from the two tuned values.
RNN_CONFIG_LIST = [
    {
        'run_name': f'lr:{learning_rate}_embedding:{embedding}',
        'num_epochs': 1,
        'lr': learning_rate,
        'embedding_type': embedding,
        'checkpoint': None,
    }
    for embedding in ('glove', None)
    for learning_rate in (0.003, 0.001)
]

{'num_classes': 3, 'hidden_size': 512, 'embeddings_dim': 300, 'hidden_pre2_classifier_linear_dim': 256, 'hidden_pre1_classifier_linear_dim': 64, 'pad_token': '<pad>', 'dropout_rate': 0.1}


In [None]:
# Run basic-RNN training over RNN_CONFIG_LIST with the shared constant config.
# The return value is pickled in the next cell (presumably a pandas DataFrame,
# judging by the `_pd` suffix - confirm in main_slni).
BasicRNN_results_pd=main_slni.train_BasicRNN(train_dataset, val_dataset, RNN_CONFIG_LIST, device, params.RNN_CONFIG_CONSTANT_CONFIG)


Start Training BasicRNN lr:0.003_embedding:glove 
Step 1000/17168 Train running loss: 0.68
Step 2000/17168 Train running loss: 0.81
Step 3000/17168 Train running loss: 0.88
Step 4000/17168 Train running loss: 0.79
Step 5000/17168 Train running loss: 0.95
Step 6000/17168 Train running loss: 0.79
Step 7000/17168 Train running loss: 0.79


In [None]:
# Save the basic-RNN results to a pickle file (nothing is displayed here).
utils.save_to_pickle(BasicRNN_results_pd, './saved_models/basic_rnn_results_df.pickle')

In [None]:
# Print the RNN-combine settings that stay constant across all tuning runs
# (they are passed unchanged to the training call below).
print(params.RNN_COMBINE_CONSTANT_CONFIG)

# Hyperparameter grid for the RNN-combine model: every combination of embedding
# type ('glove' / None) and learning rate (0.0003 / 0.0001), one epoch each, no
# checkpoint. The run name is derived from the two tuned values.
RNN_COMBINE_CONFIG_LIST = [
    {
        'run_name': f'lr:{learning_rate}_embedding:{embedding}',
        'num_epochs': 1,
        'lr': learning_rate,
        'embedding_type': embedding,
        'checkpoint': None,
    }
    for embedding in ('glove', None)
    for learning_rate in (0.0003, 0.0001)
]

In [None]:
# Run RNN-combine training over RNN_COMBINE_CONFIG_LIST with the shared constant
# config. The return value is pickled in the next cell (presumably a pandas
# DataFrame, judging by the `_pd` suffix - confirm in main_slni).
RNNCombine_results_pd=main_slni.train_RNNCombine(train_dataset, val_dataset, RNN_COMBINE_CONFIG_LIST, device, params.RNN_COMBINE_CONSTANT_CONFIG)

In [None]:
# Save the RNN-combine results to a pickle file.
utils.save_to_pickle(RNNCombine_results_pd, './saved_models/rnn_combine_results_df.pickle')

In [None]:
# Print the Transformer settings that stay constant across all tuning runs
# (they are passed unchanged to the training call below).
print(params.TRANSFORMER_CONSTANT_CONFIG)

# Hyperparameter grid for the Transformer: learning rates 5e-5 and 3e-5, each
# with embedding_type 'glove' and with embedding_type None (one epoch, no
# checkpoint). Each run_name must be unique and describe its own settings: the
# two embedding_type=None entries previously reused the "...embedding:glove"
# names, so runs were mislabelled and names collided.
TRANSFORMER_CONFIG_LIST=[{'run_name': 'lr:5e-5_embedding:glove', 'num_epochs':1, 'lr':5e-5, 'embedding_type':'glove', 'checkpoint':None},
                         {'run_name': 'lr:3e-5_embedding:glove', 'num_epochs':1, 'lr':3e-5, 'embedding_type':'glove', 'checkpoint':None},
                         {'run_name': 'lr:5e-5_embedding:None', 'num_epochs':1, 'lr':5e-5, 'embedding_type':None, 'checkpoint':None},
                         {'run_name': 'lr:3e-5_embedding:None', 'num_epochs':1, 'lr':3e-5, 'embedding_type':None, 'checkpoint':None}]


In [None]:
# Run Transformer training over TRANSFORMER_CONFIG_LIST with the shared constant
# config. The return value is pickled in the next cell (presumably a pandas
# DataFrame, judging by the `_pd` suffix - confirm in main_slni).
Transformer_results_pd=main_slni.train_Transformer(train_dataset, val_dataset, TRANSFORMER_CONFIG_LIST, device,params.TRANSFORMER_CONSTANT_CONFIG)

In [None]:
# Save the Transformer results to a pickle file.
utils.save_to_pickle(Transformer_results_pd, './saved_models/transformer_results_df.pickle')

In [None]:
# Print the BERT settings that are shared by all BERT runs
# (they are forwarded to the training call below).
print(params.BERT_CONSTANT_CONFIG)


# Learning rates to try for the BERT runs (one epoch each, no checkpoint).
# Each run_name must match the run's own 'lr': the second entry was previously
# also labelled ' lr:5e-5' although it uses lr=3e-5, so the two runs shared a
# name and the second was mislabelled.
BERT_CONFIG_LIST=[{'run_name': ' lr:5e-5', 'num_epochs':1, 'lr':5e-5,'embedding_type': 'bert', 'checkpoint': None},
                  {'run_name': ' lr:3e-5', 'num_epochs':1, 'lr':3e-5,'embedding_type': 'bert', 'checkpoint': None}]

In [None]:
# Run BERT training over BERT_CONFIG_LIST. The return value is pickled in the
# next cell.
# NOTE(review): unlike the other train_* calls, which pass their constant config
# as one positional dict, this call unpacks it into keyword arguments with `**`
# - confirm this matches the signature of main_slni.train_Bert.
BERT_results_pd=main_slni.train_Bert(train_dataset, val_dataset, BERT_CONFIG_LIST, device,**params.BERT_CONSTANT_CONFIG)


In [None]:
# Save the BERT results to a pickle file.
utils.save_to_pickle(BERT_results_pd, './saved_models/bert_results_df.pickle')

In [None]:
#SHOW RESULTS

In [None]:
#Run best results
#Combine and show comparison graphs

#Attention graphs


