# ULMFiT + Siamese Network for Sentence Vectors
## Part One: Tokenizing
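This notebook tokenizes the sentences from the SNLI dataset for use in the next notebook. It also builds labelled document pairs from the PAN14 essay problems and saves them to `data/train.pkl` and `data/val.pkl`.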

### You must have the fastai library installed

In [4]:
# import dependencies
%matplotlib inline
from fastai.text import *
import json
import html
import re
import pickle
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F
import time
import math
import sys
import data
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.io.json import json_normalize #package for flattening json in pandas df
from IPython.display import display
from sklearn import metrics
import os
import logging
from ipyexperiments import IPyExperimentsPytorch
from ipygpulogger import IPyGPULogger
import itertools
import joblib


# Notebook-wide logging format and seaborn plot style.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
sns.set_style("darkgrid")

# Directories of the PAN14 English-essay problems (note the trailing slash).
TRAINDATAPATH = "data/PAN14/pan14_train_english-essays/"
TESTDATAPATH = "data/PAN14/pan14_test01_english-essays/"

# Column names of the per-problem documents: the known documents, and the
# known documents followed by the unknown one.
KNOWN = ['known01', 'known02', 'known03', 'known04', 'known05']
FNAMES = KNOWN + ['unknown']

# Output directory for token files and root directory of the SNLI corpus.
token_files = './data/tokens/'
snli_root = './data/snli_1.0/'

In [5]:
def permute(row):
    """Return all 2-combinations of the non-missing known documents in ``row``.

    Each column named in ``KNOWN`` is looked up in ``row``; values that are
    ``None`` are skipped. The result is a list of 2-tuples in the order
    produced by ``itertools.combinations``.
    """
    present = []
    for column in KNOWN:
        value = row[column]
        if value is not None:
            present.append(value)
    return list(itertools.combinations(present, 2))

def match_unknowns(path):
    """Derive extra negative ('N') rows from problems that share an unknown text.

    The problem set under ``path`` is loaded with ``read_dataset``. Problems are
    grouped by their ``unknown`` document; for every group with more than one
    problem, the known documents of its 'Y' problems are combined with each
    known document of its 'N' problems, which takes the place of ``unknown``.
    'Y' and 'N' problems are matched by their position inside the group.

    Args:
        path: Directory of the problem set, as accepted by ``read_dataset``.

    Returns:
        DataFrame with the columns known01..known05, unknown and answer
        (always 'N'). Rows without an ``unknown`` value are dropped. When no
        problems share an unknown text the frame is empty but still has all
        seven columns.
    """
    known_cols = ['known01', 'known02', 'known03', 'known04', 'known05']

    # Same loading logic as read_dataset, so reuse it instead of copying it.
    ds, _ = read_dataset(path)

    # Keep only the groups in which several problems have the same unknown text.
    dupes = [group for _, group in ds.groupby(['unknown']) if len(group.index) > 1]

    # Collect the pieces and concatenate once; appending to a DataFrame inside
    # the loop is quadratic and DataFrame.append no longer exists in pandas 2.
    frames = []
    for dupe in dupes:
        yes = dupe.loc[dupe.answer == "Y"].reset_index(drop=True)
        no = dupe.loc[dupe.answer == "N"].reset_index(drop=True)
        for col in known_cols:
            # no[col] is a Series and therefore never None; missing documents
            # are removed by the dropna() below instead.
            frames.append(pd.DataFrame(data={'known01': yes.known01, 'known02': yes.known02,
                                             'known03': yes.known03, 'known04': yes.known04,
                                             'known05': yes.known05, 'unknown': no[col],
                                             'answer': 'N'}))

    if not frames:
        return pd.DataFrame(columns=known_cols + ['unknown', 'answer'])

    newrows = pd.concat(frames, sort=False)
    return newrows.dropna(subset=['unknown'])

def read_dataset(path):
    """Load a problem set into a DataFrame plus a flat list of its documents.

    ``path`` must contain ``truth.json`` with a ``problems`` list whose entries
    have ``name`` and ``answer`` fields, and one sub-directory per problem
    holding that problem's document files.

    Args:
        path: Directory of the problem set, with or without a trailing slash.

    Returns:
        ``(ds, docs)``. ``ds`` is indexed by problem name with the columns
        known01..known05, unknown and answer; a document cell holds the
        stripped file text, or ``None`` when no such file was found. ``docs``
        is a list of every stripped document text in the order the files
        were read.
    """
    doc_cols = ['known01', 'known02', 'known03', 'known04', 'known05', 'unknown']

    # pandas >= 1.0 exposes json_normalize at the top level; older versions
    # only provide the pandas.io.json name imported at the top of the notebook.
    normalize = getattr(pd, 'json_normalize', None) or json_normalize

    ds = pd.read_json(os.path.join(path, 'truth.json'))
    ds = normalize(ds['problems'])
    for col in doc_cols:
        ds[col] = None
    # Work on an independent copy so the .loc assignments below are unambiguous.
    ds = ds.set_index('name', drop=True)[doc_cols + ['answer']].copy()

    docs = []
    root = os.path.normpath(path)
    for dirpath, _subdirs, filenames in os.walk(path):
        if os.path.normpath(dirpath) == root:
            continue  # files directly under `path` (e.g. truth.json) are not documents
        # Take the problem name from the directory itself rather than from a
        # positional index into the top-level listing.
        problem = os.path.basename(os.path.normpath(dirpath))
        for fname in filenames:
            with open(os.path.join(dirpath, fname), 'r') as f:
                doc = f.read().strip()
            docs.append(doc)
            # The column is the file name without its extension.
            ds.loc[problem, os.path.splitext(fname)[0]] = doc

    return ds, docs


# Load the training problems (keeping the flat list of document texts) and the
# test problems (document list discarded).
train, docs = read_dataset(TRAINDATAPATH)
test, _ = read_dataset(TESTDATAPATH)

In [7]:
# Candidate document pairs for every problem:
#   yes_pairs - combinations of known documents, only where the answer is 'Y'
#   pairs     - each known document with the problem's unknown document
#   I_pairs   - each document paired with itself
#   matched   - pairs built from the rows returned by match_unknowns
train['yes_pairs']=train.apply(lambda row: permute(row) if row['answer'] == 'Y' else None , axis=1)
train['pairs']=train.apply(lambda row: [(row[col], row['unknown']) for col in KNOWN if row[col] is not None], axis=1)
train['I_pairs']=train.apply(lambda row: [(row[col], row[col]) for col in FNAMES if row[col] is not None],axis=1)
matched = match_unknowns(TRAINDATAPATH)
matched['matched']=matched.apply(lambda row: [(row[col], row['unknown']) for col in KNOWN if row[col] is not None],axis=1)
test['pairs']=test.apply(lambda row: [(row[col], row['unknown']) for col in KNOWN if row[col] is not None], axis=1)

SHUFFLE_SEED = 42  # fixed so the shuffled frames are identical on every run


def _append_pairs(pair_list, label, first_out, second_out, label_out):
    """Append every 2-tuple in ``pair_list`` to the output lists, tagged with ``label``."""
    for first, second in pair_list:
        first_out.append(first)
        second_out.append(second)
        label_out.append(label)


known = []
unknown = []
answers=[]
pairs=train['pairs'].tolist()
ans=train['answer']
# Rows without yes_pairs get an empty string, which iterates as "no pairs".
train['yes_pairs']=train['yes_pairs'].fillna('')
yes_pairs = train['yes_pairs'].tolist()
I_pairs = train['I_pairs'].tolist()

for pair, yes_pair, ipair, a in zip(pairs, yes_pairs, I_pairs, ans):
    _append_pairs(pair, a, known, unknown, answers)
    # An identity pair compares a document with itself, so it is labelled 'Y'
    # whatever the problem's answer is; inheriting the answer used to label
    # (doc, doc) pairs of 'N' problems as 'N'.
    _append_pairs(ipair, 'Y', known, unknown, answers)
    _append_pairs(yes_pair, a, known, unknown, answers)


ans = matched['answer']
matched_pairs=matched['matched'].tolist()

for matched_pair, a in zip(matched_pairs, ans):
    _append_pairs(matched_pair, a, known, unknown, answers)

ans = test['answer']
test_pairs=test['pairs'].tolist()
ktest=[]
utest=[]
anstest=[]
for test_pair, a in zip(test_pairs, ans):
    _append_pairs(test_pair, a, ktest, utest, anstest)

# Shuffle both frames and persist them for the next notebook.
test_df = pd.DataFrame(data={"label":anstest, "known":ktest, "unknown":utest})
test_df=test_df.sample(frac=1.0, random_state=SHUFFLE_SEED).reset_index(drop=True)
train_df = pd.DataFrame(data={"label":answers, "known":known, "unknown":unknown})
train_df=train_df.sample(frac=1.0, random_state=SHUFFLE_SEED).reset_index(drop=True)
joblib.dump(train_df, 'data/train.pkl')
joblib.dump(test_df, 'data/val.pkl')
gc.collect()

0

In [None]:
#! wget https://github.com/briandw/SiameseULMFiT/releases/download/1/data.zip
#! unzip ./data.zip

In [2]:
# Collect every sentence from all three SNLI splits; they are only used to
# train the language model.
raw_text = []
for snli_path in [f"{snli_root}snli_1.0_train.jsonl", f"{snli_root}snli_1.0_dev.jsonl", f"{snli_root}snli_1.0_test.jsonl"]:
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(snli_path, encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                continue  # skip blank lines instead of passing them to json.loads
            item = json.loads(line)
            raw_text.append(item['sentence1'])
            raw_text.append(item['sentence2'])
print(len(raw_text))

1140304


In [8]:
# Split the language model data into train and validation sets.
# A fixed random_state makes the 90/10 split reproducible across runs.
lm_train, lm_valid = sklearn.model_selection.train_test_split(raw_text, test_size=0.1, random_state=42)
df_trn = pd.DataFrame(lm_train)
df_val = pd.DataFrame(lm_valid)

In [9]:
BOS = 'x_bos'  # beginning-of-sentence tag

re1 = re.compile(r'  +')  # a run of two or more spaces

# Literal substitutions performed by fixup(), applied one after another in
# exactly this order.
_FIXUP_SUBS = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)

def fixup(x):
    """Clean up markup residue in the string ``x``.

    Applies the literal substitutions in ``_FIXUP_SUBS`` in order, decodes HTML
    entities with ``html.unescape`` and finally collapses every run of two or
    more spaces into a single space.
    """
    for old, new in _FIXUP_SUBS:
        x = x.replace(old, new)
    return re1.sub(' ', html.unescape(x))

def get_texts(df):
    """Clean and tokenize the texts in column 0 of ``df``.

    Every text is converted to ``str``, prefixed with the ``BOS`` tag and
    passed through ``fixup`` before tokenization.

    Args:
        df: DataFrame whose column ``0`` holds the raw sentences.

    Returns:
        The result of ``Tokenizer().proc_all_mp`` on the cleaned texts
        (presumably one token list per text; confirm against fastai).
    """
    # Add the prefix first, then clean. The earlier version rebuilt `texts`
    # from the raw column after calling fixup, so the cleaned strings were
    # thrown away and never reached the tokenizer.
    texts = f'{BOS} ' + df[0].astype(str)
    texts = list(texts.apply(fixup).values)
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return tok

In [10]:
# Tokenize both language-model splits (training first, then validation) and
# flatten each one into a single token array.
tok_trn, tok_val = (np.concatenate(get_texts(frame)) for frame in (df_trn, df_val))

In [11]:
# Show the first 100 validation tokens as a sanity check.
tok_val[:100]

array(['x_bos', 'people', 'are', 'in', 'a', 'room', 'discussing', 'around', 'a', 'computer', 'printer', '.',
       'x_bos', 'the', 'girl', 'is', 'playing', 'on', 'a', 'swing', 'set', '.', 'x_bos', 'a', 'cleaning',
       'woman', 'in', 'a', 'bright', 'uniform', 'is', 'pushing', 'a', 'cart', '.', 'x_bos', 'animals',
       'playing', 'in', 'a', 'field', 'x_bos', 'a', 'person', 'is', 'taking', 'a', 'picture', 'of', 'some',
       'kids', '.', 'x_bos', 'protesters', 'joining', 'on', 'a', 'city', 'street', '.', 'x_bos', 'bicyclist',
       'riding', 'their', 'bikes', 'across', 'a', 'metal', 'bridge', '.', 'x_bos', 'a', 'man', 'is',
       'holding', 'a', 'flashlight', '.', 'x_bos', 'the', 'team', 'swiftly', 'moves', 'their', 'traditional',
       'boat', 'down', 'the', 'river', '.', 'x_bos', 'a', 'man', 'with', 'long', 'hair', 'and', 'a', 'pink',
       'shirt'], dtype='<U17')

In [12]:
#save our work
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(token_files, exist_ok=True)
np.save(f'{token_files}tok_trn.npy', tok_trn)
np.save(f'{token_files}tok_val.npy', tok_val)

In [13]:
# Reload the token arrays from disk so the notebook can resume from here.
tok_trn, tok_val = (
    np.load(f'{token_files}tok_trn.npy'),
    np.load(f'{token_files}tok_val.npy'),
)

In [14]:
# Count token occurrences over the training tokens, then the validation tokens.
freq = Counter()
freq.update(tok_trn)
freq.update(tok_val)
freq.most_common(25)

[('a', 1496301),
 ('x_bos', 1140304),
 ('.', 999604),
 ('the', 555295),
 ('in', 423992),
 ('is', 387917),
 ('man', 276785),
 ('on', 245180),
 ('and', 215231),
 ('are', 206834),
 ('of', 200547),
 ('with', 176178),
 ('woman', 143101),
 ('two', 126950),
 ('people', 125650),
 (',', 119923),
 ('to', 118745),
 ('at', 102452),
 ('wearing', 84424),
 ('an', 83451),
 ('his', 75557),
 ('shirt', 65479),
 ('young', 64126),
 ('men', 63408),
 ('playing', 61568)]

In [15]:
# Number of distinct tokens seen across both splits.
len(freq)

34158

In [16]:
max_vocab = 60000
min_freq = 1
# Index 0 is '_unk_' and index 1 is '_pad_'; the remaining entries are the
# tokens occurring at least min_freq times, most frequent first.
itos = ['_unk_', '_pad_']
itos.extend(token for token, count in freq.most_common(max_vocab) if count >= min_freq)
# Reverse lookup; tokens missing from the vocabulary map to 0 ('_unk_').
stoi = defaultdict(lambda:0, ((token, index) for index, token in enumerate(itos)))

In [17]:
# Vocabulary size including the two special tokens.
len(stoi)

34160

# Save the language model training set

In [18]:
# Replace every token by its vocabulary index (training split first).
trn_lm, val_lm = (
    np.array([stoi[token] for token in tokens]) for tokens in (tok_trn, tok_val)
)

In [19]:
#save results
# Use a context manager so the pickle file is flushed and closed right away.
with open(f'{token_files}itos.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)
np.save(f'{token_files}trn_lm.npy', trn_lm)
np.save(f'{token_files}val_lm.npy', val_lm)

In [20]:
#load the results so we can pick it up from here
# The pickle was written by this notebook; do not point this at untrusted files.
with open(f'{token_files}itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)
trn_lm = np.load(f'{token_files}trn_lm.npy')
val_lm = np.load(f'{token_files}val_lm.npy')

# Rebuild the reverse lookup; unknown tokens map to index 0.
stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

34160

In [21]:
# Decode the first 100 validation ids back to tokens, each followed by a space.
print("".join(f"{itos[token_id]} " for token_id in val_lm[:100]), end="")

x_bos people are in a room discussing around a computer printer . x_bos the girl is playing on a swing set . x_bos a cleaning woman in a bright uniform is pushing a cart . x_bos animals playing in a field x_bos a person is taking a picture of some kids . x_bos protesters joining on a city street . x_bos bicyclist riding their bikes across a metal bridge . x_bos a man is holding a flashlight . x_bos the team swiftly moves their traditional boat down the river . x_bos a man with long hair and a pink shirt 

# Build the sentence similarity dataset

In [53]:
from enum import Enum

# Integer codes for the SNLI gold labels, looked up by name (Entail[label]).
Entail = Enum('Entail', [
    ('entailment', 0),
    ('contradiction', 1),
    ('neutral', 2),
])
       
def load_sentence_pairs(json_file):
    """Read an SNLI ``.jsonl`` file into an array of tokenized sentence pairs.

    Each non-blank line is parsed as JSON. Records whose ``gold_label`` is not
    a member name of ``Entail`` are skipped. Both sentences are cleaned with
    ``fixup``, prefixed with ``BOS`` and tokenized.

    Args:
        json_file: Path to the ``.jsonl`` file.

    Returns:
        Object array of shape ``(n_pairs, 4)`` whose columns are: tokens of
        sentence 1, tokens of sentence 2, the integer label, and the mean
        character length of the two prefixed sentences.
    """
    s0s = []
    s1s = []
    labels = []
    avg_len = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(json_file, encoding='utf-8') as fp:
        for line in fp:
            if not line.strip():
                continue
            item = json.loads(line)
            gold = item['gold_label']
            # Skip records whose label is not one of the three classes.
            if gold not in Entail.__members__:
                continue
            s0 = BOS+" "+fixup(item['sentence1'])
            s1 = BOS+" "+fixup(item['sentence2'])
            s0s.append(s0)
            s1s.append(s1)
            labels.append(Entail[gold].value)
            avg_len.append((len(s0)+len(s1))/2)

    # Assumes proc_all_mp returns one entry per input sentence, in input order.
    s0s = Tokenizer().proc_all_mp(partition_by_cores(s0s))
    s1s = Tokenizer().proc_all_mp(partition_by_cores(s1s))

    # Fill the object array cell by cell. np.array() on the ragged nested lists
    # raises ValueError on NumPy >= 1.24.
    pairs = np.empty((len(labels), 4), dtype=object)
    for row_idx, row in enumerate(zip(s0s, s1s, labels, avg_len)):
        for col_idx, value in enumerate(row):
            pairs[row_idx, col_idx] = value
    return pairs

# snli_root already ends with '/', so no extra separator is needed; the train
# path previously had a stray one, unlike the other two.
sentence_pairs_train = load_sentence_pairs(f'{snli_root}snli_1.0_train.jsonl')
sentence_pairs_dev = load_sentence_pairs(f'{snli_root}snli_1.0_dev.jsonl')
sentence_pairs_test = load_sentence_pairs(f'{snli_root}snli_1.0_test.jsonl')

In [54]:
# Persist the string-token version of each split (train, dev, test).
for out_path, pair_array in (
    (f'{token_files}trn_snli.npy', sentence_pairs_train),
    (f'{token_files}dev_snli.npy', sentence_pairs_dev),
    (f'{token_files}test_snli.npy', sentence_pairs_test),
):
    np.save(out_path, pair_array)

In [57]:
def tokenize(sentence_pairs, vocab=None):
    """Replace the tokens of both sentences in every pair by vocabulary ids, in place.

    Args:
        sentence_pairs: Indexable collection of rows whose items 0 and 1 are
            token sequences; items 2 and 3 are left untouched.
        vocab: Mapping from token to id. Defaults to the notebook-level
            ``stoi``.

    Returns:
        None. ``sentence_pairs`` is modified in place, so calling this twice
        on the same array looks the ids up again as if they were tokens.
    """
    if vocab is None:
        vocab = stoi
    for i in range(len(sentence_pairs)):
        row = sentence_pairs[i]
        # Write the id lists straight into their cells. Rebuilding the row
        # with np.array([...]) creates a ragged array, which NumPy >= 1.24
        # rejects.
        row[0] = [vocab[token] for token in row[0]]
        row[1] = [vocab[token] for token in row[1]]

# Convert all three splits to vocabulary ids in place (train, dev, test).
for pair_array in (sentence_pairs_train, sentence_pairs_dev, sentence_pairs_test):
    tokenize(pair_array)

In [62]:
# Persist the id version of each split (train, dev, test).
for out_path, pair_array in (
    (f'{token_files}snli_tok_train.npy', sentence_pairs_train),
    (f'{token_files}snli_tok_dev.npy', sentence_pairs_dev),
    (f'{token_files}snli_tok_test.npy', sentence_pairs_test),
):
    np.save(out_path, pair_array)

# Check our work

In [18]:
# The pickle was written by this notebook; do not point this at untrusted files.
with open(f'{token_files}itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)

# The saved pair arrays hold Python lists in their cells, so they are stored
# pickled; NumPy >= 1.16.3 only loads such files with allow_pickle=True.
dev = np.load(f'{token_files}snli_tok_dev.npy', allow_pickle=True)
train = np.load(f'{token_files}snli_tok_train.npy', allow_pickle=True)
test = np.load(f'{token_files}snli_tok_test.npy', allow_pickle=True)

def print_sentence(s):
    """Print the tokens for the ids in ``s``, each preceded by one space.

    Ids are looked up in the notebook-level ``itos`` list.
    """
    print("".join(" " + itos[token_id] for token_id in s))

# Decode both sentences of the first pair in each split (train, dev, test).
for split in (train, dev, test):
    first_pair = split[0]
    print_sentence(first_pair[0])
    print_sentence(first_pair[1])


 x_bos a person on a horse jumps over a broken down airplane .
 x_bos a person is training his horse for a competition .
 x_bos two women are embracing while holding to go packages .
 x_bos the sisters are hugging goodbye while holding to go packages after just eating lunch .
 x_bos this church choir sings to the masses as they sing joyous songs from the book at a church .
 x_bos the church has cracks in the ceiling .


 x_bos two women are embracing while holding to go packages .
 x_bos the sisters are hugging goodbye while holding to go packages after just eating lunch .


[3, 15, 47, 11, 2243, 30, 48, 18, 381, 3644, 4]