<a href="https://colab.research.google.com/github/dainis-boumber/av/blob/master/pretrain_av.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ULMFiT + Siamese Network for Sentence Vectors
## Part One: Tokenizing
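This notebook reads the PAN14 English-essays dataset, sentence-tokenizes each document, and fine-tunes a language model on the resulting known/unknown document pairs for use in the next notebook.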

### You must have the fastai library installed

In [0]:
%reload_ext autoreload
%autoreload 2
from ipyexperiments import *
from fastai.text import *
from fastai import *
import json
import html
import re
import pickle
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict
from pandas.io.json import json_normalize
import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F
from nltk.tokenize import sent_tokenize
import time
import math
import sys
import data
import joblib

# Input splits of the PAN14 English-essays corpus.
TRAINDATAPATH = "./data/PAN14/pan14_train_english-essays/"
TESTDATAPATH = "./data/PAN14/pan14_test01_english-essays/"

# Output directories for generated artifacts.
token_files = './data/PAN14/tokens/'
model_files = './data/PAN14/models/'

# Column names of the problem table: the answer column, the unknown-document
# column, the known-document columns, and all document columns together.
LABELCOL = "answer"
UNKOWN = "unknown"
KCOLS = ['known01', 'known02', 'known03', 'known04', 'known05']
FNAMES = KCOLS + [UNKOWN]


In [0]:
# ! wget https://github.com/briandw/SiameseULMFiT/releases/download/1/data.zip
# ! unzip ./data.zip

In [0]:
BOD = 'x_bod' # beginning-of-doc tag

# Matches a run of two or more spaces.
re1 = re.compile(r'  +')

# (old, new) substitutions applied by fixup(), strictly in this order: later
# entries see the output of earlier ones (the lone backslash must stay last).
_FIXUP_REPLACEMENTS = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)


def fixup(x):
    """Clean up markup residue in a raw text string.

    Applies the fixed substitution table in order, decodes any remaining HTML
    entities with ``html.unescape`` and finally collapses every run of two or
    more spaces into a single space.
    """
    for old, new in _FIXUP_REPLACEMENTS:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

def read_dataset(path):
    """Load one PAN14 split into a problem table plus a flat list of documents.

    ``truth.json`` inside ``path`` supplies the problem names and answers.
    Every immediate sub-directory of ``path`` is treated as one problem: each
    ``.txt`` file in it is read, split into sentences with ``sent_tokenize``,
    re-joined with ``BOD`` and stored in the row named after the directory,
    in the column named after the file stem (e.g. ``known01``, ``unknown``).

    Args:
        path: Directory of the split, with or without a trailing slash.

    Returns:
        A ``(ds, docs)`` tuple. ``ds`` is a DataFrame indexed by problem name
        with columns ``known01``..``known05``, ``unknown`` and ``answer``;
        documents that were not found stay ``None``. ``docs`` is the list of
        all documents read, ordered by directory name and then file name.
    """
    # `os` is not imported explicitly in the visible part of this file, so
    # bring it into scope here instead of relying on a star import.
    import os

    doc_cols = ['known01', 'known02', 'known03', 'known04', 'known05', 'unknown']

    ds = pd.read_json(os.path.join(path, 'truth.json'))
    ds = json_normalize(ds['problems'])
    for col in doc_cols:
        ds[col] = None
    ds.set_index('name', drop=True, inplace=True)
    ds = ds[doc_cols + ['answer']]
    docs = []

    # Only the first entry of each walk is used, so problem directories are
    # addressed by name rather than by their position in a recursive walk
    # (which would misalign as soon as a sub-directory is nested).
    _, problem_dirs, _ = next(os.walk(path), (path, [], []))
    for problem in sorted(problem_dirs):
        problem_path = os.path.join(path, problem)
        _, _, fnames = next(os.walk(problem_path), (problem_path, [], []))
        for fname in sorted(fnames):
            stem, ext = os.path.splitext(fname)
            if ext.lower() != '.txt':
                # Stray non-document files would otherwise create junk columns.
                continue
            # assumes the corpus files are UTF-8 encoded — TODO confirm
            with open(os.path.join(problem_path, fname), 'r', encoding='utf-8') as f:
                text = f.read().strip()
            # NOTE(review): BOD is inserted with no surrounding whitespace, so
            # the tag is fused to the neighbouring sentence text; confirm this
            # is what the downstream tokenizer expects.
            doc = BOD.join(sent_tokenize(text))
            docs.append(doc)
            ds.loc[problem, stem] = doc

    return ds, docs

def match_unknowns(path):
    """Load a split and add extra negative problems built from duplicated unknowns.

    Problems are grouped by the text of their ``unknown`` document. For every
    group with more than one problem, the known documents of the ``"Y"``
    problems are combined, position by position, with each known document of
    the ``"N"`` problems used as the ``unknown`` text; the new rows get the
    answer ``'N'``. The new rows are appended to the original problem table.

    Args:
        path: Directory of the split, passed straight to ``read_dataset``.

    Returns:
        A ``(df, docs)`` tuple. ``df`` is the original problem table followed
        by the generated rows. ``docs`` is every non-missing document found in
        the ``FNAMES`` columns of ``df``, column by column.
    """
    known_cols = ['known01', 'known02', 'known03', 'known04', 'known05']
    doc_cols = known_cols + ['unknown']

    ds, _ = read_dataset(path)

    # Groups of problems that share the same unknown document text.
    dupes = [group for _, group in ds.groupby('unknown') if len(group.index) > 1]

    # The empty seed frame keeps pd.concat valid when nothing is generated.
    frames = [pd.DataFrame(columns=doc_cols)]
    for dupe in dupes:
        # Positional indexes so that the Series below align row by row.
        yes = dupe.loc[dupe.answer == "Y"].reset_index(drop=True)
        no = dupe.loc[dupe.answer == "N"].reset_index(drop=True)
        for col in known_cols:
            if no[col].isna().all():
                # No "N" problem has a document in this slot; every row built
                # from it would have a missing unknown and be dropped below.
                continue
            frames.append(pd.DataFrame(data={'known01': yes.known01, 'known02': yes.known02,
                                             'known03': yes.known03, 'known04': yes.known04,
                                             'known05': yes.known05, 'unknown': no[col],
                                             'answer': 'N'}))

    # A single concat replaces repeated DataFrame.append calls (quadratic, and
    # removed in pandas 2.0).
    newrows = pd.concat(frames, sort=False)
    newrows = newrows.dropna(subset=['unknown'])
    # Rows past the last "Y" problem carry no known document at all; they
    # cannot form a pair and would only leak NaN downstream.
    newrows = newrows.dropna(subset=known_cols, how='all')

    df = pd.concat([ds, newrows])

    docs = []
    for col in FNAMES:
        docs.extend(df[col].tolist())
    # Missing cells may be None (from read_dataset) or NaN (from alignment).
    docs = [d for d in docs if pd.notna(d)]
    return df, docs

# Training split: problem table extended by match_unknowns, plus its documents.
df_train, docs = match_unknowns(TRAINDATAPATH)

# Test split: only the problem table is kept; the document list is discarded.
df_test = read_dataset(TESTDATAPATH)[0]

In [0]:
def load_sentence_pairs(df):
    """Expand a problem table into one (known, unknown, label) row per known document.

    Args:
        df: DataFrame holding the ``KCOLS`` known-document columns, the
            ``UNKOWN`` column and the ``LABELCOL`` answer column.

    Returns:
        DataFrame with columns ``known``, ``unknown`` and ``label``. ``label``
        is 1 when the problem's answer is ``'Y'`` and 0 otherwise. Rows are
        emitted problem by problem, in ``KCOLS`` order; a missing known
        document (``None`` or NaN) produces no row.
    """
    labels = [1 if label == 'Y' else 0 for label in df[LABELCOL].tolist()]
    unknowns = df[UNKOWN].tolist()
    # Materialise each known column once instead of once per row and column.
    known_columns = [df[col].tolist() for col in KCOLS]

    s0s = []
    s1s = []
    y = []
    for i, label in enumerate(labels):
        for knowns in known_columns:
            s0 = knowns[i]
            # pd.isna covers both None and the NaN that concatenating frames
            # with different columns/rows introduces.
            if pd.isna(s0):
                continue
            s0s.append(s0)
            s1s.append(unknowns[i])
            y.append(label)

    return pd.DataFrame(data={"known": s0s, "unknown": s1s, "label": y})

# Build the three pair tables, each from its own load_sentence_pairs call.
# NOTE(review): validation and test are both built from df_test, so they hold
# the same pairs — confirm this is intended.
sentence_pairs_train, sentence_pairs_val, sentence_pairs_test = (
    load_sentence_pairs(problems) for problems in (df_train, df_test, df_test)
)

In [0]:
# Persist the pair tables under `model_files` while they still carry their real
# labels (the next cell overwrites the label columns with 0).
# Each call stays a separate expression statement: the notebook echoes the value
# of the last one, which is the list of file names written.
joblib.dump(sentence_pairs_train, f'{model_files}traindf.pkl')
joblib.dump(sentence_pairs_val, f'{model_files}valdf.pkl')
joblib.dump(sentence_pairs_test, f'{model_files}testdf.pkl')

['./data/PAN14/models/testdf.pkl']

In [0]:
# Overwrite the label column of every pair table with 0, in place.
for _pairs in (sentence_pairs_train, sentence_pairs_val, sentence_pairs_test):
    _pairs['label'] = 0

In [0]:
# Language model data
# Build the data bunch from the train / validation / test pair tables, rooted at
# `model_files`, reading text from both the 'known' and 'unknown' columns and
# using the (zeroed) 'label' column as the label.
# NOTE(review): mark_fields=True presumably inserts a field marker before each
# text column — confirm against the installed fastai version.
data_lm = TextLMDataBunch.from_df(model_files, sentence_pairs_train, sentence_pairs_val, sentence_pairs_test,
                                  text_cols=['known', 'unknown'], label_cols=['label'], mark_fields=True)
# Persist the data bunch; no arguments are given, so fastai's default location applies.
data_lm.save()

In [0]:
# Reload the language-model data bunch from `model_files`
# (presumably the copy written by data_lm.save() in the previous cell — confirm).
data_lm = TextLMDataBunch.load(model_files)

In [0]:
# Create a language-model learner on data_lm, initialised from URLs.WT103_1
# (presumably WikiText-103 pretrained weights — confirm) with drop_mult=0.5.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103_1, drop_mult=0.5)
# First training pass: one cycle with arguments (1, 1e-2); the logged output
# below shows a single epoch.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.307320,4.082573,0.250004


In [0]:
# Unfreeze the learner (presumably making all layer groups trainable — see the
# fastai docs) and run a second one-cycle pass with a 10x smaller rate, 1e-3.
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,3.792723,3.911833,0.271624


In [0]:
# Save the fine-tuned encoder under the name 'ft_enc'
# (presumably for reuse by the next notebook — confirm).
learn.save_encoder('ft_enc')

# Save the language model training set