In [2]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
import pprint
import math
import random

import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from pathlib import Path

from language_structure import *
from train import load
from model import *
from utils import *

base = Path('../data')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# datasets
list(base.iterdir())

[PosixPath('../data/.DS_Store'),
 PosixPath('../data/aclImdb'),
 PosixPath('../data/QQP'),
 PosixPath('../data/cola_public'),
 PosixPath('../data/QNLI'),
 PosixPath('../data/RTE')]

# IMDB

In [32]:
tmp = pd.read_csv('../data/aclImdb/train.csv')
tmp.head()

Unnamed: 0,path,target,review_rating,file_length
0,train/neg/1821_4.txt,0,4,41
1,train/neg/10402_1.txt,0,1,188
2,train/neg/1062_4.txt,0,4,122
3,train/neg/9056_1.txt,0,1,354
4,train/neg/5392_3.txt,0,3,794


In [19]:
for x, y, lengths in IMDBLoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (25000, 25000)


In [20]:
x, y, lengths, x.shape

(tensor([[ 101,  101],
         [2821, 6758],
         [4658, 3185],
         [3422, 9643],
         [4333, 1012],
         [4438, 3666],
         [ 999, 9117],
         [2472, 2245],
         [6819, 3185],
         [7983, 2071]]), tensor([[0, 1],
         [1, 0]]), [96, 92], torch.Size([10, 2]))

# QQP Dataset

In [22]:
data = base/'QQP'

In [25]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [26]:
testdf.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,201359,303345,303346,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0
1,263843,69383,380476,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0
2,172974,266948,175089,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0
3,15329,29298,29299,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0
4,209794,314169,314170,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0


In [27]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(15, 363192)

In [28]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(1, 40372)

In [29]:
filtered_df = testdf[~testdf.isnull().any(axis=1)]
assert len(filtered_df) == (len(testdf) - len(testdf[testdf.isnull().any(axis=1)]))
testdf = filtered_df

In [112]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [30]:
# df = traindf
df = testdf

lengths = []
for q1, q2 in zip(df['question1'].values, df['question2'].values):
    l1 = len(q1.split(' '))
    l2 = len(q2.split(' '))
    lengths.append(l1+l2)
assert len(lengths) == len(df), '{} != {}'.format(len(lengths), len(df))

length_df = pd.DataFrame({'file_length': lengths})
df = pd.concat([df, length_df], axis=1)

In [31]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,file_length
0,201359,303345,303346.0,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0,10.0
1,263843,69383,380476.0,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0,43.0
2,172974,266948,175089.0,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0,16.0
3,15329,29298,29299.0,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0,17.0
4,209794,314169,314170.0,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0,22.0


In [118]:
len(df)

363192

In [32]:
name = 'test.csv'
df.to_csv(data/name, index=False)

# QNLI

In [4]:
data = base/'QNLI'

In [5]:
traindf = pd.read_csv(data/'train.csv')
traindf.head()

Unnamed: 0,index,question,sentence,label,file_length,targets
0,0,When did the third Digimon series begin?,Unlike the two seasons before it and most of t...,not_entailment,50,0
1,1,Which missile batteries often have individual ...,"When MANPADS is operated by specialists, batte...",not_entailment,37,0
2,2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,entailment,41,1
3,3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",entailment,52,1
4,4,What famous palace is located in London?,London contains four World Heritage Sites: the...,not_entailment,52,0


In [8]:
testdf = pd.read_csv(data/'test.csv')
testdf.head()

Unnamed: 0,index,question,sentence,label,file_length,targets
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment,24,1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment,35,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment,32,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment,63,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment,29,0


In [7]:
max(traindf['targets'].values), min(traindf['targets'].values)

(1, 0)

In [9]:
max(testdf['targets'].values), min(testdf['targets'].values)

(1, 0)

In [329]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 10344: expected 4 fields, saw 5\nSkipping line 10897: expected 4 fields, saw 5\nSkipping line 11356: expected 4 fields, saw 5\nSkipping line 11367: expected 4 fields, saw 5\nSkipping line 16599: expected 4 fields, saw 5\nSkipping line 17114: expected 4 fields, saw 5\nSkipping line 23153: expected 4 fields, saw 5\nSkipping line 25672: expected 4 fields, saw 5\nSkipping line 31107: expected 4 fields, saw 5\nSkipping line 31359: expected 4 fields, saw 5\nSkipping line 31402: expected 4 fields, saw 5\nSkipping line 32555: expected 4 fields, saw 5\nSkipping line 38524: expected 4 fields, saw 5\nSkipping line 46338: expected 4 fields, saw 5\nSkipping line 47889: expected 4 fields, saw 5\nSkipping line 56759: expected 4 fields, saw 5\nSkipping line 56850: expected 4 fields, saw 5\nSkipping line 56919: expected 4 fields, saw 5\nSkipping line 57514: expected 4 fields, saw 5\nSkipping line 67155: expected 4 fields, saw 5\nSkipping line 75061: expected 4 fields, saw 5\nSkipping li

Unnamed: 0,index,question,sentence,label
0,0,When did the third Digimon series begin?,Unlike the two seasons before it and most of t...,not_entailment
1,1,Which missile batteries often have individual ...,"When MANPADS is operated by specialists, batte...",not_entailment
2,2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,entailment
3,3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",entailment
4,4,What famous palace is located in London?,London contains four World Heritage Sites: the...,not_entailment


In [330]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(0, 103106)

In [10]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 5266)

In [332]:
df = testdf

In [333]:
df.head()

Unnamed: 0,index,question,sentence,label
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment


In [334]:
lengths = []
int_label = []
for q, s, l in zip(df['question'].values, df['sentence'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [335]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [336]:
df.head()

Unnamed: 0,index,question,sentence,label,file_length,targets
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment,24,1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment,35,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment,32,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment,63,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment,29,0


# RTE

In [400]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(1, 2490)

In [401]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 277)

In [402]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [408]:
df = testdf
lengths = []
int_label = []
for q, s, l in zip(df['sentence1'].values, df['sentence1'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [409]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [411]:
name = 'test.csv'
df.to_csv(data/name, index=False)

# Cola

In [8]:
data = base/'cola_public/raw'

In [31]:
traindf = pd.read_csv(data/'in_domain_train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
traindf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']
testdf = pd.read_csv(data/'out_of_domain_dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
testdf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']
test2df = pd.read_csv(data/'in_domain_dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
test2df.columns = ['sentence_source', 'label', 'author_judged', 'sentence']

In [32]:
len(traindf), len(testdf), len(test2df)

(8551, 516, 527)

In [437]:
# Column 1:	the code representing the source of the sentence.
# Column 2:	the acceptability judgment label (0=unacceptable, 1=acceptable).
# Column 3:	the acceptability judgment as originally notated by the author. 
# Column 4:	the sentence.
traindf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [436]:
testdf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,clc95,1,,Somebody just left - guess who.
1,clc95,1,,"They claimed they had settled on something, bu..."
2,clc95,1,,"If Sam was going, Sally would know where."
3,clc95,1,,"They're going to serve the guests something, b..."
4,clc95,1,,She's reading. I can't imagine what.


In [37]:
df = testdf
lengths = []
for q in df['sentence'].values:
    lengths.append(len(q.split(' ')))
length_df = pd.DataFrame({'file_length': lengths})
testdf = pd.concat([df, length_df], axis=1)

In [33]:
df = test2df
lengths2 = []
for q in df['sentence'].values:
    lengths2.append(len(q.split(' ')))
length_df = pd.DataFrame({'file_length': lengths2})
test2df = pd.concat([df, length_df], axis=1)

In [35]:
test2df.head()

Unnamed: 0,sentence_source,label,author_judged,sentence,file_length
0,gj04,1,,The sailors rode the breeze clear of the rocks.,9
1,gj04,1,,The weights made the rope stretch over the pul...,9
2,gj04,1,,The mechanical doll wriggled itself loose.,6
3,cj99,1,,"If you had eaten more, you would want less.",9
4,cj99,0,*,"As you eat the most, you want the least.",9


In [38]:
testdf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence,file_length
0,clc95,1,,Somebody just left - guess who.,6
1,clc95,1,,"They claimed they had settled on something, bu...",16
2,clc95,1,,"If Sam was going, Sally would know where.",8
3,clc95,1,,"They're going to serve the guests something, b...",11
4,clc95,1,,She's reading. I can't imagine what.,6


In [39]:
combined_test_df = pd.concat([test2df, testdf])

In [40]:
combined_test_df.head()

Unnamed: 0,sentence_source,label,author_judged,sentence,file_length
0,gj04,1,,The sailors rode the breeze clear of the rocks.,9
1,gj04,1,,The weights made the rope stretch over the pul...,9
2,gj04,1,,The mechanical doll wriggled itself loose.,6
3,cj99,1,,"If you had eaten more, you would want less.",9
4,cj99,0,*,"As you eat the most, you want the least.",9


In [41]:
len(combined_test_df)

1043

In [42]:
name = 'test.csv'
combined_test_df.to_csv(data/name, index=False)

# BERT Embeddings

In [4]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [5]:
from bert_embedding import BertEmbedding
import torch

from bert import tokenization
from dataloader import *
from bert_pytorch.model.bert import BERTClassificationWrapper

vocab_file = './uncased_L-12_H-768_A-12/vocab.txt'
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [58]:
bert_embedding = BertEmbedding(max_seq_length=512)

In [59]:
bert_embedding.vocab

Vocab(size=30522, unk="[UNK]", reserved="['[PAD]', '[CLS]', '[SEP]', '[MASK]']")

In [6]:
for x, y, lengths in IMDBLoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (25000, 25000)


In [13]:
x.shape

torch.Size([10, 2])

In [27]:
tmp = x[:, 0]
test = [tokenizer.inv_vocab[x] for x in list(tmp.numpy())]
test

['[CLS]',
 'seldom',
 'give',
 'movie',
 'without',
 'seeing',
 'entire',
 'show',
 '.',
 'particularly']

In [None]:
data_iter = [([10654, 17214, 0], )]
batches = []
for token_ids, valid_length, token_types in data_iter:
    token_ids = token_ids.as_in_context(self.ctx)
    valid_length = valid_length.as_in_context(self.ctx)
    token_types = token_types.as_in_context(self.ctx)
    sequence_outputs = self.bert(token_ids, token_types,
                                 valid_length.astype(self.dtype))
    for token_id, sequence_output in zip(token_ids.asnumpy(),
                                         sequence_outputs.asnumpy()):
        batches.append((token_id, sequence_output))

In [69]:
bert_embedding(['ham', 'tam', '[PAD]'])

[(['ham'],
  [array([-3.38359922e-01,  1.01256169e-01, -1.32628679e-01, -2.93269277e-01,
           9.54813883e-02, -4.67324704e-02, -1.97138071e-01,  4.72903587e-02,
          -1.04295596e-01, -6.67657375e-01, -5.73687911e-01,  8.13928768e-02,
           1.76142260e-01,  1.30207747e-01, -4.96848822e-01, -4.52552795e-01,
           3.82123709e-01,  3.62274125e-02,  3.76703620e-01,  2.47355863e-01,
           5.05938679e-02,  2.55153447e-01, -4.43232715e-01, -3.65845084e-01,
           3.53737980e-01,  2.01405898e-01, -2.59094536e-01, -1.33673042e-01,
          -1.54754341e-01, -1.22871727e-01,  1.56584650e-01, -3.89058173e-01,
           7.91553974e-01, -1.99571699e-01, -7.24579573e-01, -1.56402677e-01,
          -2.07132503e-01,  1.82814673e-01, -9.26156700e-01, -6.53900653e-02,
           2.76014209e-01, -4.40659732e-01,  7.32305884e-01, -4.88031417e-01,
           1.82419389e-01,  4.70257580e-01,  9.24911261e-01, -2.40214393e-01,
          -3.46169591e-01, -3.35906804e-01, -9.837255

In [65]:
tokenizer.convert_tokens_to_ids(['ham', 'tam', '[PAD]'])

[10654, 17214, 0]

In [None]:
bert_embedding.bert()

In [67]:
x = torch.tensor([[10654, 17214, 0]])
x.shape

torch.Size([1, 3])

In [68]:
test = bert_embedding(x).shape

TypeError: ord() expected string of length 1, but Tensor found

In [57]:
test

[(['pad'],
  [array([-3.62636775e-01, -4.23182666e-01,  4.15025085e-01, -1.70481101e-01,
           1.84334755e-01,  2.75152475e-01,  8.94734189e-02,  4.18507487e-01,
          -4.17335033e-02, -7.95167327e-01, -5.14654577e-01,  1.99347824e-01,
           3.69613349e-01, -1.65389165e-01, -1.64152488e-01, -8.66690725e-02,
           2.01758012e-01, -2.51660764e-01,  5.53673148e-01,  5.93602777e-01,
          -9.86261517e-02, -4.29566577e-03, -2.17691690e-01, -3.27470452e-02,
           1.68252259e-01,  5.99489629e-01, -3.62707466e-01, -3.12679261e-01,
          -8.32247138e-01,  9.60382879e-01,  1.03092641e-01, -2.28214785e-01,
           1.18403740e-01,  3.51720452e-01, -4.53335434e-01, -4.38132286e-01,
          -3.74912441e-01,  8.12049359e-02, -9.40661848e-01, -3.20155680e-01,
          -5.94555102e-02, -5.21800220e-01,  4.53978002e-01, -8.16599786e-01,
           2.63463289e-01,  3.19811642e-01, -6.59938455e-02, -7.12440312e-02,
          -5.13563156e-01,  2.06201404e-01, -1.422621

In [50]:
for t in test:
    print(t[1][0].shape)

(768,)


In [37]:
len(tokenizer.vocab)

30522

In [39]:
tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[unused0]', 1),
             ('[unused1]', 2),
             ('[unused2]', 3),
             ('[unused3]', 4),
             ('[unused4]', 5),
             ('[unused5]', 6),
             ('[unused6]', 7),
             ('[unused7]', 8),
             ('[unused8]', 9),
             ('[unused9]', 10),
             ('[unused10]', 11),
             ('[unused11]', 12),
             ('[unused12]', 13),
             ('[unused13]', 14),
             ('[unused14]', 15),
             ('[unused15]', 16),
             ('[unused16]', 17),
             ('[unused17]', 18),
             ('[unused18]', 19),
             ('[unused19]', 20),
             ('[unused20]', 21),
             ('[unused21]', 22),
             ('[unused22]', 23),
             ('[unused23]', 24),
             ('[unused24]', 25),
             ('[unused25]', 26),
             ('[unused26]', 27),
             ('[unused27]', 28),
             ('[unused28]', 29),
             ('[unused29]', 30),
  