In [5]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [4]:
import pprint
import math
import random

import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from pathlib import Path

from language_structure import *
from train import batch_iter, load
from model import *
from utils import *

base = Path('../data')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# datasets
list(base.iterdir())

[PosixPath('../data/.DS_Store'),
 PosixPath('../data/aclImdb'),
 PosixPath('../data/QQP'),
 PosixPath('../data/cola_public'),
 PosixPath('../data/QNLI'),
 PosixPath('../data/RTE')]

# IMDB

In [32]:
tmp = pd.read_csv('../data/aclImdb/train.csv')
tmp.head()

Unnamed: 0,path,target,review_rating,file_length
0,train/neg/1821_4.txt,0,4,41
1,train/neg/10402_1.txt,0,1,188
2,train/neg/1062_4.txt,0,4,122
3,train/neg/9056_1.txt,0,1,354
4,train/neg/5392_3.txt,0,3,794


In [19]:
for x, y, lengths in IMDBLoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (25000, 25000)


In [20]:
x, y, lengths, x.shape

(tensor([[ 101,  101],
         [2821, 6758],
         [4658, 3185],
         [3422, 9643],
         [4333, 1012],
         [4438, 3666],
         [ 999, 9117],
         [2472, 2245],
         [6819, 3185],
         [7983, 2071]]), tensor([[0, 1],
         [1, 0]]), [96, 92], torch.Size([10, 2]))

# QQP Dataset

In [22]:
data = base/'QQP'

In [25]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [26]:
testdf.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,201359,303345,303346,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0
1,263843,69383,380476,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0
2,172974,266948,175089,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0
3,15329,29298,29299,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0
4,209794,314169,314170,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0


In [27]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(15, 363192)

In [28]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(1, 40372)

In [29]:
filtered_df = testdf[~testdf.isnull().any(axis=1)]
assert len(filtered_df) == (len(testdf) - len(testdf[testdf.isnull().any(axis=1)]))
testdf = filtered_df

In [112]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [30]:
# df = traindf
df = testdf

lengths = []
for q1, q2 in zip(df['question1'].values, df['question2'].values):
    l1 = len(q1.split(' '))
    l2 = len(q2.split(' '))
    lengths.append(l1+l2)
assert len(lengths) == len(df), '{} != {}'.format(len(lengths), len(df))

length_df = pd.DataFrame({'file_length': lengths})
df = pd.concat([df, length_df], axis=1)

In [31]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,file_length
0,201359,303345,303346.0,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0,10.0
1,263843,69383,380476.0,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0,43.0
2,172974,266948,175089.0,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0,16.0
3,15329,29298,29299.0,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0,17.0
4,209794,314169,314170.0,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0,22.0


In [118]:
len(df)

363192

In [32]:
name = 'test.csv'
df.to_csv(data/name, index=False)

# QNLI

In [308]:
data = base/'QNLI'

In [329]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 10344: expected 4 fields, saw 5\nSkipping line 10897: expected 4 fields, saw 5\nSkipping line 11356: expected 4 fields, saw 5\nSkipping line 11367: expected 4 fields, saw 5\nSkipping line 16599: expected 4 fields, saw 5\nSkipping line 17114: expected 4 fields, saw 5\nSkipping line 23153: expected 4 fields, saw 5\nSkipping line 25672: expected 4 fields, saw 5\nSkipping line 31107: expected 4 fields, saw 5\nSkipping line 31359: expected 4 fields, saw 5\nSkipping line 31402: expected 4 fields, saw 5\nSkipping line 32555: expected 4 fields, saw 5\nSkipping line 38524: expected 4 fields, saw 5\nSkipping line 46338: expected 4 fields, saw 5\nSkipping line 47889: expected 4 fields, saw 5\nSkipping line 56759: expected 4 fields, saw 5\nSkipping line 56850: expected 4 fields, saw 5\nSkipping line 56919: expected 4 fields, saw 5\nSkipping line 57514: expected 4 fields, saw 5\nSkipping line 67155: expected 4 fields, saw 5\nSkipping line 75061: expected 4 fields, saw 5\nSkipping li

Unnamed: 0,index,question,sentence,label
0,0,When did the third Digimon series begin?,Unlike the two seasons before it and most of t...,not_entailment
1,1,Which missile batteries often have individual ...,"When MANPADS is operated by specialists, batte...",not_entailment
2,2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,entailment
3,3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",entailment
4,4,What famous palace is located in London?,London contains four World Heritage Sites: the...,not_entailment


In [330]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(0, 103106)

In [331]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 5266)

In [332]:
df = testdf

In [333]:
df.head()

Unnamed: 0,index,question,sentence,label
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment


In [334]:
lengths = []
int_label = []
for q, s, l in zip(df['question'].values, df['sentence'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [335]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [336]:
df.head()

Unnamed: 0,index,question,sentence,label,file_length,targets
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment,24,1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment,35,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment,32,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment,63,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment,29,0


# RTE

In [400]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(1, 2490)

In [401]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 277)

In [402]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [408]:
df = testdf
lengths = []
int_label = []
for q, s, l in zip(df['sentence1'].values, df['sentence1'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [409]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [411]:
name = 'test.csv'
df.to_csv(data/name, index=False)

# Cola

In [422]:
data = base/'cola_public/raw'

In [432]:
traindf = pd.read_csv(data/'in_domain_train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
traindf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']
testdf = pd.read_csv(data/'out_of_domain_dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
testdf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']

In [433]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(6024, 8551)

In [434]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(353, 516)

In [437]:
# Column 1:	the code representing the source of the sentence.
# Column 2:	the acceptability judgment label (0=unacceptable, 1=acceptable).
# Column 3:	the acceptability judgment as originally notated by the author. 
# Column 4:	the sentence.
traindf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [436]:
testdf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,clc95,1,,Somebody just left - guess who.
1,clc95,1,,"They claimed they had settled on something, bu..."
2,clc95,1,,"If Sam was going, Sally would know where."
3,clc95,1,,"They're going to serve the guests something, b..."
4,clc95,1,,She's reading. I can't imagine what.


In [446]:
df = testdf
lengths = []
for q in df['sentence'].values:
    lengths.append(len(q.split(' ')))

In [447]:
length_df = pd.DataFrame({'file_length': lengths})
df = pd.concat([df, length_df], axis=1)

In [448]:
df.head()

Unnamed: 0,sentence_source,label,author_judged,sentence,file_length
0,clc95,1,,Somebody just left - guess who.,6
1,clc95,1,,"They claimed they had settled on something, bu...",16
2,clc95,1,,"If Sam was going, Sally would know where.",8
3,clc95,1,,"They're going to serve the guests something, b...",11
4,clc95,1,,She's reading. I can't imagine what.,6


In [449]:
name = 'test.csv'
df.to_csv(data/name, index=False)