In [5]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [4]:
import pprint
import math
import random

import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from pathlib import Path

from language_structure import *
from train import batch_iter, load
from model import *
from utils import *

base = Path('../data')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# datasets
list(base.iterdir())

[PosixPath('../data/.DS_Store'),
 PosixPath('../data/aclImdb'),
 PosixPath('../data/QQP'),
 PosixPath('../data/cola_public'),
 PosixPath('../data/QNLI'),
 PosixPath('../data/RTE')]

# IMDB

In [32]:
tmp = pd.read_csv('../data/aclImdb/train.csv')
tmp.head()

Unnamed: 0,path,target,review_rating,file_length
0,train/neg/1821_4.txt,0,4,41
1,train/neg/10402_1.txt,0,1,188
2,train/neg/1062_4.txt,0,4,122
3,train/neg/9056_1.txt,0,1,354
4,train/neg/5392_3.txt,0,3,794


In [6]:
class BatchLoader:
    def __init__(self, base, tokenizer, max_len, device, size):
        self.base = base
        self.size = size
        self.traindf = pd.read_csv(base/'train.csv')
        self.testdf = pd.read_csv(base/'test.csv')
        self.max_len = max_len
        self.device = device
        self.tokenizer = tokenizer
        
        if self.size:
            self.traindf = self.traindf.sample(frac=1.)
            self.testdf = self.testdf.sample(frac=1.)
            self.traindf = self.traindf[:self.size]
            self.testdf = self.testdf[:self.size]
        print("Length of (Train, Test) : ({}, {})".format(len(self.traindf), len(self.testdf)))
    
    def prepare(self, df):
        raise NotImplementedError("BatchLoader Prepare Not Implemented")
    
    def tokenize(self, sents):
        raise NotImplementedError("BatchLoader Tokenize Not Implemented")
        
    def batch_iter(self, batch_size, train=True, shuffle=False, process_full_df=False, show_progress=False):
        data = self.traindf if train else self.testdf
        tmpdf = data.copy()
        
        if shuffle:
            tmpdf = tmpdf.sample(frac=1.)
            
        count = 0
        # file length lower and upper bound to batch together
        n = 10
        
        while len(tmpdf) > 0:
            count += 1
            
            # grab first row
            length = tmpdf['file_length'].values[0]
            lb, ub = length - n, length + n

            # find similar lengthed files
            file_lengths = tmpdf.file_length.values
            fl_idxs = tmpdf.file_length.index
            idxs = [i for i, fl in zip(fl_idxs, file_lengths) if (fl >= lb and fl <= ub)]

            # break early if we dont want to process the full dataframe 
            if not process_full_df:
                if len(idxs) < batch_size / 2 and count > 6: break
            elif show_progress and count % 10 == 0:
                print('Examples Left: {}'.format(len(tmpdf)))

            # shuffle & get batch
            random.shuffle(idxs)
            idxs = idxs[:batch_size] if len(idxs) > batch_size else idxs
            batchdf = tmpdf.loc[idxs]

            # remove selected batch rows from main df
            tmpdf = tmpdf[~tmpdf.index.isin(batchdf.index)]

            # open, clean, index txt files 
            sents, targets = self.prepare(batchdf)
            
            # tokenize and tensorize
            x, lengths = self.tokenize(sents)
            y = torch.nn.functional.one_hot(torch.tensor(targets, device=self.device))
        
            yield x, y.squeeze(), lengths
        

In [7]:
def BERT_tokenize_tokens(tokenizer, bert_tokens, max_len):
    token_ids = [tokenizer.convert_tokens_to_ids(ts) for ts in bert_tokens]
    token_lengths = list(map(len, token_ids))
    token_ids = clip_pad_to_max(token_ids, max_len, 0)
    token_tensor = torch.tensor(token_ids, dtype=torch.long, device=device)

    # return token_tensor, token_lengths
    return torch.t(token_tensor), token_lengths

In [17]:
class IMDBLoader(SingleSentenceLoader):
    def __init__(self, tokenizer, max_len, device, base=Path('../data/aclImdb'), size=None):
        super().__init__(base, tokenizer, max_len, device, size)
        
    def open_and_clean(self, path):
        file = open(str(self.base/path), encoding='utf-8').read()
        # contractions = True for BERT consistency 
        clean_file = normalizeString(file, stopwords=False, contractions=True)
        return clean_file
        
    def prepare(self, df):
        results = [(self.open_and_clean(p), t) for (p, t) in zip(df['path'].values, df['target'].values)] 
        results = sorted(results, key=lambda e: len(e[0].split(' ')), reverse=True)
        sents, targets = [e[0] for e in results], [e[1] for e in results]
        return sents, targets

In [18]:
from bert import tokenization
vocab_file = './uncased_L-12_H-768_A-12/vocab.txt'
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

In [19]:
for x, y, lengths in IMDBLoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (25000, 25000)


In [20]:
x, y, lengths, x.shape

(tensor([[ 101,  101],
         [2821, 6758],
         [4658, 3185],
         [3422, 9643],
         [4333, 1012],
         [4438, 3666],
         [ 999, 9117],
         [2472, 2245],
         [6819, 3185],
         [7983, 2071]]), tensor([[0, 1],
         [1, 0]]), [96, 92], torch.Size([10, 2]))

# QQP Dataset

In [22]:
data = base/'QQP'

In [25]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 83032: expected 6 fields, saw 7\n'
b'Skipping line 154657: expected 6 fields, saw 7\n'
b'Skipping line 323916: expected 6 fields, saw 7\n'


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222.0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0.0
1,402555,536040,536041.0,How do I control my horny emotions?,How do you control your horniness?,1.0
2,360472,364011,490273.0,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0.0
3,150662,155721,7256.0,What can one do after MBBS?,What do i do after my MBBS ?,1.0
4,183004,279958,279959.0,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0.0


In [26]:
testdf.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,201359,303345,303346,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0
1,263843,69383,380476,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0
2,172974,266948,175089,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0
3,15329,29298,29299,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0
4,209794,314169,314170,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0


In [27]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(15, 363192)

In [28]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(1, 40372)

In [29]:
filtered_df = testdf[~testdf.isnull().any(axis=1)]
assert len(filtered_df) == (len(testdf) - len(testdf[testdf.isnull().any(axis=1)]))
testdf = filtered_df

In [112]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [30]:
# df = traindf
df = testdf

lengths = []
for q1, q2 in zip(df['question1'].values, df['question2'].values):
    l1 = len(q1.split(' '))
    l2 = len(q2.split(' '))
    lengths.append(l1+l2)
assert len(lengths) == len(df), '{} != {}'.format(len(lengths), len(df))

length_df = pd.DataFrame({'file_length': lengths})
df = pd.concat([df, length_df], axis=1)

In [31]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,file_length
0,201359,303345,303346.0,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0,10.0
1,263843,69383,380476.0,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0,43.0
2,172974,266948,175089.0,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0,16.0
3,15329,29298,29299.0,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0,17.0
4,209794,314169,314170.0,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0,22.0


In [118]:
len(df)

363192

In [32]:
name = 'test.csv'
df.to_csv(data/name, index=False)

In [514]:
class TwoSentenceLoader(BatchLoader):
    def __init__(self, base, tokenizer, max_len, device, size=None):
        super().__init__(base, tokenizer, max_len, device, size)
        self.number_classes = 2
        
    def tokenize(self, bert_tokens):
        # alread been tokenized by prepare
        return BERT_tokenize_tokens(self.tokenizer, bert_tokens, self.max_len)
        
    def prepare(self, df):
        sentences = []
        targets = []
        for q, s, y in zip(df[self.s1_name].values,\
                           df[self.s2_name].values,\
                           df[self.target_name].values):
            # clean
            q = normalizeString(q, stopwords=True, contractions=True)
            s = normalizeString(s, stopwords=True, contractions=True)
            # tokenize
            q_tokens = tokenizer.tokenize(q)
            s_tokens = tokenizer.tokenize(s)
            # combine
            s = ['[CLS]'] + q_tokens + ['[SEP]'] + s_tokens
            sentences.append(s)
            targets.append(int(y))
        return sentences, targets

In [515]:
class QQPLoader(TwoSentenceLoader):
    def __init__(self, tokenizer, max_len, device, base=Path('../data/QQP'), size=None):
        super().__init__(base, tokenizer, max_len, device, size)    
        self.s1_name = 'question1'
        self.s2_name = 'question2'
        self.target_name = 'is_duplicate'


In [516]:
for x, y, lengths in QQPLoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (363192, 390965)


In [517]:
x, y, lengths

(tensor([[  101,   101],
         [ 2097,  2515],
         [21029, 19102],
         [ 1998,  9088],
         [ 3964,  3602],
         [ 2064,  2490],
         [ 2644,  1043],
         [ 1996,  1029],
         [ 2304,   102],
         [ 2769,  2054]]), tensor([[0, 1],
         [1, 0]]), [31, 21])

# QNLI

In [308]:
data = base/'QNLI'

In [329]:
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

b'Skipping line 10344: expected 4 fields, saw 5\nSkipping line 10897: expected 4 fields, saw 5\nSkipping line 11356: expected 4 fields, saw 5\nSkipping line 11367: expected 4 fields, saw 5\nSkipping line 16599: expected 4 fields, saw 5\nSkipping line 17114: expected 4 fields, saw 5\nSkipping line 23153: expected 4 fields, saw 5\nSkipping line 25672: expected 4 fields, saw 5\nSkipping line 31107: expected 4 fields, saw 5\nSkipping line 31359: expected 4 fields, saw 5\nSkipping line 31402: expected 4 fields, saw 5\nSkipping line 32555: expected 4 fields, saw 5\nSkipping line 38524: expected 4 fields, saw 5\nSkipping line 46338: expected 4 fields, saw 5\nSkipping line 47889: expected 4 fields, saw 5\nSkipping line 56759: expected 4 fields, saw 5\nSkipping line 56850: expected 4 fields, saw 5\nSkipping line 56919: expected 4 fields, saw 5\nSkipping line 57514: expected 4 fields, saw 5\nSkipping line 67155: expected 4 fields, saw 5\nSkipping line 75061: expected 4 fields, saw 5\nSkipping li

Unnamed: 0,index,question,sentence,label
0,0,When did the third Digimon series begin?,Unlike the two seasons before it and most of t...,not_entailment
1,1,Which missile batteries often have individual ...,"When MANPADS is operated by specialists, batte...",not_entailment
2,2,What two things does Popper argue Tarski's the...,He bases this interpretation on the fact that ...,entailment
3,3,What is the name of the village 9 miles north ...,"On 31 December 1853, the Ottoman forces at Cal...",entailment
4,4,What famous palace is located in London?,London contains four World Heritage Sites: the...,not_entailment


In [330]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(0, 103106)

In [331]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 5266)

In [332]:
df = testdf

In [333]:
df.head()

Unnamed: 0,index,question,sentence,label
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment


In [334]:
lengths = []
int_label = []
for q, s, l in zip(df['question'].values, df['sentence'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [335]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [336]:
df.head()

Unnamed: 0,index,question,sentence,label,file_length,targets
0,0,What came into force after the new constitutio...,"As of that day, the new constitution heralding...",entailment,24,1
1,1,What is the first major city in the stream of ...,The most important tributaries in this area ar...,not_entailment,35,0
2,2,What is the minimum required if you want to te...,In most provinces a second Bachelor's Degree s...,not_entailment,32,0
3,3,How was Temüjin kept imprisoned by the Tayichi...,The Tayichi'ud enslaved Temüjin (reportedly wi...,entailment,63,1
4,4,"What did Herr Gott, dich loben wir become know...","He paraphrased the Te Deum as ""Herr Gott, dich...",not_entailment,29,0


In [518]:
class QNLILoader(TwoSentenceLoader):
    def __init__(self, tokenizer, max_len, device, base=Path('../data/QNLI'), size=None):
        super().__init__(base, tokenizer, max_len, device, size)        
        
        self.s1_name = 'question'
        self.s2_name = 'sentence'
        self.target_name = 'targets'

In [519]:
for x, y, lengths in QNLILoader(max_len=100, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (103106, 5266)


In [520]:
x, y, lengths

(tensor([[  101,   101],
         [ 2054,  2040],
         [ 2001,  2001],
         [ 1996,  2715],
         [ 2590,  5483],
         [ 6651,  1055],
         [ 1997,  2034],
         [ 1996,  2343],
         [ 4461,  1029],
         [14263,   102],
         [ 2213,  1999],
         [ 1029,  2715],
         [  102,  5483],
         [ 1996,  1055],
         [ 4461,  2034],
         [14263,  2343],
         [ 2213, 22834],
         [ 2001, 11639],
         [ 2028,  2483],
         [ 1997, 10556],
         [ 1996, 27633],
         [ 2087,  2923],
         [ 2590,  4360],
         [ 4204,  2015],
         [ 1997,  2170],
         [ 1996,  2005],
         [ 6188,  2586],
         [ 2457,  1997],
         [ 1012,  9719],
         [    0,  2007],
         [    0,  5483],
         [    0,  1998],
         [    0,  3365],
         [    0,  3576],
         [    0, 10138],
         [    0,  2015],
         [    0,  2165],
         [    0,  2173],
         [    0,  1012]]), tensor([[0, 1],
       

In [521]:
" ".join(tokenizer.convert_ids_to_tokens(x[:, 1].detach().numpy()))

'[CLS] who was modern greece s first president ? [SEP] in modern greece s first president io ##ann ##is ka ##pod ##ist ##ria ##s called for union of cyprus with greece and numerous minor uprising ##s took place .'

In [522]:
y

tensor([[0, 1],
        [0, 1]])

# RTE

In [398]:
data = base/'RTE'
traindf = pd.read_csv(data/'train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
testdf = pd.read_csv(data/'dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False)
traindf.head()

Unnamed: 0,index,sentence1,sentence2,label
0,0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,not_entailment
1,1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,entailment
2,2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,entailment
3,3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,entailment
4,4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,not_entailment


In [399]:
testdf.head()

Unnamed: 0,index,sentence1,sentence2,label
0,0,"Dana Reeve, the widow of the actor Christopher...",Christopher Reeve had an accident.,not_entailment
1,1,"Yet, we now are discovering that antibiotics a...",Bacteria is winning the war against antibiotics.,entailment
2,2,Cairo is now home to some 15 million people - ...,15 million tonnes of rubbish are produced dail...,not_entailment
3,3,"The Amish community in Pennsylvania, which num...",Pennsylvania has the biggest Amish community i...,not_entailment
4,4,Security forces were on high alert after an el...,Security forces were on high alert after a cam...,entailment


In [400]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(1, 2490)

In [401]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(0, 277)

In [402]:
filtered_df = traindf[~traindf.isnull().any(axis=1)]
assert len(filtered_df) == (len(traindf) - len(traindf[traindf.isnull().any(axis=1)]))
traindf = filtered_df

In [408]:
df = testdf
lengths = []
int_label = []
for q, s, l in zip(df['sentence1'].values, df['sentence1'].values, df['label'].values):
    l1 = len(q.split(' '))
    l2 = len(s.split(' '))
    i_label = 0 if l == 'not_entailment' else 1
    assert l in ['not_entailment', 'entailment'], "Value {} not found".format(l)
    lengths.append(l1+l2)
    int_label.append(i_label)

In [409]:
length_df = pd.DataFrame({'file_length': lengths,
                          'targets': int_label})
df = pd.concat([df, length_df], axis=1)

In [410]:
df.head()

Unnamed: 0,index,sentence1,sentence2,label,file_length,targets
0,0,"Dana Reeve, the widow of the actor Christopher...",Christopher Reeve had an accident.,not_entailment,46,0
1,1,"Yet, we now are discovering that antibiotics a...",Bacteria is winning the war against antibiotics.,entailment,62,1
2,2,Cairo is now home to some 15 million people - ...,15 million tonnes of rubbish are produced dail...,not_entailment,254,0
3,3,"The Amish community in Pennsylvania, which num...",Pennsylvania has the biggest Amish community i...,not_entailment,198,0
4,4,Security forces were on high alert after an el...,Security forces were on high alert after a cam...,entailment,46,1


In [416]:
max(df['file_length'].values)

328

In [411]:
name = 'test.csv'
df.to_csv(data/name, index=False)

In [523]:
class RTELoader(TwoSentenceLoader):
    def __init__(self, tokenizer, max_len, device, base=Path('../data/RTE'), size=None):
        super().__init__(base, tokenizer, max_len, device, size)    
        self.s1_name = 'sentence1'
        self.s2_name = 'sentence2'
        self.target_name = 'targets'

In [524]:
for x, y, lengths in RTELoader(max_len=100, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (2490, 277)


In [525]:
x, y, lengths

(tensor([[  101,   101],
         [ 1996,  1057],
         [16807,  1012],
         [ 2078,  1055],
         [18541,  1012],
         [ 2622,  3187],
         [ 1038,  1997],
         [ 5369,  2110],
         [ 2361, 25805],
         [ 8681, 10559],
         [ 1996,  4143],
         [ 2810,  5785],
         [ 1997,  2038],
         [ 1037,  5228],
         [12464,  2014],
         [18541,  4963],
         [ 5477,  2044],
         [ 1996,  1037],
         [ 6726,  3116],
         [ 1997,  2007],
         [ 2049, 25603],
         [ 6451,  2343],
         [ 1998, 13192],
         [ 1996, 13222],
         [ 2311,  2632],
         [ 1997, 24234],
         [ 3141,  4313],
         [ 6502,  2076],
         [ 2164,  2029],
         [ 3229,  2014],
         [ 4925, 14895],
         [ 1012,  2015],
         [  102,  1998],
         [ 1037,  3095],
         [18541,  2020],
         [ 2622, 11094],
         [ 2003, 29313],
         [ 3818,  2094],
         [ 2030,  2011],
         [ 2003, 25603],


In [526]:
" ".join(tokenizer.convert_ids_to_tokens(x[:, 0].detach().numpy()))

'[CLS] the baku ##n hydroelectric project b ##he ##p comprises the construction of a mw hydroelectric dam the transmission of its electricity and the building of related infrastructure including access roads . [SEP] a hydroelectric project is proposed or is under construction . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

# Cola

In [422]:
data = base/'cola_public/raw'

In [432]:
traindf = pd.read_csv(data/'in_domain_train.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
traindf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']
testdf = pd.read_csv(data/'out_of_domain_dev.tsv', delimiter='\t', encoding='utf-8', error_bad_lines=False, header=None)
testdf.columns = ['sentence_source', 'label', 'author_judged', 'sentence']

In [433]:
len(traindf[traindf.isnull().any(axis=1)]), len(traindf)

(6024, 8551)

In [434]:
len(testdf[testdf.isnull().any(axis=1)]), len(testdf)

(353, 516)

In [437]:
# Column 1:	the code representing the source of the sentence.
# Column 2:	the acceptability judgment label (0=unacceptable, 1=acceptable).
# Column 3:	the acceptability judgment as originally notated by the author. 
# Column 4:	the sentence.
traindf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [436]:
testdf.head()

Unnamed: 0,sentence_source,label,author_judged,sentence
0,clc95,1,,Somebody just left - guess who.
1,clc95,1,,"They claimed they had settled on something, bu..."
2,clc95,1,,"If Sam was going, Sally would know where."
3,clc95,1,,"They're going to serve the guests something, b..."
4,clc95,1,,She's reading. I can't imagine what.


In [446]:
df = testdf
lengths = []
for q in df['sentence'].values:
    lengths.append(len(q.split(' ')))

In [447]:
length_df = pd.DataFrame({'file_length': lengths})
df = pd.concat([df, length_df], axis=1)

In [448]:
df.head()

Unnamed: 0,sentence_source,label,author_judged,sentence,file_length
0,clc95,1,,Somebody just left - guess who.,6
1,clc95,1,,"They claimed they had settled on something, bu...",16
2,clc95,1,,"If Sam was going, Sally would know where.",8
3,clc95,1,,"They're going to serve the guests something, b...",11
4,clc95,1,,She's reading. I can't imagine what.,6


In [449]:
name = 'test.csv'
df.to_csv(data/name, index=False)

In [13]:
class SingleSentenceLoader(BatchLoader):
    def __init__(self, base, tokenizer, max_len, device, size=None):
        super().__init__(base, tokenizer, max_len, device, size)
        self.number_classes = 2
        
    def tokenize(self, sentences):
        bert_tokens = [['[CLS]'] + self.tokenizer.tokenize(s) for s in sentences]
        return BERT_tokenize_tokens(self.tokenizer, bert_tokens, self.max_len)

In [14]:
class COLALoader(SingleSentenceLoader):
    def __init__(self, tokenizer, max_len, device, base=Path('../data/cola_public/raw'), size=None):
        super().__init__(base, tokenizer, max_len, device, size)
        
    def prepare(self, df):
        results = list(zip(df['sentence'].values, df['label'].values))
        sents, targets = [e[0] for e in results], [e[1] for e in results]
        return sents, targets

In [15]:
for x, y, lengths in COLALoader(max_len=10, device=device, tokenizer=tokenizer).batch_iter(batch_size=2, train=True, shuffle=True):
    break

Length of (Train, Test) : (8551, 516)


In [16]:
x, y, lengths

(tensor([[  101,   101],
         [ 1996,  1045],
         [ 2775,  4687],
         [ 1998,  2054],
         [ 2014,  2103],
         [ 2388,  2008],
         [14752, 10900],
         [ 1012,  3908],
         [    0,  1012]]), tensor([1, 1]), [8, 9])

In [530]:
x.shape

torch.Size([10, 2])

In [531]:
" ".join(tokenizer.convert_ids_to_tokens(x[:, 0].detach().numpy()))

'[CLS] smith was anne ##aling . [PAD] [PAD] [PAD] [PAD]'

In [532]:
" ".join(tokenizer.convert_ids_to_tokens(x[:, 1].detach().numpy()))

'[CLS] all special rights of voting in the election were'