In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import os
import numpy as np
import torchtext
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from transformers import *
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split
%matplotlib inline
# import spacy

In [2]:
# Select the torch device. The CUDA-aware variant is kept commented out and the
# device is pinned to the CPU instead.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

Reference: [A comprehensive tutorial to torchtext](https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/)

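Reference: [How to use torchtext for neural machine translation, plus a hack to make it 5x faster](https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95)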

In [3]:
# List the entries of the current working directory.
# NOTE(review): the saved output does not show train_data.csv, train.csv or
# cv.csv, which later cells read and write — it looks stale; re-run to refresh.
os.listdir()

['.ipynb_checkpoints',
 'utils.py',
 'aclImdb_v1.tar.gz',
 'aclImdb',
 'preprocessing.ipynb',
 'model.py',
 'classifier.py',
 'join_datasets.ipynb',
 'base.py',
 '__pycache__']

In [5]:
# Load the raw examples and attach `L`: the number of space characters found in
# each value of the `text` column.
data = pd.read_csv('train_data.csv').assign(
    L=lambda frame: frame['text'].str.count(' ')
)

In [6]:
# Keep only the rows whose space count `L` is strictly below 180.
data = data[data['L'] < 180]

In [7]:
# (rows, columns) of the filtered frame.
data.shape

(515, 3)

In [8]:
# Hold out 10% of the rows for validation. The fixed seed makes the split, and the
# train.csv / cv.csv files written from it, identical on every run.
train, cv = train_test_split(data, test_size=0.1, random_state=42)

In [9]:
# Persist both splits without the DataFrame index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
train.to_csv('train.csv', index=False)
cv.to_csv('cv.csv', index=False)

In [10]:
# (rows, columns) of the training and validation splits.
train.shape, cv.shape

((463, 3), (52, 3))

In [11]:
# Candidate architectures. Each entry is a
# (model class, tokenizer class, pretrained weights name) triple.
MODELS = [
    (BertModel, BertTokenizer, 'bert-base-uncased'),
    (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
    (GPT2Model, GPT2Tokenizer, 'gpt2'),
    (CTRLModel, CTRLTokenizer, 'ctrl'),
    (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
    (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
    (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
    (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
    (RobertaModel, RobertaTokenizer, 'roberta-base'),
]

In [12]:
# Pick the architecture by its weights name rather than by a positional index, so
# reordering or extending MODELS cannot silently switch the model in use.
WEIGHTS_NAME = 'distilbert-base-uncased'
model_class, tokenizer_class, weights = next(
    entry for entry in MODELS if entry[2] == WEIGHTS_NAME
)

In [13]:
# Instantiate the selected tokenizer class from the pretrained `weights` name.
tokenizer = tokenizer_class.from_pretrained(weights)

In [14]:
def tokenize(x):
    """Split the string `x` into tokens using the notebook-level `tokenizer`.

    Written as a `def` rather than a lambda bound to a name, so the callable
    carries a real `__name__` in tracebacks and can be pickled by reference.
    """
    return tokenizer.tokenize(x)

In [16]:
# Try the tokenizer on a sample sentence; the saved output shows lower-cased
# tokens and a word split into a '##'-prefixed continuation piece.
tokenize('Will is a massive cunt')

['will', 'is', 'a', 'massive', 'cu', '##nt']

In [None]:
# en = spacy.load('en_core_web_sm')
# def tokenize(sentence):
#     return [tok.text for tok in en.tokenizer(sentence)]

In [None]:
# from torchtext.vocab import Vocab

In [17]:
# Inspect the special tokens (unk / sep / pad / cls / mask) the tokenizer defines.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [None]:
# NOTE(review): scratch cell — builds a Field with default arguments and does not
# assign it; the configured fields are created in the next cell.
Field()

In [18]:
# Text field: tokenise with the pretrained tokenizer and reuse its special tokens,
# so every sequence is laid out as `[CLS] tokens [SEP]` followed by right-padding.
# The closing separator is added via `eos_token`; BERT-style checkpoints expect it.
# For a tokenizer without a separator token, `sep_token` is None, which is the
# Field default and adds nothing.
TEXT = Field(sequential=True,
             tokenize=tokenize,
             use_vocab=True,
             init_token=tokenizer.cls_token,
             eos_token=tokenizer.sep_token,
             pad_token=tokenizer.pad_token,
             unk_token=tokenizer.unk_token,
             pad_first=False,
             batch_first=True)
# Label field: a single non-sequential value per example, used without a vocabulary.
LABEL = Field(use_vocab=False, sequential=False)

In [19]:
# (name, field) pairs handed to TabularDataset in the next cell.
# NOTE(review): presumably matched to the CSV columns by position (text first,
# label second) — confirm against the column order of train.csv / cv.csv.
datafields = [('text', TEXT), ('label', LABEL)]

In [20]:
# Parse the two CSV splits from the current directory into torchtext datasets.
# The header row is skipped and columns are bound to fields through `datafields`.
# Note that `cv` is rebound here from the validation DataFrame to a dataset.
trn, cv = TabularDataset.splits(
    path='.',
    format='csv',
    train='train.csv',
    validation='cv.csv',
    fields=datafields,
    skip_header=True,
)

In [21]:
# Build a vocabulary object for TEXT from both datasets. Its stoi / itos tables
# are replaced with the pretrained tokenizer's vocabulary in the cells below.
TEXT.build_vocab(trn, cv)

In [22]:
# Copy the tokenizer's vocabulary into a plain dict (presumably token -> id).
stoi = dict(tokenizer.vocab)

In [23]:
# Index -> token table. Order the tokens by their id explicitly instead of
# relying on the dict's insertion order happening to match the ids.
itos = sorted(stoi, key=stoi.get)

In [24]:
# Replace the token -> index table of TEXT's vocabulary with the tokenizer's.
TEXT.vocab.stoi = stoi

In [25]:
# Replace the index -> token table of TEXT's vocabulary to match.
TEXT.vocab.itos = itos

In [37]:
# Batch iterators for interactive inspection. They use the notebook-wide `device`
# defined at the top, so changing it there is enough to move the batches.
train_iter, val_iter = BucketIterator.splits(
    (trn, cv),                       # datasets the iterators draw examples from
    batch_sizes=(6, 6),              # (train, validation) batch sizes
    device=device,                   # torch.device on which batch tensors are created
    sort_key=lambda x: len(x.text),  # examples are grouped by token count
    sort_within_batch=True,
    repeat=False,                    # a single pass per iteration; no endless cycling
)

In [38]:
# Pull the first batch from the training iterator.
batch = next(iter(train_iter))

In [39]:
# Show the batch's text tensor (integer token ids, per the saved output).
batch.text

tensor([[  101,  2009,  1005,  1055,  2182,  1012,  2633,  1037,  3185,  3310,
          2041,  2008,  1045,  2064,  9826,  2360,  2003,  4788,  2084,  6554,
          1996,  5830,  3124,  1024,  2740,  7742,  1012,  2664,  1045,  1005,
          1049,  5627,  2000,  6655,  1996,  1996,  2126,  2319,  1005,  1055,
          3428,  1006, 26316,  1007,  2097,  2191,  2062,  2769,  2084,  1045,
          2412,  2191,  1999,  2026,  2878,  2166,  2006,  2054,  2003,  2469,
          2000,  2022,  2028,  1997,  1996,  2327,  2274,  5409,  3152,  1997,
          2035,  2051,  1010,  2648,  1997,  2026,  3587,  3694,  2648,  1996,
          2465,  2128,  1011, 26465,  1997, 12390,  1998, 13707,  1012,  1045,
          2812,  2428,  2339,  2052,  3087,  2412,  2412,  2156,  2023,  3185,
          4983,  2027,  2020,  3825,  2000,  1012,  1996,  4038,  2003,  5410,
          1998,  2035,  2130, 19512,  6057, 13198,  2013,  1996, 13109,  5714,
          6508,  5436,  2020,  7543,  3936,  1999, 1

In [45]:
# Convert row 4 of the batch back into a string with the tokenizer.
tokenizer.decode(batch.text[4].tolist())



In [None]:
# Label of the first example in the batch.
batch.label[0]

In [None]:
# Grab the first training batch. Fetching it with next(iter(...)) avoids a loop
# variable named `data`, which would overwrite the DataFrame of the same name
# created when the CSV was loaded.
x = next(iter(train_iter))

In [None]:
# Dimensions of the text tensor in batch `x`.
x.text.shape

In [None]:
# Instantiate the selected model class from the pretrained `weights` name.
model = model_class.from_pretrained(weights)

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

In [None]:
# Inference-only forward pass on batch `x`.
# The mask is 1 for real tokens and 0 for padding, so padded positions are not
# attended to; no_grad skips autograd bookkeeping that is not needed here.
# NOTE(review): assumes the selected model's forward accepts `attention_mask`
# (true for the BERT-family classes) — confirm if another MODELS entry is chosen.
attention_mask = (x.text != tokenizer.pad_token_id).long()
with torch.no_grad():
    res = model(x.text, attention_mask=attention_mask)[0]

In [None]:
# Dimensions of the first element of the model's output.
res.shape

### Export

In [None]:
def example_length(example):
    """Sort key for bucketing: the number of tokens in an example's text."""
    return len(example.text)


# Rebuild the iterators for export, targeting CUDA. The sort key is a named
# function because a lambda cannot be pickled, and the next cell serialises these
# iterators with torch.save.
train_iter, val_iter = BucketIterator.splits(
    (trn, cv),                    # datasets the iterators draw examples from
    batch_sizes=(6, 6),           # (train, validation) batch sizes
    device=torch.device('cuda'),  # batches are created on the GPU
    sort_key=example_length,
    sort_within_batch=False,
    repeat=False,                 # a single pass per iteration; no endless cycling
)

In [None]:
# Serialise both iterators to disk.
# NOTE(review): torch.save pickles the whole object, so everything reachable from
# the iterators (datasets, fields, tokenize / sort-key callables) must be
# picklable, and functions pickled by reference must be importable when the files
# are loaded — confirm this cell runs and that the files load in the training code.
torch.save(train_iter, 'train_iter.pt')
torch.save(val_iter, 'cv_iter.pt')

In [None]:
# Inspect the `train` attribute of the validation iterator.
val_iter.train