In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import os
import numpy as np
import torchtext
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from transformers import *
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split
%matplotlib inline
# import spacy

In [2]:
# The notebook is pinned to the CPU. To pick a GPU automatically when one is
# available, use instead:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")

Reference tutorials for the torchtext pipeline used in this notebook:

- https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
- https://towardsdatascience.com/how-to-use-torchtext-for-neural-machine-translation-plus-hack-to-make-it-5x-faster-77f3884d95

In [3]:
# List the contents of the current working directory.
os.listdir(os.curdir)

['.ipynb_checkpoints',
 'utils.py',
 'aclImdb',
 'preprocessing.ipynb',
 'README.md',
 'model.py',
 'train_data.csv',
 'classifier.py',
 '.git',
 'join_datasets.ipynb',
 'base.py']

In [4]:
# Load the training data and add column `L`: the number of space characters in
# each text, used below as a rough measure of text length.
data = pd.read_csv('train_data.csv').assign(
    L=lambda frame: frame['text'].str.count(' ')
)

In [6]:
# Summary statistics of the per-text space count, to choose a length cut-off.
data['L'].describe()

count    10000.00000
mean       230.10900
std        170.06096
min          9.00000
25%        125.00000
50%        172.00000
75%        280.00000
max       1526.00000
Name: L, dtype: float64

In [7]:
# Keep only the short texts: rows whose space count `L` is below 100.
is_short = data['L'] < 100
data = data[is_short]

In [8]:
# (rows, columns) remaining after the length filter.
data.shape

(1168, 3)

In [9]:
# Hold out 20% of the filtered rows for validation. The fixed random_state makes
# the split (and the train.csv / cv.csv files written from it) reproducible
# across kernel restarts.
train, cv = train_test_split(data, test_size=0.2, random_state=42)

In [10]:
# Persist both splits as CSV without the pandas index column so they can be
# read back by file name later in the notebook.
for frame, csv_path in ((train, 'train.csv'), (cv, 'cv.csv')):
    frame.to_csv(csv_path, index=False)

In [11]:
# Sizes of the training and validation splits.
train.shape, cv.shape

((934, 3), (234, 3))

In [12]:
# Candidate pretrained encoders. Each entry is a
# (model class, tokenizer class, pretrained weights name) triple.
MODELS = [
    (BertModel, BertTokenizer, 'bert-base-uncased'),
    (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
    (GPT2Model, GPT2Tokenizer, 'gpt2'),
    (CTRLModel, CTRLTokenizer, 'ctrl'),
    (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
    (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
    (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
    (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
    (RobertaModel, RobertaTokenizer, 'roberta-base'),
]

In [13]:
# Select the encoder by its weights name rather than by list position, so that
# adding, removing or reordering entries in MODELS cannot silently switch the
# architecture. Raises StopIteration if the name is not present in MODELS.
PRETRAINED_WEIGHTS = 'distilbert-base-uncased'
model_class, tokenizer_class, weights = next(
    entry for entry in MODELS if entry[2] == PRETRAINED_WEIGHTS
)

In [14]:
# Instantiate the tokenizer that belongs to the selected pretrained weights.
tokenizer = tokenizer_class.from_pretrained(weights)

In [15]:
def tokenize(x):
    """Split the string `x` into tokens using the pretrained `tokenizer`.

    Defined as a named function rather than a lambda so that it has a real
    name in tracebacks and can be pickled by reference.
    """
    return tokenizer.tokenize(x)

In [16]:
# Sanity check: show how a sample sentence is split into tokens.
tokenize('Will is a massive cunt')

['will', 'is', 'a', 'massive', 'cu', '##nt']

In [None]:
# en = spacy.load('en_core_web_sm')
# def tokenize(sentence):
#     return [tok.text for tok in en.tokenizer(sentence)]

In [None]:
# from torchtext.vocab import Vocab

In [17]:
# Special tokens defined by the tokenizer; these are reused when building the
# torchtext text field.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [None]:
# Scratch cell: constructs a Field with default arguments; the result is only
# displayed, not assigned.
Field()

In [None]:
# Text field: tokenised with the pretrained tokenizer and using that tokenizer's
# own special tokens, batch dimension first, padding appended at the end.
TEXT = Field(
    sequential=True,
    tokenize=tokenize,
    use_vocab=True,
    init_token=tokenizer.cls_token,
    # BERT-style encoders expect "[CLS] tokens [SEP]"; without an eos_token the
    # closing separator was never added to the sequences.
    eos_token=tokenizer.sep_token,
    pad_token=tokenizer.pad_token,
    unk_token=tokenizer.unk_token,
    pad_first=False,
    batch_first=True,
)
# Label field: one value per example, no vocabulary lookup.
LABEL = Field(use_vocab=False, sequential=False)

In [None]:
# (name, Field) pairs passed as `fields` when the CSV splits are loaded.
# NOTE(review): presumably matched to the CSV columns by position — confirm the
# column order of train.csv / cv.csv.
datafields = [('text', TEXT), ('label', LABEL)]

In [None]:
# Read the two CSV splits from the current directory as torchtext datasets,
# skipping the header row and using the fields declared in `datafields`.
# Note that `cv` now refers to the validation dataset, not the DataFrame.
trn, cv = TabularDataset.splits(
    path='.',
    format='csv',
    train='train.csv',
    validation='cv.csv',
    fields=datafields,
    skip_header=True,
)

In [None]:
# Build a vocabulary object for TEXT from both splits. Its stoi / itos mappings
# are replaced with the pretrained tokenizer's vocabulary in the cells below.
TEXT.build_vocab(trn, cv)

In [None]:
# Plain-dict copy of the tokenizer's vocabulary.
# Presumably maps token string -> integer id (hence "stoi") — confirm.
stoi = dict(tokenizer.vocab)

In [None]:
# Tokens ordered by their id, so that itos[i] is the token with id i (assuming
# ids are contiguous from 0). Sorting explicitly avoids depending on the
# iteration order of `stoi` happening to match the id order.
itos = sorted(stoi, key=stoi.get)

In [None]:
# Replace the vocabulary built from the data with the tokenizer's mapping, so
# tokens are looked up in the pretrained tokenizer's vocabulary.
TEXT.vocab.stoi = stoi

In [None]:
# Keep the reverse lookup consistent with the replaced stoi mapping.
TEXT.vocab.itos = itos

In [None]:
# Iterators for interactive inspection on the CPU.
train_iter, val_iter = BucketIterator.splits(
    (trn, cv),                        # datasets the iterators draw examples from
    batch_sizes=(6, 6),               # one batch size per dataset
    sort_key=lambda x: len(x.text),   # examples are grouped by text length
    sort_within_batch=True,
    repeat=False,                     # a single pass over the data per iteration
    device=torch.device('cpu'),
)

In [None]:
# Pull a single batch from the training iterator for inspection.
batch = next(iter(train_iter))

In [None]:
# Numericalised text of the sampled batch.
batch.text

In [None]:
# Round-trip check: decode the ids of the batch's fifth row back to text.
tokenizer.decode(batch.text[4].tolist())

In [None]:
# Label of the first example in the sampled batch.
batch.label[0]

In [None]:
# Take the first batch of the training iterator. The previous for/break loop
# used `data` as its loop variable, which overwrote the DataFrame of the same
# name; next(iter(...)) fetches the same batch without rebinding it.
x = next(iter(train_iter))

In [None]:
# Shape of the batch's text tensor; TEXT is declared with batch_first=True, so
# this should be (batch, sequence length).
x.text.shape

In [None]:
# Instantiate the selected model class from the same pretrained weights name
# that was used for the tokenizer.
model = model_class.from_pretrained(weights)

In [None]:
# Switch the model to evaluation mode before running the test forward pass.
model.eval()

In [None]:
# Forward pass for inspection only: autograd is switched off, and padded
# positions are masked so the encoder does not attend to [PAD] tokens.
# `res` is the first element of the model's output, as before.
with torch.no_grad():
    attention_mask = (x.text != tokenizer.pad_token_id).long()
    res = model(x.text, attention_mask=attention_mask)[0]

In [None]:
# Shape of the model's first output.
# Presumably (batch, sequence length, hidden size) — confirm.
res.shape

### Export

In [None]:
def text_length(example):
    """Return the length of an example's `text` field (used as the sort key)."""
    return len(example.text)


# Rebuild the iterators for export. The sort key is a named function instead of
# a lambda because the iterators are serialised with torch.save below, which
# pickles them, and pickle cannot serialise a lambda. Code that loads the saved
# iterators must be able to resolve `text_length` as well.
train_iter, val_iter = BucketIterator.splits(
    (trn, cv),                        # datasets the iterators draw examples from
    batch_sizes=(6, 6),               # one batch size per dataset
    # NOTE(review): hard-coded CUDA device, unlike the CPU device used for the
    # exploration cells above — confirm the consumer of the export has a GPU.
    device=torch.device('cuda'),
    sort_key=text_length,             # examples are grouped by text length
    sort_within_batch=False,
    repeat=False,                     # a single pass over the data per iteration
)

In [None]:
# Serialise both iterators to disk.
for iterator, out_path in ((train_iter, 'train_iter.pt'), (val_iter, 'cv_iter.pt')):
    torch.save(iterator, out_path)

In [None]:
# Display the validation iterator's `train` attribute.
# Presumably False for a validation iterator — confirm.
val_iter.train