In [1]:
from IPython.core.interactiveshell import InteractiveShell

# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# External Lib imports
import re
import html
import ujson
import pickle
import sklearn
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from pprint import pprint
from functools import partial
from typing import AnyStr, Callable
from sklearn.model_selection import train_test_split

import os
os.environ['QT_QPA_PLATFORM']='offscreen'

# FastAI Imports
from fastai import text, core, lm_rnn

# Torch imports
import torch
import torch.nn as nn
import torch.tensor as T
import torch.nn.functional as F

# Mytorch imports
from mytorch import loops, lriters
from mytorch.utils.goodies import *

device = torch.device('cuda')
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f7308d0ef90>

In [3]:
def get_label_via_parsing(_uri, lower=False):

    # Sanity strip: remove all '<' and '>' from here
    _uri = _uri.replace('<', '')
    _uri = _uri.replace('>', '')

    parsed = urlparse(_uri)
    path = os.path.split(parsed.path)
    unformated_label = path[-1]
    label = convert(unformated_label)
    label = " ".join(label.split("_"))
    if lower:
        return label.lower()
    return label

In [4]:
class CustomEncoder(lm_rnn.RNN_Encoder):

    @property
    def layers(self):
        return torch.nn.ModuleList([
#             torch.nn.ModuleList([self.encoder, self.encoder_with_dropout]),
            torch.nn.ModuleList([self.rnns[0], self.dropouths[0]])
#             torch.nn.ModuleList([self.rnns[1], self.dropouths[1]]),
#             torch.nn.ModuleList([self.rnns[2], self.dropouths[2]])
        ])


class CustomLinear(text.LinearDecoder):

    @property
    def layers(self):
        return torch.nn.ModuleList([self.decoder, self.dropout])


class LanguageModel(nn.Module):

    def __init__(self,
                 _parameter_dict,
                 _device,
#                  _wgts_e,
#                  _wgts_d,
                 _encargs):
        super(LanguageModel, self).__init__()

        self.parameter_dict = _parameter_dict
        self.device = _device

        self.encoder = CustomEncoder(**_encargs).to(self.device)
#         self.encoder.load_state_dict(_wgts_e)
        """
            Explanation:
                400*3 because input is [ h_T, maxpool, meanpool ]
                0.4, 0.1 are drops at various layersLM_PATH
        """
        self.linear = CustomLinear(
            _encargs['ntoken'],
            n_hid=400,
            dropout=0.1 * 0.7,
            tie_encoder=self.encoder.encoder,
            bias=False
        ).to(self.device)
        self.encoder.reset()

    def forward(self, x):
        # Encoding all the data
        op_p = self.encoder(x)

        # pos_batch = op_p[1][-1][-1]
        score = self.linear(op_p)[0]

        return score

    @property
    def layers(self):
        layers = [x for x in self.encoder.layers]
        layers += [x for x in self.linear.layers]
        return torch.nn.ModuleList(layers)

    @property
    def layers_rev(self):
        layers = [x for x in self.encoder.layers]
        layers += [x for x in self.linear.layers]
        layers.reverse()
        return torch.nn.ModuleList(layers)

    def predict(self, x):
        with torch.no_grad():
            self.eval()
            pred = self.forward(x)
            self.train()
            return pred


def eval(y_pred, y_true):
    """
        Expects a batch of input

        :param y_pred: tensor of shape (b, nc)
        :param y_true: tensor of shape (b, 1)
    """
    return torch.mean((torch.argmax(y_pred, dim=1) == y_true).float())

In [5]:
DEBUG = True

# Path fields
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

WIKI_DATA_PATH = Path('raw/wikitext/wikitext-103/')
WIKI_DATA_PATH.mkdir(exist_ok=True)
IMDB_DATA_PATH = Path('raw/imdb/aclImdb/')
IMDB_DATA_PATH.mkdir(exist_ok=True)
PATH = Path('resources/proc/imdb')
DATA_PROC_PATH = PATH / 'data'
DATA_LM_PATH = PATH / 'datalm'

LM_PATH = Path('resources/models')
LM_PATH.mkdir(exist_ok=True)
PRE_PATH = LM_PATH / 'wt103'
PRE_LM_PATH = PRE_PATH / 'fwd_wt103.h5'
CLASSES = ['neg', 'pos', 'unsup']
WIKI_CLASSES = ['wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens']

In [6]:
def get_texts_org(path):
    texts, labels = [], []
    for idx, label in enumerate(CLASSES):
        for fname in (path / label).glob('*.*'):
            texts.append(fname.open('r', encoding='utf-8').read())
            labels.append(idx)
    return np.array(texts), np.array(labels)

trn_texts, trn_labels = get_texts_org(IMDB_DATA_PATH / 'train')
val_texts, val_labels = get_texts_org(IMDB_DATA_PATH / 'test')
col_names = ['labels', 'text']
print(len(trn_texts), len(val_texts))

75000 25000


In [7]:
def is_valid_sent(x):
    x = x.strip()
    if len(x) == 0: return False
    if x[0] == '=' and x[-1] == '=': return False
    return True
def wiki_get_texts_org(path):
    texts = []
    for idx, label in enumerate(WIKI_CLASSES):
        with open(path / label, encoding='utf-8') as f:
            texts.append([sent.strip() for sent in f.readlines() if is_valid_sent(sent)])
    return tuple(texts)
wiki_trn_texts, wiki_val_texts, wiki_tst_texts = wiki_get_texts_org(WIKI_DATA_PATH)
print(len(wiki_trn_texts), len(wiki_val_texts))

859955 1841


In [8]:
trn_texts, val_texts = trn_texts[:1000], val_texts[:1000]
wiki_trn_texts, wiki_val_texts, wiki_tst_texts = wiki_trn_texts, wiki_val_texts, wiki_tst_texts

In [9]:
# Shuffle data
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))

trn_texts, trn_labels = trn_texts[trn_idx], trn_labels[trn_idx]
val_texts, val_labels = val_texts[val_idx], val_labels[val_idx]

# Shuffle data (wiki)
np.random.shuffle(wiki_trn_texts)
np.random.shuffle(wiki_val_texts)
np.random.shuffle(wiki_tst_texts)

wiki_trn_labels = [0 for _ in wiki_trn_texts]
wiki_val_labels = [0 for _ in wiki_val_texts]
wiki_tst_labels = [0 for _ in wiki_val_texts]

### Dataframe black magic

In [10]:
chunksize = 24000
re1 = re.compile(r'  +')
def _fixup_(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
def _get_texts_(df, n_lbls=1):
    labels = df.iloc[:, range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls + 1, len(df.columns)): texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    texts = list(texts.apply(_fixup_).values)

    tok = text.Tokenizer().proc_all_mp(core.partition_by_cores(texts))
    return tok, list(labels)

def _simple_apply_fixup_(df):
    labels = [0] * df.shape[0]
    texts = f'\n{BOS} {FLD} 1 ' + df.text
    texts = list(texts.apply(_fixup_).values)
    tok = text.Tokenizer().proc_all_mp(core.partition_by_cores(texts))
    return tok, list(labels)
    
def get_all(df, n_lbls):
    tok, labels = [], []
    for i, r in enumerate(df):
        print(i)
        tok_, labels_ = _get_texts_(r, n_lbls)
        tok += tok_;
        labels += labels_
    return tok, labels

In [11]:
trn_texts, val_texts = sklearn.model_selection.train_test_split(
        np.concatenate([trn_texts, val_texts]), test_size=0.1)

if DEBUG:
    print(len(trn_texts), len(val_texts))

df_trn = pd.DataFrame({'text': trn_texts, 'labels': [0] * len(trn_texts)}, columns=col_names)
df_val = pd.DataFrame({'text': val_texts, 'labels': [0] * len(val_texts)}, columns=col_names)

trn_tok, trn_labels = _simple_apply_fixup_(df_trn)
val_tok, val_labels = _simple_apply_fixup_(df_val)

if DEBUG:
    print(f"Trn: {len(trn_tok), len(trn_labels)}, Val: {len(val_tok), len(val_labels)} ")

1800 200
Trn: (1800, 1800), Val: (200, 200) 


In [12]:
wiki_trn_texts, wiki_val_texts = sklearn.model_selection.train_test_split(
        np.concatenate([wiki_trn_texts, wiki_val_texts, wiki_tst_texts]), test_size=0.1)

if DEBUG:
    print(len(wiki_trn_texts), len(wiki_val_texts))
    
wiki_df_trn = pd.DataFrame({'text':wiki_trn_texts, 'labels': [0] * len(wiki_trn_texts)}, columns=col_names)
wiki_df_val = pd.DataFrame({'text':wiki_val_texts, 'labels': [0] * len(wiki_val_texts)}, columns=col_names)

wiki_trn_tok, wiki_trn_labels = _simple_apply_fixup_(wiki_df_trn)
wiki_val_tok, wiki_val_labels = _simple_apply_fixup_(wiki_df_val)

if DEBUG:
    print(f"Trn: {len(wiki_trn_tok), len(wiki_trn_labels)}, Val: {len(wiki_val_tok), len(wiki_val_labels)} ")

777582 86399
Trn: (777582, 777582), Val: (86399, 86399) 


In [13]:
'''
    Now we make vocabulary, select 60k most freq words 
        (we do this looking only at imdb, and ignore wiki here)
'''

freq = Counter(p for o in wiki_trn_tok for p in o)
# freq.most_common(25)
max_vocab = 60000
min_freq = 2

itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')
stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
vs = len(itos)

'\n    Now we make vocabulary, select 60k most freq words \n        (we do this looking only at imdb, and ignore wiki here)\n'

In [14]:
vs

60002

In [15]:
trn_lm = np.array([[stoi[o] for o in p] for p in trn_tok])
val_lm = np.array([[stoi[o] for o in p] for p in val_tok])

if DEBUG:
    print(f"ITOS: {len(itos)}, STOI: {len(stoi)}")
    
wiki_trn_lm = np.array([[stoi[o] for o in p] for p in wiki_trn_tok])
wiki_val_lm = np.array([[stoi[o] for o in p] for p in wiki_val_tok])
    
if DEBUG:
    print(f"ITOS: {len(itos)}, STOI: {len(stoi)}")

ITOS: 60002, STOI: 66749
ITOS: 60002, STOI: 268351


In [16]:
"""
    Now we pull pretrained models from disk
"""
em_sz, nh, nl = 400, 256, 1
# row_m = enc_wgts.mean(0)
# PRE_PATH = PATH / 'models' / 'wt103'
# PRE_LM_PATH = PRE_PATH / 'fwd_wt103.h5'
# wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)
# enc_wgts = core.to_np(wgts['0.encoder.weight'])

# itos2 = pickle.load((PRE_PATH / 'itos_wt103.pkl').open('rb'))
# stoi2 = collections.defaultdict(lambda: -1, {v: k for k, v in enumerate(itos2)})
# new_w = np.zeros((vs, em_sz), dtype=np.float32)
# for i, w in enumerate(itos):
#     r = stoi2[w]
#     new_w[i] = enc_wgts[r] if r >= 0 else row_m

# wgts['0.encoder.weight'] = T(new_w)
# wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
# wgts['1.decoder.weight'] = T(np.copy(new_w))
# wgts_enc = {'.'.join(k.split('.')[1:]): val
#             for k, val in wgts.items() if k[0] == '0'}
# wgts_dec = {'.'.join(k.split('.')[1:]): val
#             for k, val in wgts.items() if k[0] == '1'}

'\n    Now we pull pretrained models from disk\n'

In [17]:
wd = 1e-7
bptt = 70
bs = 26
opt_fn = partial(torch.optim.Adam, betas=(0.8, 0.99))  # @TODO: find real optimizer, and params

# Load the pre-trained model
# parameter_dict = {'itos2': itos2}
parameter_dict = {}
dps = list(np.asarray([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.7)
encargs = {'ntoken': vs,
           'emb_sz': 400, 'n_hid': 256,
           'n_layers': 1, 'pad_token': 0,
           'qrnn': False, 'dropouti': dps[0],
           'wdrop': dps[2], 'dropoute': dps[3], 'dropouth': dps[4]}

# For now, lets assume our best lr = 0.001
bestlr = 0.001*5
lm = LanguageModel(parameter_dict, device, encargs)
opt = make_opt(lm, opt_fn, lr=bestlr)

In [18]:
data_fn = partial(text.LanguageModelLoader, bs=bs, bptt=bptt)
data = {'train': np.concatenate(wiki_trn_lm), 'valid': np.concatenate(wiki_val_lm)}
loss_fn = F.cross_entropy

In [19]:
'''
    Schedule

    -> Freeze all but last layer, run for 1 epoch
    -> Unfreeze all of it, and apply discriminative fine-tuning, train normally.
'''
# for grp in opt.param_groups:
#     grp['lr'] = 0.0
# opt.param_groups[-1]['lr'] = 1e-3

lr_args = {'iterations': len(data_fn(data['train']))*1}
# lr_args = {'batches':, 'cycles': 1}
# lr_args = {'iterations': len(data_fn(data['train']))*1, 'cut_frac': 0.1, 'ratio': 32}
lr_schedule = lriters.LearningRateScheduler(opt, lr_args, lriters.ConstantLR)

args = {'epochs': 10, 'weight_decay': 0, 'data': data,
        'device': device, 'opt': opt, 'loss_fn': loss_fn, 'train_fn': lm,
        'predict_fn': lm.predict, 'data_fn': data_fn, 'model': lm,
        'eval_fn': eval, 'epoch_start_hook': partial(loops.reset_hidden, lm),
        'clip_grads_at': 0.7, 'lr_schedule': lr_schedule}
traces_start = loops.generic_loop(**args)

'\n    Schedule\n\n    -> Freeze all but last layer, run for 1 epoch\n    -> Unfreeze all of it, and apply discriminative fine-tuning, train normally.\n'

 17%|█▋        | 18333/110482 [10:47<57:17, 26.81it/s]  

KeyboardInterrupt: 

 17%|█▋        | 18333/110482 [11:00<57:17, 26.81it/s]