In [1]:
from pathlib import Path
import pandas as pd
import fastbook
fastbook.setup_book()
from fastbook import *

## Fetch dataset

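Download the competition data from https://www.kaggle.com/c/nlp-getting-started/data and extract it into `../data/nlp-getting-started/` (the path the next cell expects).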

In [22]:
# Expected location of the extracted Kaggle "nlp-getting-started" files,
# relative to the notebook's working directory.
dataset_path = Path('../data/nlp-getting-started/')

In [23]:
# Fail fast with a readable message when the data has not been downloaded.
# `Error` is not a Python builtin, so the previous `raise Error(...)` surfaced
# as a confusing NameError; FileNotFoundError is the matching builtin exception.
if not dataset_path.exists():
    raise FileNotFoundError(
        f"Dataset directory '{dataset_path}' not found. "
        "Make sure to download the dataset first"
    )

In [56]:
# Read the labelled training tweets into a DataFrame and display it.
df = pd.read_csv(dataset_path.joinpath('train.csv'))
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


## Train / validation split

In [57]:
# Hold out roughly 10% of the rows for validation by flagging them in an
# `is_valid` column (consumed by the `ColSplitter()` calls further down).
#
# The mask is drawn from a locally seeded generator so the split is identical
# after every Restart & Run All. With an unseeded draw, re-running the notebook
# reshuffles the validation rows while previously saved weights may already
# have been trained on them.
SPLIT_SEED = 42
VALID_FRACTION = 0.1

rng = np.random.default_rng(SPLIT_SEED)
valid_mask = rng.random(len(df)) < VALID_FRACTION

df['is_valid'] = 0
df.loc[valid_mask, 'is_valid'] = 1
df['is_valid'].value_counts()

0    6831
1     782
Name: is_valid, dtype: int64

## Word Tokenizer

Let's take a look at the default behavior of the fastai tokenizer.

One thing to notice: hashtags in the tweets are split into the hash symbol and the word (e.g. `#earthquake` becomes `#`, `earthquake`). This may or may not be beneficial for the model.

In [11]:
from fastai.text.all import *

In [58]:
# Instantiate fastai's default word tokenizer.
# NOTE(review): the variable name `spacy` would shadow the spaCy module if that
# module were ever imported into this namespace — consider renaming.
spacy = WordTokenizer()
# Tokenize the tweet texts and keep only the first tokenized tweet for inspection.
tokens = first(spacy(df.text))
# Print a compact representation of the token collection (the 30 is presumably
# the maximum number of items shown — TODO confirm).
print(coll_repr(tokens, 30))

(#14) ['Our','Deeds','are','the','Reason','of','this','#','earthquake','May','ALLAH','Forgive','us','all']


## Subword tokenizer

In [16]:
def subword(size, texts, max_tokens=40):
    """Fit a subword tokenizer on `texts` and preview it on the first text.

    Args:
        size: Vocabulary size handed to `SubwordTokenizer(vocab_sz=...)`.
        texts: Iterable of strings (e.g. a list or a pandas Series). It is used
            both to fit the tokenizer and to pick the preview sample.
        max_tokens: Maximum number of leading subword tokens included in the
            preview. Defaults to 40, the value that used to be hard-coded.

    Returns:
        The first `max_tokens` subword tokens of the first text, joined by
        single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=size)
    tokenizer.setup(texts)
    # Take the first element positionally. `texts[0]` is a *label* lookup on a
    # pandas Series and fails when the index does not contain 0 (for example
    # after filtering or shuffling the frame).
    sample = first(texts)
    return ' '.join(first(tokenizer([sample]))[:max_tokens])

In [59]:
# Preview the first tweet with a 1,000-entry subword vocabulary.
subword(1000, df.text)

'▁O ur ▁De ed s ▁are ▁the ▁Re a son ▁of ▁this ▁# earthquake ▁Ma y ▁A L L A H ▁For g ive ▁us ▁all'

In [60]:
# A much smaller vocabulary (200): compare how the same tweet is split in the output below.
subword(200, df.text)

'▁ O ur ▁D e ed s ▁a re ▁the ▁ R e as on ▁of ▁ th is ▁# e ar th q u a k e ▁M ay ▁A L L A H ▁F or g i v'

In [61]:
# A larger vocabulary (10,000): compare how the same tweet is split in the output below.
subword(10_000, df.text)

'▁Our ▁De ed s ▁are ▁the ▁Reason ▁of ▁this ▁# earthquake ▁May ▁ALL AH ▁For g ive ▁us ▁all'

## Define DataLoaders for language model (map sequence to sequence)

In [65]:
# DataLoaders for language-model fine-tuning. With is_lm=True no separate label
# is read: the `text` / `text_` columns of the batch shown in the next cell are
# the same token stream offset by one token.
dls_lm = DataBlock(
    blocks=TextBlock.from_df('text', is_lm=True),
    get_x=ColReader('text'),
    splitter=ColSplitter()  # split taken from a dataframe column (presumably `is_valid`, added above — TODO confirm default)
).dataloaders(df, bs=64, seq_len=80)

  return array(a, dtype, copy=False, order=order)


In [66]:
# Show two input/target pairs; in the output, `text_` is `text` shifted by one token.
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,xxbos xxunk xxunk xxunk and flattened xxunk . xxmaj xxunk was a xxunk xxunk biker marine not a xxunk xxunk western hero . xxbos xxmaj reddit updates content policy promises to quarantine û÷extremely offensiveûª communities http : / / t.co / xxunk xxbos holy crap xxunk xxrep 3 9 my phone just exploded . haha xxbos xxmaj australia 's xxmaj ashes disaster - how the collapse xxunk at xxmaj trent xxmaj bridge … http : / / t.co / xxunk,xxunk xxunk xxunk and flattened xxunk . xxmaj xxunk was a xxunk xxunk biker marine not a xxunk xxunk western hero . xxbos xxmaj reddit updates content policy promises to quarantine û÷extremely offensiveûª communities http : / / t.co / xxunk xxbos holy crap xxunk xxrep 3 9 my phone just exploded . haha xxbos xxmaj australia 's xxmaj ashes disaster - how the collapse xxunk at xxmaj trent xxmaj bridge … http : / / t.co / xxunk \n▁
1,xxunk in downtown xxmaj xxunk _ http : / / t.co / xxunk ' xxbos xxup closing xxup their xxup eyes xxup to xxup disaster ! xxmaj state xxmaj department xxmaj xxunk of xxmaj reports xxmaj iran is xxmaj xxunk xxmaj nuclear xxmaj sites http : / / t.co / xxunk xxbos xxunk * blight xxbos xxmaj my woman crush xxunk goes to the beautiful xxunk # xxunk xxrep 4 u # xxunk https : / / t.co / xxunk,in downtown xxmaj xxunk _ http : / / t.co / xxunk ' xxbos xxup closing xxup their xxup eyes xxup to xxup disaster ! xxmaj state xxmaj department xxmaj xxunk of xxmaj reports xxmaj iran is xxmaj xxunk xxmaj nuclear xxmaj sites http : / / t.co / xxunk xxbos xxunk * blight xxbos xxmaj my woman crush xxunk goes to the beautiful xxunk # xxunk xxrep 4 u # xxunk https : / / t.co / xxunk xxbos


## Load pre-trained language model and fine-tune

In [69]:
# Build the language-model learner on the AWD_LSTM architecture, tracking
# next-token accuracy and perplexity on the validation split.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,  # global multiplier to control all dropouts
    metrics=[accuracy, Perplexity()]
).to_fp16()  # mixed-precision (fp16) training; typically faster and lighter on GPU memory — see fastai `Learner.to_fp16`





In [71]:
# First pass: a single one-cycle epoch at learning rate 2e-2, run before
# `learn.unfreeze()` is called in a later cell.
learn.fit_one_cycle(1, 2e-2)



epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.973021,3.431079,0.413574,30.909988,03:52


In [72]:
# Checkpoint the learner after the first epoch (the output shows models/1epoch.pth).
learn.save('1epoch')

Path('models/1epoch.pth')

In [73]:
# Unfreeze the model and continue fine-tuning for 10 epochs at a lower learning rate.
# NOTE(review): in the logged run below, valid_loss is lowest at epoch 2 (2.996)
# and rises to 3.428 by epoch 9 while train_loss keeps falling — this looks like
# overfitting; consider fewer epochs or keeping the best checkpoint.
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)



epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.236706,3.266666,0.437012,26.223755,05:44
1,3.091121,3.093074,0.460498,22.044735,16:49
2,2.86785,2.995869,0.482617,20.002728,05:49
3,2.622933,3.048167,0.483545,21.076679,05:42
4,2.318272,3.065799,0.497461,21.451599,05:41
5,2.005485,3.169921,0.49668,23.805595,05:40
6,1.713216,3.282394,0.493555,26.639482,05:40
7,1.474762,3.349734,0.494092,28.495155,05:40
8,1.297634,3.424819,0.492578,30.717093,05:40
9,1.203858,3.427797,0.492822,30.808691,05:40


In [74]:
# Save the encoder under the name 'finetuned'; the classifier section reloads
# it with `load_encoder('finetuned')`.
learn.save_encoder('finetuned')

## Text generation

In [76]:
# Sampling configuration: the prompt, how many words to generate per sample,
# and how many independent samples to draw.
SEED_TEXT = "I just saw"
N_WORDS = 40
N_SENTENCES = 2

# Draw the samples one at a time from the fine-tuned language model.
preds = []
while len(preds) < N_SENTENCES:
    preds.append(learn.predict(SEED_TEXT, N_WORDS, temperature=0.75))

# One generated continuation per line.
print(*preds, sep='\n')









i just saw my twins twins kids killed in the apocalypse by Snow ! # emmerdale an Accident on # AK Rail Roads East German Fatalities http : / / t.co /
i just saw a picture of a man who looks like a white man . I m not on screen . He is on stage . I 've been bleeding up The Latest : More


## Dataloaders for training the classifier

In [79]:
# DataLoaders for the classifier: tokenized tweet text as input, the `target`
# column as the category label.
# vocab=dls_lm.vocab reuses the language model's vocabulary, presumably so that
# token ids match the fine-tuned encoder loaded later — TODO confirm.
dls_classifier = DataBlock(
    blocks=(TextBlock.from_df('text', vocab=dls_lm.vocab), CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('target'),
    splitter=ColSplitter()
).dataloaders(df, bs=64, seq_len=80)

  return array(a, dtype, copy=False, order=order)


In [82]:
# Inspect up to five tokenized tweets together with their category labels.
dls_classifier.show_batch(max_n=5)

Unnamed: 0,text,category
0,xxbos . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : . : xxup rt xxunk : # xxunk \n\n xxmaj indian xxmaj army xxunk _ http : / / t.co / xxunk g,0
1,xxbos i xxmaj hate xxmaj to xxmaj talking xxmaj xxunk xxmaj with xxmaj my xxmaj xxunk … i xxmaj mean i xxmaj love xxmaj her xxmaj as xxmaj to xxmaj death xxmaj but xxmaj she xxmaj talk xxmaj so xxmaj damn xxmaj much xxmaj xxunk xxrep 3 h xxrep 3 e xxunk xxrep 3 ! xxrep 6 ?,0
2,xxbos xxmaj no # news of # hostages in # xxmaj libya \n\n http : / / t.co / xxunk \n\n▁ # xxmaj india # terrorism # xxmaj africa # xxup ap # xxup ts # xxup nri # xxmaj news # xxup trs # xxup tdp # xxup bjp http : / / t.co / xxunk,1
3,xxbos xxmaj truth … \n https : / / t.co / xxunk \n▁ # xxmaj news \n▁ # xxup bbc \n▁ # xxup cnn \n▁ # xxmaj islam \n▁ # xxmaj truth \n▁ # god \n▁ # xxup isis \n▁ # terrorism \n▁ # xxmaj quran \n▁ # xxmaj lies http : / / t.co / xxunk,1
4,xxbos xxmaj truth … \n https : / / t.co / xxunk \n▁ # xxmaj news \n▁ # xxup bbc \n▁ # xxup cnn \n▁ # xxmaj islam \n▁ # xxmaj truth \n▁ # god \n▁ # xxup isis \n▁ # terrorism \n▁ # xxmaj quran \n▁ # xxmaj lies http : / / t.co / xxunk,0


## Text classifier learner

In [84]:
# Build the classifier learner on the same AWD_LSTM architecture, then load
# the encoder saved as 'finetuned' by the language-model section above.
# Note: this rebinds `learn`, replacing the language-model learner.
learn = text_classifier_learner(
    dls_classifier,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy
).to_fp16()
learn = learn.load_encoder('finetuned')



### Start with classification layer only

In [85]:
# One epoch at learning rate 2e-2 before any freeze_to()/unfreeze() call;
# per the section heading, only the classification layer is trained here.
learn.fit_one_cycle(1, 2e-2)



epoch,train_loss,valid_loss,accuracy,time
0,0.543062,0.431271,0.817136,01:57


### Release final 2 layers

In [87]:
# Per the section heading, freeze_to(-2) leaves the final two layer groups trainable.
# slice(low, high) gives a range of learning rates from 1e-2/(2.6**4) up to 1e-2;
# the 2.6**4 divisor presumably follows the ULMFiT/fastbook recipe — TODO confirm.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2))



epoch,train_loss,valid_loss,accuracy,time
0,0.518377,0.431878,0.826087,02:15


### Release final 3 layers

In [90]:
# Per the section heading, freeze_to(-3) leaves the final three layer groups trainable.
# Same learning-rate range pattern as the previous stage, with the upper bound halved to 5e-3.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3))



epoch,train_loss,valid_loss,accuracy,time
0,0.48048,0.415738,0.819693,03:54


### Release all layers

In [91]:
# Final stage: unfreeze everything and train for two epochs with the smallest
# learning-rate range of the schedule (upper bound 1e-3).
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4), 1e-3))



epoch,train_loss,valid_loss,accuracy,time
0,0.428776,0.406419,0.827366,05:40
1,0.392546,0.407488,0.83376,05:40


In [94]:
# Persist the trained classifier (the output shows models/classifier.pth).
learn.save('classifier')

Path('models/classifier.pth')

## Generate test predictions for Kaggle

In [93]:
# TODO: read the competition's test CSV from `dataset_path`, run the trained
# classifier on its `text` column, and write the predictions to a submission file.