In [1]:
#hide
# Disabled bootstrap line: when /content exists it would pip-install fastbook.
# ! [ -e /content ] && pip install -Uqq fastbook
import fastbook
# Notebook setup helper from fastbook — NOTE(review): its exact effects are not visible in this notebook.
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from IPython.display import display,HTML

# NLP Deep Dive: RNNs

## Text Preprocessing

### Tokenization

### Word Tokenization with fastai

In [3]:
from fastai.text.all import *
# Fetch the dataset referenced by `URLs.IMDB`; `path` is its local root
# (the recorded outputs below show files under .fastai/data/imdb).
path = untar_data(URLs.IMDB)

In [4]:
# Gather the review text files from the train, test and unsup folders under `path`.
files = get_text_files(path, folders=['train', 'test', 'unsup'])

In [5]:
# Preview the collected file list (100,000 paths in the recorded output).
files

(#100000) [Path('/root/.fastai/data/imdb/test/neg/6201_1.txt'),Path('/root/.fastai/data/imdb/test/neg/11881_2.txt'),Path('/root/.fastai/data/imdb/test/neg/11690_1.txt'),Path('/root/.fastai/data/imdb/test/neg/7300_1.txt'),Path('/root/.fastai/data/imdb/test/neg/2931_1.txt'),Path('/root/.fastai/data/imdb/test/neg/11165_4.txt'),Path('/root/.fastai/data/imdb/test/neg/1026_2.txt'),Path('/root/.fastai/data/imdb/test/neg/11499_3.txt'),Path('/root/.fastai/data/imdb/test/neg/3201_1.txt'),Path('/root/.fastai/data/imdb/test/neg/429_3.txt')...]

In [7]:
# Read the first review. `read_text` opens, reads and closes the file, so no
# handle is left open for the garbage collector to clean up.
txt = files[0].read_text()
# Show the first 75 characters as a quick sanity check.
txt[:75]

"I didn't know what to expect when I started watching this movie, by the end"

In [8]:
# Build the word tokenizer; the name `spacy` is reused by later cells.
spacy = WordTokenizer()
# The tokenizer is called on a collection of texts, so wrap `txt` in a list and keep the first result.
toks = first(spacy([txt]))
# Print a truncated view of the tokens (30 shown in the recorded output).
print(coll_repr(toks, 30))

(#212) ['I','did',"n't",'know','what','to','expect','when','I','started','watching','this','movie',',','by','the','end','of','it','I','was','pulling','my','hairs','out','.','This','was','one','of'...]


In [9]:
# Probe how the word tokenizer splits an abbreviation, digits glued to letters, and a decimal number.
first(spacy(['The U.S. dollar 1is1 is 1.00.']))

(#7) ['The','U.S.','dollar','1is1','is','1.00','.']

In [11]:
# Wrap the word tokenizer in `Tokenizer`; the recorded output shows the extra
# special tokens (e.g. xxbos, xxmaj) and lowercasing it produces.
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt), 31))

(#231) ['xxbos','i','did',"n't",'know','what','to','expect','when','i','started','watching','this','movie',',','by','the','end','of','it','i','was','pulling','my','hairs','out','.','xxmaj','this','was','one'...]


In [12]:
# Same probe sentence as used with the raw word tokenizer, now through `tkn`, for comparison.
print(coll_repr(tkn('The U.S. dollar 1is1 is 1.00.'), 31))

(#11) ['xxbos','xxmaj','the','xxup','u.s','.','dollar','1is1','is','1.00','.']


In [13]:
# List the text processing rules stored on fastai's `defaults` object.
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [14]:
# Exercise several rules at once: an HTML entity, repeated spaces, mixed case,
# a repeated character ("www") and an all-caps word.
coll_repr(tkn('&copy;   Fast.ai www.fast.ai/INDEX'), 31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index']"

### Subword Tokenization

In [15]:
# Load the first 2,000 reviews as raw strings. `Path.read_text` opens, reads and
# closes each file, so no file handles are left open.
# `L` behaves like a list of items but can also be indexed with a list of
# indices or a mask (i.e. it can be treated as an indexable array).
txts = L(o.read_text() for o in files[:2000])

In [16]:
# Look at the first raw review in full.
txts[0]

"I didn't know what to expect when I started watching this movie, by the end of it I was pulling my hairs out. This was one of the most pathetic movies of this year...in fact, in the last ten years. David Dhawan should just give up his career as a director. I am yet to come across one original script that David Dhawan has worked on. This one was a complete bit y bit rip off Hitch. I have nothing against remakes as such, but this one is just so lousy that it makes you even hate the original one (which was pretty decent). I fail to understand what actors like Salman and Govinda saw in this script. I read somewhere, that this was supposed to be Govinda's comeback vehicle. If thats true, then only God can save his career. Salman just overacted to the hilt. Govinda who I think is an actor of very high caliber was completely wasted. Katrina Kaif and LAra Dutta had nothing to do apart form wearing designer clothes and smiling for no rhyme or reason. Please stay away form this one!"

In [17]:
def subword(sz):
    "Set up a `SubwordTokenizer` with vocab size `sz` on `txts`; return the first 40 subword tokens of `txt` joined by spaces."
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    # The tokenizer is called on a collection of texts; keep the result for the single sample.
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [18]:
# Subword tokenization of `txt` with a vocab size of 1,000.
subword(1000)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=tmp/texts.out --vocab_size=1000 --model_prefix=tmp/spm --character_coverage=0.99999 --model_type=unigram --unk_id=9 --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2 --user_defined_symbols=▁xxunk,▁xxpad,▁xxbos,▁xxeos,▁xxfld,▁xxrep,▁xxwrep,▁xxup,▁xxmaj --hard_vocab_limit=false


"▁I ▁didn ' t ▁know ▁what ▁to ▁expect ▁when ▁I ▁start ed ▁watching ▁this ▁movie , ▁by ▁the ▁end ▁of ▁it ▁I ▁was ▁p u ll ing ▁my ▁ ha ir s ▁out . ▁This ▁was ▁one ▁of ▁the ▁most"

In [19]:
# A much smaller vocab (200): the recorded output falls back to very short pieces.
subword(200)

"▁I ▁di d n ' t ▁ k n o w ▁w h at ▁to ▁ e x p e c t ▁w h en ▁I ▁st ar t ed ▁w at ch ing ▁this ▁movie , ▁ b y"

In [20]:
# A larger vocab (10,000): the recorded output keeps most words as single pieces.
subword(10000)

"▁I ▁didn ' t ▁know ▁what ▁to ▁expect ▁when ▁I ▁started ▁watching ▁this ▁movie , ▁by ▁the ▁end ▁of ▁it ▁I ▁was ▁pull ing ▁my ▁hair s ▁out . ▁This ▁was ▁one ▁of ▁the ▁most ▁pathetic ▁movies ▁of ▁this ▁year"

### Numericalization with fastai

In [21]:
# Tokenize the sample review once and reuse the result for the preview,
# rather than running the tokenizer a second time just to print it.
toks = tkn(txt)
print(coll_repr(toks, 31))

(#231) ['xxbos','i','did',"n't",'know','what','to','expect','when','i','started','watching','this','movie',',','by','the','end','of','it','i','was','pulling','my','hairs','out','.','xxmaj','this','was','one'...]


对`txts`中的前200条文本进行处理：使用`tkn`这个tokenizer对其中每一条文本分别进行tokenize。

In [22]:
# Tokenize the first 200 loaded reviews with `tkn`, then preview the first result.
toks200 = txts[:200].map(tkn)
toks200[0]

(#231) ['xxbos','i','did',"n't",'know','what','to','expect','when','i'...]

In [23]:
# Display the nested structure: one token list per review.
# NOTE(review): the resulting repr is very long; consider previewing a slice instead.
toks200

(#200) [['xxbos', 'i', 'did', "n't", 'know', 'what', 'to', 'expect', 'when', 'i', 'started', 'watching', 'this', 'movie', ',', 'by', 'the', 'end', 'of', 'it', 'i', 'was', 'pulling', 'my', 'hairs', 'out', '.', 'xxmaj', 'this', 'was', 'one', 'of', 'the', 'most', 'pathetic', 'movies', 'of', 'this', 'year', '…', 'in', 'fact', ',', 'in', 'the', 'last', 'ten', 'years', '.', 'xxmaj', 'david', 'xxmaj', 'dhawan', 'should', 'just', 'give', 'up', 'his', 'career', 'as', 'a', 'director', '.', 'i', 'am', 'yet', 'to', 'come', 'across', 'one', 'original', 'script', 'that', 'xxmaj', 'david', 'xxmaj', 'dhawan', 'has', 'worked', 'on', '.', 'xxmaj', 'this', 'one', 'was', 'a', 'complete', 'bit', 'y', 'bit', 'rip', 'off', 'xxmaj', 'hitch', '.', 'i', 'have', 'nothing', 'against', 'remakes', 'as', 'such', ',', 'but', 'this', 'one', 'is', 'just', 'so', 'lousy', 'that', 'it', 'makes', 'you', 'even', 'hate', 'the', 'original', 'one', '(', 'which', 'was', 'pretty', 'decent', ')', '.', 'i', 'fail', 'to', 'understa

关于Numericalize的setup函数：
在代码中是这样计数的：
```Python
count = dsets.counter if getattr(dsets, 'counter', None) is not None else Counter(p for o in dsets for p in o)
```
`Counter`接收的是把`dsets`展平一层之后的元素，因此传入的`dsets`必须是一个二维列表（列表内的每个元素也是列表）。如果传入的是`toks200[0]`，那么`dsets`是一维的单词列表，`o`是单词，`p`是单词中的每个字符（a、b、c、d……），所以输出的词表中全是单字母形式；如果传入的是`toks200`，那么`dsets`是二维列表，`o`就是每个`toks200[i]`列表，`p`才是单词。

In [24]:
# Create the numericalizer and set it up on the 200 tokenized reviews (a list of token lists).
num = Numericalize()
num.setup(toks200)
# Preview the first 20 vocab entries.
coll_repr(num.vocab,20)

"(#1880) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','of','and','to','is','it','i','this'...]"

In [25]:
# Map the sample review's tokens to vocab indices and keep the first 20.
nums = num(toks)[:20]
nums

TensorText([  2,  18,  91,  33, 144,  62,  15, 452,  72,  18, 925, 151,  19,  27,  11,  68,   9, 135,  13,  17])

In [26]:
# Map the indices back through the vocab to check they decode to the original tokens.
' '.join(num.vocab[o] for o in nums)

"xxbos i did n't know what to expect when i started watching this movie , by the end of it"

### Putting Our Texts into Batches for a Language Model

In [29]:
# Short sample text used to illustrate how a token stream is laid out in batches.
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while."
tokens = tkn(stream)
bs,seq_len = 6,15
# Cut the stream into `bs` consecutive rows of `seq_len` tokens each.
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
# Render the rows as an HTML table without index or header.
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,xxmaj,in,this,chapter,",",we,will,go,back,over,the,example,of,classifying
movie,reviews,we,studied,in,chapter,1,and,dig,deeper,under,the,surface,.,xxmaj
first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into,numbers,and
how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have,another,example
of,the,preprocessor,used,in,the,data,block,xxup,api,.,\n,xxmaj,then,we
will,study,how,we,build,a,language,model,and,train,it,for,a,while,.


In [30]:
bs,seq_len = 6,5
# 15 is the row length of the 6x15 table built from `tokens` earlier;
# take the first `seq_len` tokens of each of those rows.
d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
xxbos,xxmaj,in,this,chapter
movie,reviews,we,studied,in
first,we,will,look,at
how,to,customize,it,.
of,the,preprocessor,used,in
will,study,how,we,build


In [31]:
bs,seq_len = 6,5
# Second `seq_len`-wide slice (tokens 5-9) of each 15-token row.
d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
",",we,will,go,back
chapter,1,and,dig,deeper
the,processing,steps,necessary,to
xxmaj,by,doing,this,","
the,data,block,xxup,api
a,language,model,and,train


In [32]:
bs,seq_len = 6,5
# Third `seq_len`-wide slice (tokens 10-14) of each 15-token row. The bounds are
# written in terms of `seq_len` (2*seq_len = 10, 3*seq_len = 15) so that the
# variable assigned above is actually used and the slice follows the same
# pattern as the other per-row slices in this notebook.
d_tokens = np.array([tokens[i*15+2*seq_len:i*15+3*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
over,the,example,of,classifying
under,the,surface,.,xxmaj
convert,text,into,numbers,and
we,'ll,have,another,example
.,\n,xxmaj,then,we
it,for,a,while,.


In [33]:
# Numericalize each of the 200 tokenized reviews.
nums200 = toks200.map(num)

In [34]:
# Number of reviews, and the token count of the first one.
len(nums200),len(nums200[0])

(200, 231)

In [35]:
# Language-model data loader over the numericalized reviews; no options are passed, so its defaults apply.
dl = LMDataLoader(nums200)

In [36]:
# Number of documents fed to the loader.
len(nums200)

200

In [37]:
# Total number of tokens across all numericalized reviews.
total_lens = sum(len(doc) for doc in nums200)
print(total_lens)

57242


In [38]:
# Number of batches the loader reports.
len(dl)

13

In [39]:
# batches (len(dl) = 13) x rows per batch (64) x tokens per row (72), per the recorded len(dl) and batch shape.
# NOTE(review): this product exceeds the token total printed above, so presumably
# the final batch is not full — TODO confirm against LMDataLoader.
13*64*72

59904

In [40]:
# Grab the first batch (inputs `x`, targets `y`) and inspect the shapes.
x,y = first(dl)
x.shape,y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [41]:
# Decode the first 20 input tokens of the first row.
' '.join(num.vocab[o] for o in x[0][:20])

"xxbos i did n't know what to expect when i started watching this movie , by the end of it"

In [42]:
# Decode the first 20 target tokens of the same row; per the recorded outputs
# they are the inputs shifted by one token.
' '.join(num.vocab[o] for o in y[0][:20])

"i did n't know what to expect when i started watching this movie , by the end of it i"

## Training a Text Classifier

### Language Model Using DataBlock

In [43]:
# File getter restricted to the train, test and unsup folders.
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

# Language-model DataBlock: a text block built with `is_lm=True`, items from `get_imdb`,
# and a random split with 0.1 passed to the splitter (presumably the validation
# fraction — TODO confirm). Batches use bs=128 and seq_len=80.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

In [44]:
# Show two sample rows (the `text` and `text_` columns in the recorded output).
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj the xxmaj scottish artist xxmaj andy xxmaj goldsworthy fashions natural materials into ephemeral artworks , assembling rocks into egg - shaped cairns , filling riverside rock - pools with fiery flowers and stitching thorns and twigs into intricate web patterns . xxmaj an original work and a few photographs of his other creations are tucked away in a corner of xxmaj southampton art gallery ( near where i live ) , but although i found these pieces intriguing","xxmaj the xxmaj scottish artist xxmaj andy xxmaj goldsworthy fashions natural materials into ephemeral artworks , assembling rocks into egg - shaped cairns , filling riverside rock - pools with fiery flowers and stitching thorns and twigs into intricate web patterns . xxmaj an original work and a few photographs of his other creations are tucked away in a corner of xxmaj southampton art gallery ( near where i live ) , but although i found these pieces intriguing ,"
1,""" part of the movie . xxmaj dwight xxmaj yoakum 's performance is also good but xxmaj levon steals the movie . \n\n xxup big xxmaj problem : xxmaj tommy xxmaj lee xxmaj jones has no range . xxmaj he has an unsympathetic constant deadpan delivery . xxmaj should n't be allowed to carry a movie . \n\n xxmaj wait for it to come out on video . xxmaj finally , for those who hype the "" new xxmaj western","part of the movie . xxmaj dwight xxmaj yoakum 's performance is also good but xxmaj levon steals the movie . \n\n xxup big xxmaj problem : xxmaj tommy xxmaj lee xxmaj jones has no range . xxmaj he has an unsympathetic constant deadpan delivery . xxmaj should n't be allowed to carry a movie . \n\n xxmaj wait for it to come out on video . xxmaj finally , for those who hype the "" new xxmaj western """


### Fine-Tuning the Language Model

In [45]:
# Language-model learner on `dls_lm` with the AWD_LSTM architecture, reporting
# accuracy and perplexity, then switched to fp16 via `to_fp16()`.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
).to_fp16()

In [46]:
# One epoch of one-cycle training with 2e-2 as the learning-rate argument.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.001635,3.898766,0.30149,49.341503,08:33


### Saving and Loading Models

In [47]:
# Save a checkpoint named '1epoch'; the recorded output shows it written to models/1epoch.pth under the data path.
learn.save('1epoch')

Path('/root/.fastai/data/imdb/models/1epoch.pth')

In [48]:
# Reload the checkpoint saved as '1epoch'.
learn = learn.load('1epoch')

In [49]:
# Unfreeze the learner, then train 10 more epochs with a smaller learning-rate argument (2e-3).
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.755517,3.756951,0.317714,42.817669,08:50
1,3.695737,3.698826,0.32438,40.39983,09:02
2,3.644548,3.648346,0.329566,38.411079,08:55
3,3.566868,3.615654,0.333846,37.175632,08:48
4,3.489898,3.597493,0.336219,36.506603,08:47
5,3.428813,3.575994,0.339054,35.730118,08:43
6,3.365512,3.568871,0.340482,35.476509,08:43
7,3.299841,3.565679,0.341508,35.363453,08:47
8,3.250631,3.568701,0.341841,35.470497,08:44
9,3.22053,3.573685,0.341573,35.647701,08:50


In [52]:
# Save the encoder under the name 'finetuned'; the classifier learner later loads it by that name.
learn.save_encoder('finetuned')

In [53]:
# Show the learner's `path` attribute.
learn.path

Path('/root/.fastai/data/imdb')

### Text Generation

In [54]:
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2
# Ask the learner for N_SENTENCES separate continuations of TEXT,
# passing N_WORDS and temperature=0.75 to each `predict` call.
preds = []
for _ in range(N_SENTENCES):
    preds.append(learn.predict(TEXT, N_WORDS, temperature=0.75))

In [55]:
# Print one generated text per line.
print("\n".join(preds))

i liked this movie because it was a " b " movie . If you like some of the Hollywood " hollywood " approach to film - making , then you will hate this . If you liked Cabin Fever
i liked this movie because it taught me a lot about love and a happy life . i liked it at the same time as it was one of my favourites . a classic for all ages . The only thing that please make


### Creating the Classifier DataLoaders

In [56]:
# Classifier DataBlock: text inputs built with the language model's vocab
# (`dls_lm.vocab`) and a category target taken from the parent folder name.
# Only the train and test folders are used; items under 'test' form the validation set.
dls_clas = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab),CategoryBlock),
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test')
).dataloaders(path, path=path, bs=128, seq_len=72)

In [57]:
# Show three sample reviews with their category labels.
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos * * attention xxmaj spoilers * * \n\n xxmaj first of all , let me say that xxmaj rob xxmaj roy is one of the best films of the 90 's . xxmaj it was an amazing achievement for all those involved , especially the acting of xxmaj liam xxmaj neeson , xxmaj jessica xxmaj lange , xxmaj john xxmaj hurt , xxmaj brian xxmaj cox , and xxmaj tim xxmaj roth . xxmaj michael xxmaj canton xxmaj jones painted a wonderful portrait of the honor and dishonor that men can represent in themselves . xxmaj but alas … \n\n it constantly , and unfairly gets compared to "" braveheart "" . xxmaj these are two entirely different films , probably only similar in the fact that they are both about xxmaj scots in historical xxmaj scotland . xxmaj yet , this comparison frequently bothers me because it seems",pos
2,"xxbos xxmaj by now you 've probably heard a bit about the new xxmaj disney dub of xxmaj miyazaki 's classic film , xxmaj laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky . xxmaj during late summer of 1998 , xxmaj disney released "" kiki 's xxmaj delivery xxmaj service "" on video which included a preview of the xxmaj laputa dub saying it was due out in "" 1 xxrep 3 9 "" . xxmaj it 's obviously way past that year now , but the dub has been finally completed . xxmaj and it 's not "" laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky "" , just "" castle xxmaj in xxmaj the xxmaj sky "" for the dub , since xxmaj laputa is not such a nice word in xxmaj spanish ( even though they use the word xxmaj laputa many times",pos


In [58]:
# Numericalize the first 10 tokenized reviews as a small sample.
nums_samp = toks200[:10].map(num)

In [59]:
# Token count of each sample; the recorded output shows the lengths differ from review to review.
nums_samp.map(len)

(#10) [231,260,148,335,137,363,155,299,145,60]

In [60]:
# Text-classifier learner on `dls_clas` with the AWD_LSTM architecture,
# reporting accuracy, then switched to fp16 via `to_fp16()`.
learn = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy,
).to_fp16()

In [61]:
# Load the encoder saved earlier under the name 'finetuned' into the classifier.
learn = learn.load_encoder('finetuned')

### Fine-Tuning the Classifier

In [62]:
# First classifier training pass: one epoch with 2e-2 as the learning-rate argument.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.246149,0.17819,0.93072,00:14


In [63]:
# Adjust which parameter groups are frozen via `freeze_to(-2)`
# (presumably leaving the last two groups trainable — TODO confirm).
learn.freeze_to(-2)
# One epoch with a learning-rate slice whose lower bound is the upper bound (1e-2) divided by 2.6**4.
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.221013,0.163485,0.9378,00:16


In [64]:
# Adjust which parameter groups are frozen via `freeze_to(-3)`
# (presumably leaving the last three groups trainable — TODO confirm).
learn.freeze_to(-3)
# One epoch with a learning-rate slice whose lower bound is the upper bound (5e-3) divided by 2.6**4.
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.190483,0.148618,0.94408,00:19


In [65]:
# Unfreeze the learner and train two epochs with a learning-rate slice
# whose lower bound is the upper bound (1e-3) divided by 2.6**4.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.159024,0.148037,0.94524,00:24
1,0.148517,0.14963,0.94472,00:23


## Disinformation and Language Models

## Conclusion

## Questionnaire

1. What is "self-supervised learning"?
1. What is a "language model"?
1. Why is a language model considered self-supervised?
1. What are self-supervised models usually used for?
1. Why do we fine-tune language models?
1. What are the three steps to create a state-of-the-art text classifier?
1. How do the 50,000 unlabeled movie reviews help us create a better text classifier for the IMDb dataset?
1. What are the three steps to prepare your data for a language model?
1. What is "tokenization"? Why do we need it?
1. Name three different approaches to tokenization.
1. What is `xxbos`?
1. List four rules that fastai applies to text during tokenization.
1. Why are repeated characters replaced with a token showing the number of repetitions and the character that's repeated?
1. What is "numericalization"?
1. Why might there be words that are replaced with the "unknown word" token?
1. With a batch size of 64, the first row of the tensor representing the first batch contains the first 64 tokens for the dataset. What does the second row of that tensor contain? What does the first row of the second batch contain? (Careful—students often get this one wrong! Be sure to check your answer on the book's website.)
1. Why do we need padding for text classification? Why don't we need it for language modeling?
1. What does an embedding matrix for NLP contain? What is its shape?
1. What is "perplexity"?
1. Why do we have to pass the vocabulary of the language model to the classifier data block?
1. What is "gradual unfreezing"?
1. Why is text generation always likely to be ahead of automatic identification of machine-generated texts?

### Further Research

1. See what you can learn about language models and disinformation. What are the best language models today? Take a look at some of their outputs. Do you find them convincing? How could a bad actor best use such a model to create conflict and uncertainty?
1. Given the limitation that models are unlikely to be able to consistently recognize machine-generated texts, what other approaches may be needed to handle large-scale disinformation campaigns that leverage deep learning?