<a href="https://colab.research.google.com/github/chw8207/fastai_pytorch/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece!=0.1.90,!=0.1.91

In [40]:
import tensorflow as tf
from fastai.text.all import *
import numpy as np
import pandas as pd
from IPython.display import display,HTML

### GPU 설정

In [2]:
# Ask TensorFlow for the GPU device name and stop early if it is not the
# expected first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [3]:
# List the local devices reported by TensorFlow's device_lib.
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11602011716167353954
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 40129593344
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2751458526582466392
physical_device_desc: "device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0"
xla_global_id: 416903419
]


### 텍스트 전처리

#### 단어 토큰화

In [4]:
# IMDb dataset
path = untar_data(URLs.IMDB)

In [5]:
# Get all text files under the path (train, test and unsup folders)
files = get_text_files(path, folders=['train','test','unsup'])

In [6]:
# Movie review to tokenize.
# read_text() opens, reads and closes the file in one call, so no file
# handle is left open (a bare open().read() never closes it explicitly).
txt = files[0].read_text()
txt

'I remember watching this when it was made and thinking it was brilliant at the time. Watching it for the second time nearly 20 years later, I still think Traffik is brilliant and much better than the US film that was based on this drama.<br /><br />It should also be watched by all our come today gone tomorrow politicians who think they can win the war on drugs, as the issues raised in this film are as pertinent today as they were back then at the end of the 80s, and unless they change their policies, will be so in another 20 years.<br /><br />Well written, well made, beautifully acted and superbly filmed. A thought provoking drama that entertains as well as brings to light some of the hard realities of the criminal drug trade.'

In [7]:
# Try out the tokenizer: tokenize the review and print the first 30 tokens
spacy = WordTokenizer()
toks = first(spacy([txt]))
print(coll_repr(toks, 30))

(#144) ['I','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','Watching','it','for','the','second','time','nearly','20','years','later',',','I','still'...]


In [8]:
# spaCy example
first(spacy(['The U.S. dollar #1 is $1.00.']))

(#9) ['The','U.S.','dollar','#','1','is','$','1.00','.']

##### 특수 토큰
- xxbos : 텍스트의 시작
- xxmaj : 다음 단어가 대문자로 시작함
- xxunk : 현재 단어가 목록에 없음

In [9]:
# Look at the additional functionality added by the Tokenizer wrapper
# xx~ : special tokens
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt),31))

(#152) ['xxbos','i','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','xxmaj','watching','it','for','the','second','time','nearly','20','years','later',',','i'...]


In [10]:
# Check which rules are applied
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [11]:
# See how the functions applied internally during tokenization behave
coll_repr(tkn('&copy; Fast.ai www.fast.ai/INDEX'),31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index']"

#### 부분 단어 토큰화

In [12]:
# Example corpus: the raw text of the first 2000 review files.
# read_text() closes each file after reading, so this does not leave
# 2000 file handles open the way open().read() would.
txts = L(o.read_text() for o in files[:2000])

In [13]:
# setup: a special fastai method that is called automatically in the data
# processing pipeline; here it is invoked by hand.
def subword(sz):
    """Run setup for a vocab of size `sz` and return a sample of the result.

    Builds a SubwordTokenizer with ``vocab_sz=sz``, calls ``setup`` on the
    notebook-level ``txts`` corpus, tokenizes the notebook-level ``txt``
    review and returns its first 40 tokens joined by single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [16]:
# Preview the subword tokenization with a vocab size of 1000
subword(1000)

'▁I ▁remember ▁watching ▁this ▁when ▁it ▁was ▁made ▁and ▁think ing ▁it ▁was ▁brilliant ▁at ▁the ▁time . ▁W at ch ing ▁it ▁for ▁the ▁second ▁time ▁near ly ▁2 0 ▁years ▁la ter , ▁I ▁still ▁think ▁T ra'

In [17]:
# Preview the subword tokenization with a vocab size of 200
subword(200)

'▁I ▁re m e m b er ▁w at ch ing ▁this ▁w h en ▁it ▁was ▁ma d e ▁and ▁ th in k ing ▁it ▁was ▁b ri ll i an t ▁a t ▁the ▁ t i'

In [18]:
# Preview the subword tokenization with a vocab size of 10000
subword(10000)

'▁I ▁remember ▁watching ▁this ▁when ▁it ▁was ▁made ▁and ▁thinking ▁it ▁was ▁brilliant ▁at ▁the ▁time . ▁Watch ing ▁it ▁for ▁the ▁second ▁time ▁near ly ▁20 ▁years ▁later , ▁I ▁still ▁think ▁ Traffik ▁is ▁brilliant ▁and ▁much ▁better'

#### 토큰 수치화

In [19]:
# Word tokenization: tokenize the review once and reuse the result for the
# preview instead of running the tokenizer a second time.
toks = tkn(txt)
print(coll_repr(toks, 31))

(#152) ['xxbos','i','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','xxmaj','watching','it','for','the','second','time','nearly','20','years','later',',','i'...]


In [21]:
# Use only a small subset of the dataset (the first 200 texts)
toks200 = txts[:200].map(tkn)
toks200[0]

(#152) ['xxbos','i','remember','watching','this','when','it','was','made','and'...]

In [22]:
# Build a tokenized corpus and pass it to the setup method
# Numericalize defaults: min_freq=3, max_vocab=60000
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#1888) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the',',','.','and','a','of','to','is','in','it','i'...]"

In [23]:
# Use the Numericalize object like a function; keep the first 20 ids
nums = num(toks)[:20]
nums

TensorText([   2,   19,  280,  189,   20,   64,   18,   24,  118,   12, 1072,
              18,   24,  281,   44,    9,   65,   11,    8,  189])

In [24]:
# Convert the ids back to text to verify the mapping was done correctly
' '.join(num.vocab[o] for o in nums)

'xxbos i remember watching this when it was made and thinking it was brilliant at the time . xxmaj watching'

#### 언어 모델을 위한 배치 형태의 텍스트 만들기

In [41]:
# Not in the book
# One batch is made up of 6 text arrays of length 15.
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 \
and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers \
and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen \
we will study how we build a language model and train it for a while."
tokens = tkn(stream)
bs, seq_len = 6, 15
# Row i holds tokens [i*seq_len, (i+1)*seq_len) of the token stream
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,xxmaj,in,this,chapter,",",we,will,go,back,over,the,example,of,classifying
movie,reviews,we,studied,in,chapter,1,and,dig,deeper,under,the,surface,.,xxmaj
first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into,numbers,and
how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have,another,example
of,the,preprocessor,used,in,the,data,block,xxup,api,.,\n,xxmaj,then,we
will,study,how,we,build,a,language,model,and,train,it,for,a,while,.


In [45]:
# Not in the book
# One batch made up of 6 text arrays of length 5 (first batch)
bs, seq_len = 6, 5
# 15 is the stride between row starts; take the first seq_len tokens of each row
d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4
xxbos,xxmaj,in,this,chapter
movie,reviews,we,studied,in
first,we,will,look,at
how,to,customize,it,.
of,the,preprocessor,used,in
will,study,how,we,build


In [46]:
# Second batch: the next seq_len tokens of each 15-token row
bs, seq_len = 6, 5
d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4
",",we,will,go,back
chapter,1,and,dig,deeper
the,processing,steps,necessary,to
xxmaj,by,doing,this,","
the,data,block,xxup,api
a,language,model,and,train


In [47]:
# Last batch: the third seq_len-wide slice of each 15-token row.
# The offsets are written in terms of seq_len (2*seq_len .. 3*seq_len, i.e.
# 10 .. 15 here) so the slice follows seq_len like the first- and
# second-batch cells do, rather than relying on hard-coded 10 and 15.
bs, seq_len = 6, 5
d_tokens = np.array([tokens[i*15+2*seq_len:i*15+3*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))

0,1,2,3,4
over,the,example,of,classifying
under,the,surface,.,xxmaj
convert,text,into,numbers,and
we,'ll,have,another,example
.,\n,xxmaj,then,we
it,for,a,while,.


In [25]:
# Apply the Numericalize object to the tokenized texts
nums200 = toks200.map(num)

In [26]:
# Create an LMDataLoader object
dl = LMDataLoader(nums200)

In [27]:
# Inspect the first batch to check that we got the expected result
x,y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [29]:
# Print the first row of the independent variable (first 20 tokens)
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos i remember watching this when it was made and thinking it was brilliant at the time . xxmaj watching'

In [30]:
# Dependent variable: the same text shifted by one token
' '.join(num.vocab[o] for o in y[0][:20])

'i remember watching this when it was made and thinking it was brilliant at the time . xxmaj watching it'

### 텍스트 분류기의 학습

#### 데이터 블록을 사용한 언어 모델

In [31]:
# get_text_files pre-configured to read the train, test and unsup folders
get_imdb = partial(get_text_files, folders=['train','test','unsup'])

In [32]:
# Language-model DataLoaders built with the data block API: a TextBlock
# created from the folder with is_lm=True, items collected by get_imdb,
# and a RandomSplitter(0.1) (presumably a 10% validation split; confirm
# against the fastai docs). Batch size is 128 and sequence length is 80.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

In [33]:
# Display up to two sample rows from a batch
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj this is , without a doubt , the worst movie i have ever seen in my entire life . xxmaj first of all , the title is all wrong . xxmaj the main character is not a nymphoid because , well , she 's just not . xxmaj she is not really a barbarian . xxmaj she is actually pretty much a wimp . xxmaj the dinosaurs are n't actually dinosaurs . xxmaj they 're mutated monsters .","xxmaj this is , without a doubt , the worst movie i have ever seen in my entire life . xxmaj first of all , the title is all wrong . xxmaj the main character is not a nymphoid because , well , she 's just not . xxmaj she is not really a barbarian . xxmaj she is actually pretty much a wimp . xxmaj the dinosaurs are n't actually dinosaurs . xxmaj they 're mutated monsters . xxmaj"
1,"things become stranger yet . \n\n "" a xxmaj night to xxmaj remember "" moves quickly enough , and it 's delightful , but probably a little miscast . xxmaj someone a little wackier than the stunning xxmaj young might have a better choice for the wife . xxmaj aherne , a very good actor , is n't quite at home in this milieu either . "" a xxmaj night to xxmaj remember "" resembles "" footsteps in the xxmaj","become stranger yet . \n\n "" a xxmaj night to xxmaj remember "" moves quickly enough , and it 's delightful , but probably a little miscast . xxmaj someone a little wackier than the stunning xxmaj young might have a better choice for the wife . xxmaj aherne , a very good actor , is n't quite at home in this milieu either . "" a xxmaj night to xxmaj remember "" resembles "" footsteps in the xxmaj dark"


#### 언어 모델 미세 조정

In [49]:
# Language-model learner on dls_lm using the AWD_LSTM architecture with
# drop_mult=0.3, reporting accuracy and perplexity; to_fp16() is applied
# to the resulting learner.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3,
    metrics=[accuracy, Perplexity()]).to_fp16()

In [50]:
# fit_one_cycle: allows saving intermediate results
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.99626,3.903087,0.30034,49.55521,19:35


#### 모델을 저장하고 불러오기

In [51]:
# Save the model's state
learn.save('1epoch')

Path('/root/.fastai/data/imdb/models/1epoch.pth')

In [52]:
# Load the saved model
learn = learn.load('1epoch')