<a href="https://colab.research.google.com/github/chw8207/fastai_pytorch/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece!=0.1.90,!=0.1.91

In [1]:
import tensorflow as tf
from fastai.text.all import *

### GPU 설정

In [2]:
# Ask TensorFlow for the GPU device name and stop the notebook early
# unless it is exactly the first GPU.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found GPU at: {device_name}')
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [3]:
from tensorflow.python.client import device_lib

# Print the list of local devices that TensorFlow reports
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6867392105482851747
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 40129593344
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17405535763795452091
physical_device_desc: "device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0"
xla_global_id: 416903419
]


### 텍스트 전처리

#### 단어 토큰화

In [4]:
# IMDb dataset; `path` is reused below to locate the review files
path = untar_data(URLs.IMDB)

In [5]:
# Get all text files under `path`, limited to the train, test and unsup folders
files = get_text_files(path, folders=['train','test','unsup'])

In [11]:
# Movie review to tokenize.
# read_text() opens, reads and closes the file in one call, instead of
# leaving the handle from a bare open().read() for the garbage collector.
txt = files[0].read_text()
txt

'I remember watching this when it was made and thinking it was brilliant at the time. Watching it for the second time nearly 20 years later, I still think Traffik is brilliant and much better than the US film that was based on this drama.<br /><br />It should also be watched by all our come today gone tomorrow politicians who think they can win the war on drugs, as the issues raised in this film are as pertinent today as they were back then at the end of the 80s, and unless they change their policies, will be so in another 20 years.<br /><br />Well written, well made, beautifully acted and superbly filmed. A thought provoking drama that entertains as well as brings to light some of the hard realities of the criminal drug trade.'

In [13]:
# Try out the word tokenizer
spacy = WordTokenizer()
# The tokenizer is called on a list of texts; first() picks out the result
# for the single review passed in
toks = first(spacy([txt]))
# Show the token count and the first 30 tokens
print(coll_repr(toks, 30))

(#144) ['I','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','Watching','it','for','the','second','time','nearly','20','years','later',',','I','still'...]


In [14]:
# spaCy example: a sentence with an abbreviation, '#', '$' and a decimal number
first(spacy(['The U.S. dollar #1 is $1.00.']))

(#9) ['The','U.S.','dollar','#','1','is','$','1.00','.']

##### 특수 토큰
- xxbos : 텍스트의 시작
- xxmaj : 다음 단어가 대문자로 시작함
- xxunk : 현재 단어가 목록에 없음

In [15]:
# Look at the extra functionality added by wrapping the word tokenizer in Tokenizer
# Tokens starting with "xx" are special tokens
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt),31))

(#152) ['xxbos','i','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','xxmaj','watching','it','for','the','second','time','nearly','20','years','later',',','i'...]


In [16]:
# Check which text-processing rules are applied
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [18]:
# See how the functions applied internally during tokenization behave
coll_repr(tkn('&copy; Fast.ai www.fast.ai/INDEX'),31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index']"

#### 부분 단어 토큰화

In [20]:
# Look at an example: load the first 2000 review files as sample texts.
# read_text() closes each file as soon as it has been read, rather than
# relying on the garbage collector to close 2000 handles from open().read().
txts = L(o.read_text() for o in files[:2000])

In [23]:
# setup: a special fastai method that is called automatically inside
# data-processing pipelines; here it is invoked by hand.
def subword(sz):
    """Run subword-tokenizer setup for vocab size `sz` and preview the result.

    Builds a `SubwordTokenizer` with `vocab_sz=sz`, runs its setup on the
    notebook-level `txts`, tokenizes the notebook-level `txt`, and returns
    the first 40 resulting tokens joined by single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    tokens = first(tokenizer([txt]))
    return ' '.join(tokens[:40])

In [26]:
# Vocab size 1000: some words stay whole while others are split into several pieces
subword(1000)

'▁I ▁remember ▁watching ▁this ▁when ▁it ▁was ▁made ▁and ▁think ing ▁it ▁was ▁brilliant ▁at ▁the ▁time . ▁W at ch ing ▁it ▁for ▁the ▁second ▁time ▁near ly ▁2 0 ▁years ▁la ter , ▁I ▁still ▁think ▁T ra'

In [27]:
# Vocab size 200: most words are broken into short pieces, often single characters
subword(200)

'▁I ▁re m e m b er ▁w at ch ing ▁this ▁w h en ▁it ▁was ▁ma d e ▁and ▁ th in k ing ▁it ▁was ▁b ri ll i an t ▁a t ▁the ▁ t i'

In [28]:
# Vocab size 10000: most words come out as a single token
subword(10000)

'▁I ▁remember ▁watching ▁this ▁when ▁it ▁was ▁made ▁and ▁thinking ▁it ▁was ▁brilliant ▁at ▁the ▁time . ▁Watch ing ▁it ▁for ▁the ▁second ▁time ▁near ly ▁20 ▁years ▁later , ▁I ▁still ▁think ▁ Traffik ▁is ▁brilliant ▁and ▁much ▁better'

#### 토큰 수치화

In [29]:
# Word tokenization
toks = tkn(txt)
# Reuse the tokens computed above instead of tokenizing the review a second time
print(coll_repr(toks, 31))

(#152) ['xxbos','i','remember','watching','this','when','it','was','made','and','thinking','it','was','brilliant','at','the','time','.','xxmaj','watching','it','for','the','second','time','nearly','20','years','later',',','i'...]
