# 0. Install

In [4]:
## Colab
! curl -s https://course.fast.ai/setup/colab | bash


Updating fastai...
Done.


In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# 1. Import

In [0]:
from fastai import *
from fastai.text import *
import sklearn.feature_extraction.text as sklearn_text

# 2. Dataset

In [10]:
path = untar_data(URLs.IMDB_SAMPLE)
path

PosixPath('/content/data/imdb_sample')

In [11]:
path.ls()

[PosixPath('/content/data/imdb_sample/texts.csv')]

In [84]:
df = pd.read_csv(path/'texts.csv')
df.shape

(1000, 3)

In [85]:
df.head()

Unnamed: 0,label,text,is_valid
0,negative,Un-bleeping-believable! Meg Ryan doesn't even ...,False
1,positive,This is a extremely well-made film. The acting...,False
2,negative,Every once in a long while a movie will come a...,False
3,positive,Name just says it all. I watched this movie wi...,False
4,negative,This movie succeeds at being one of the most u...,False


# 3. Preprocessing

In [0]:
# Assemble the labelled train/validation dataset with the fastai data block API.
movie_reviews = (
    TextList.from_csv(path, 'texts.csv', cols='text')  # review text is read from the 'text' column
    .split_from_df(col=2)   # column index 2 ('is_valid' in the df.head() output) drives the split
    .label_from_df(cols=0)  # column index 0 ('label' in the df.head() output) supplies the labels
)

In [17]:
movie_reviews.valid.y[0], movie_reviews.valid.x[0]

(Category positive,
 Text xxbos xxmaj this very funny xxmaj british comedy shows what might happen if a section of xxmaj london , in this case xxmaj xxunk , were to xxunk itself independent from the rest of the xxup uk and its laws , xxunk & post - war xxunk . xxmaj merry xxunk is what would happen . 
  
   xxmaj the explosion of a wartime bomb leads to the xxunk of ancient xxunk which show that xxmaj xxunk was xxunk to the xxmaj xxunk of xxmaj xxunk xxunk ago , a small historical xxunk long since forgotten . xxmaj to the new xxmaj xxunk , however , this is an unexpected opportunity to live as they please , free from any xxunk from xxmaj xxunk . 
  
   xxmaj stanley xxmaj xxunk is excellent as the minor city xxunk who suddenly finds himself leading one of the world 's xxunk xxunk . xxmaj xxunk xxmaj margaret xxmaj xxunk is a delight as the history professor who sides with xxmaj xxunk . xxmaj others in the stand - out cast include xxmaj xxunk xxmaj xxunk , xxmaj paul xxmaj xxunk , xxmaj

In [19]:
len(movie_reviews.train.x), len(movie_reviews.valid.x)

(800, 200)

In [20]:
len(movie_reviews.vocab.itos), len(movie_reviews.vocab.stoi)

(6008, 19161)

In [21]:
movie_reviews.vocab.stoi['love']

142

In [22]:
movie_reviews.vocab.itos[142]

'love'

In [24]:
movie_reviews.vocab.itos[140:149]

['life', 'characters', 'love', 'your', 'here', 'know', 'scenes', 'best', 'end']

In [28]:
movie_reviews.vocab.itos[6000:]

['wtc',
 'portuguese',
 'della',
 'contractor',
 'coaxes',
 'mabuse',
 'greyson',
 'sollett']

In [30]:
movie_reviews.vocab.itos[:20]

['xxunk',
 'xxpad',
 'xxbos',
 'xxeos',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 'the',
 '.',
 ',',
 'and',
 'a',
 'of',
 'to',
 'is',
 'it',
 'in',
 'i']

In [36]:
from itertools import *
list(islice(movie_reviews.vocab.stoi.items(), 20))

[('xxunk', 0),
 ('xxpad', 1),
 ('xxbos', 2),
 ('xxeos', 3),
 ('xxfld', 4),
 ('xxmaj', 5),
 ('xxup', 6),
 ('xxrep', 7),
 ('xxwrep', 8),
 ('the', 9),
 ('.', 10),
 (',', 11),
 ('and', 12),
 ('a', 13),
 ('of', 14),
 ('to', 15),
 ('is', 16),
 ('it', 17),
 ('in', 18),
 ('i', 19)]

In [40]:
# Look up a word that is not in the vocabulary; the recorded output shows it
# resolves to index 0, i.e. the 'xxunk' token.
# NOTE(review): stoi appears to behave like a defaultdict, so indexing with a
# missing word presumably inserts that word as a new key (len(stoi) is 19161
# before these lookups and 19163 later in the notebook) — confirm.
i = movie_reviews.vocab.stoi['bualabs']
movie_reviews.vocab.itos[i], i

('xxunk', 0)

In [42]:
i = movie_reviews.vocab.stoi['suvarnabhumi']
movie_reviews.vocab.itos[i], i

('xxunk', 0)

In [43]:
i = movie_reviews.vocab.stoi['airport']
movie_reviews.vocab.itos[i], i

('airport', 4978)

In [0]:
t = movie_reviews.train[0][0]

In [48]:
t.data[:30]

array([   2,    5, 4619,   25,    0,   25,  867,   52,    5, 3776,    5, 1800,   95,   37,   85,  191,   64,  935,
          0, 2738,  517,   18,   21,   11,   84, 2417,  192,   88, 3777,   64])

# 4. Creating Term-Document Matrix

## 4.1 Counter

In [50]:
# Feed Counter the runs 1..5, 1..4, 1..3, 1..2, 1 so that the value 1 is seen
# five times, 2 four times, and so on down to 5, which is seen once.
c = Counter(
    value
    for run_length in range(5, 0, -1)
    for value in range(1, run_length + 1)
)
c

Counter({1: 5, 2: 4, 3: 3, 4: 2, 5: 1})

In [53]:
c.keys(), c.values()

(dict_keys([1, 2, 3, 4, 5]), dict_values([5, 4, 3, 2, 1]))

## 4.2 Sparse Matrix

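What is a sparse matrix? It is a matrix in which most entries are zero, so only the non-zero entries need to be stored. A term-document matrix is sparse because each review uses only a small fraction of the whole vocabulary, as the token counts below show.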

In [54]:
Counter(movie_reviews.valid.x[0].data)

Counter({0: 32,
         2: 1,
         5: 32,
         6: 1,
         9: 10,
         10: 7,
         11: 10,
         12: 1,
         13: 4,
         14: 6,
         15: 6,
         16: 4,
         18: 2,
         20: 1,
         21: 3,
         23: 1,
         24: 3,
         25: 2,
         26: 1,
         27: 3,
         30: 1,
         44: 1,
         45: 1,
         49: 1,
         50: 3,
         52: 1,
         54: 2,
         58: 1,
         59: 1,
         63: 2,
         71: 1,
         74: 1,
         77: 1,
         84: 1,
         109: 1,
         115: 1,
         149: 1,
         189: 1,
         194: 1,
         197: 2,
         204: 1,
         207: 1,
         221: 1,
         239: 1,
         251: 1,
         258: 1,
         285: 1,
         288: 1,
         319: 1,
         324: 1,
         337: 1,
         358: 1,
         378: 1,
         404: 1,
         409: 1,
         430: 1,
         456: 1,
         478: 1,
         541: 1,
         571: 1,
         579: 1

In [60]:
idx = [0, 2, 5, 6, 9]
[movie_reviews.vocab.itos[i] for i in idx]

['xxunk', 'xxbos', 'xxmaj', 'xxup', 'the']

In [69]:
movie_reviews.valid.y[1], movie_reviews.valid.x[1]

(Category positive,
 Text xxbos i saw this movie once as a kid on the late - late show and fell in love with it . 
  
   xxmaj it took 30 + years , but i recently did find it on xxup dvd - it was n't cheap , either - in a xxunk that xxunk in war movies . xxmaj we watched it last night for the first time . xxmaj the audio was good , however it was grainy and had the trailers between xxunk . xxmaj even so , it was better than i remembered it . i was also impressed at how true it was to the play . 
  
   xxmaj the xxunk is around here xxunk . xxmaj if you 're xxunk in finding it , fire me a xxunk and i 'll see if i can get you the xxunk . xxunk)

In [71]:
movie_reviews.valid.y[1].data, movie_reviews.valid.x[1].data

(1, array([  2,  19, 248,  21, ...,   9,   0,  10,   0]))

In [0]:
def get_term_doc_matrix(label_list, vocab_len):
    """Build a sparse term-document count matrix in CSR format.

    Args:
        label_list: Iterable of documents. Each document must expose a
            ``data`` attribute holding its sequence of integer token ids.
        vocab_len: Number of columns of the matrix (the vocabulary size).
            NOTE(review): token ids are assumed to lie in ``[0, vocab_len)``;
            this is not validated here.

    Returns:
        A ``scipy.sparse.csr_matrix`` of integer counts with one row per
        document and ``vocab_len`` columns; entry ``[d, t]`` is the number of
        times token id ``t`` occurs in document ``d``.
    """
    j_indices = []  # column (token id) of every stored count, row after row
    values = []     # the counts themselves, aligned with j_indices
    indptr = [0]    # row d occupies j_indices[indptr[d]:indptr[d + 1]]

    for doc in label_list:
        feature_counter = Counter(doc.data)
        # Emit token ids in ascending order so that every CSR row has sorted
        # column indices instead of first-occurrence order.
        token_ids = sorted(feature_counter)
        j_indices.extend(token_ids)
        values.extend(feature_counter[token_id] for token_id in token_ids)
        indptr.append(len(j_indices))

    return scipy.sparse.csr_matrix((values, j_indices, indptr),
                                   shape=(len(indptr) - 1, vocab_len),
                                   dtype=int)

In [79]:
%%time
val_term_doc = get_term_doc_matrix(movie_reviews.valid.x, len(movie_reviews.vocab.itos))

CPU times: user 50.9 ms, sys: 331 µs, total: 51.3 ms
Wall time: 47.9 ms


In [80]:
%%time
trn_term_doc = get_term_doc_matrix(movie_reviews.train.x, len(movie_reviews.vocab.itos))

CPU times: user 172 ms, sys: 15.1 ms, total: 187 ms
Wall time: 175 ms


In [82]:
trn_term_doc.shape

(800, 6008)

In [90]:
val_term_doc.shape

(200, 6008)

## 4.3 Sparse Matrix to Dense Matrix

In [86]:
trn_term_doc[:, -10:]

<800x10 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [93]:
trn_term_doc.todense()[:10, :10]

matrix([[ 8,  0,  1,  0, ...,  0,  0,  0,  2],
        [22,  0,  1,  0, ...,  2,  0,  0, 27],
        [ 4,  0,  1,  0, ...,  2,  0,  0,  5],
        [13,  0,  1,  0, ...,  0,  0,  0, 16],
        ...,
        [ 4,  0,  1,  0, ...,  0,  0,  0, 19],
        [42,  0,  1,  0, ..., 14,  0,  0, 30],
        [18,  0,  1,  0, ...,  0,  0,  0, 15],
        [20,  0,  1,  0, ...,  1,  0,  0, 10]])

## 4.4 Data Exploration

In [91]:
movie_reviews.vocab.itos[-1:]

['sollett']

In [94]:
movie_reviews.vocab.itos[3]

'xxeos'

เราจะมาดูตัวอย่างข้อความ Record ที่ 1 ใน Validation Set ซึ่งมีคำว่า late ปรากฏอยู่ 2 ครั้งในข้อความ

In [95]:
review = movie_reviews.valid.x[1]; review

Text xxbos i saw this movie once as a kid on the late - late show and fell in love with it . 
 
  xxmaj it took 30 + years , but i recently did find it on xxup dvd - it was n't cheap , either - in a xxunk that xxunk in war movies . xxmaj we watched it last night for the first time . xxmaj the audio was good , however it was grainy and had the trailers between xxunk . xxmaj even so , it was better than i remembered it . i was also impressed at how true it was to the play . 
 
  xxmaj the xxunk is around here xxunk . xxmaj if you 're xxunk in finding it , fire me a xxunk and i 'll see if i can get you the xxunk . xxunk

คำว่า Late คือคำที่ 451 ใน vocab Dictionary

In [98]:
i = movie_reviews.vocab.stoi['late']; i

451

ดูใน Validation Term-Document Matrix ที่ Row 1, Column 451 จะได้ 2

In [100]:
val_term_doc[1, 451]

2

มีทั้งหมด 144 Token จาก 81 คำศัพท์

In [102]:
val_term_doc[1].sum(), (val_term_doc[1] > 0).sum()

(144, 81)

Token ที่ Numberize แล้ว

In [116]:
review.data

array([  2,  19, 248,  21, ...,   9,   0,  10,   0])

แปลง Token ที่ Numberize แล้ว กลับเป็น Token ข้อความ

In [117]:
[movie_reviews.vocab.itos[i] for i in review.data][:20]

['xxbos',
 'i',
 'saw',
 'this',
 'movie',
 'once',
 'as',
 'a',
 'kid',
 'on',
 'the',
 'late',
 '-',
 'late',
 'show',
 'and',
 'fell',
 'in',
 'love',
 'with']

## 4.5 Unknown Words

จำนวน itos ไม่เท่ากับ stoi เนื่องจากหลายคำกลายเป็น Unknown เพื่อลดขนาด vocab Dictionary

In [118]:
len(movie_reviews.vocab.itos), len(movie_reviews.vocab.stoi)

(6008, 19163)

ต่างกันถึง 13155 คำ

In [119]:
len(movie_reviews.vocab.stoi) - len(movie_reviews.vocab.itos)

13155

ดูรายการ Unknown Word ทั้งหมด

In [0]:
# for k, v in movie_reviews.vocab.stoi.items():
#     print(k)

In [134]:
unk = [k for k, v in movie_reviews.vocab.stoi.items() if v == 0]
len(unk), unk[:20]

(13158,
 ['xxunk',
  'bleeping',
  'pert',
  'ticky',
  'schtick',
  'whoosh',
  'banzai',
  'chill',
  'wooofff',
  'cheery',
  'superstars',
  'fashionable',
  'cruelly',
  'separating',
  'mistreat',
  'tensions',
  'religions',
  'baseness',
  'nobility',
  'puro'])

# 5. Sentiment Classification

## 5.1 Naive Bayes


We define the log-count ratio $r$ for each word $f$:

$r = \log \frac{\text{ratio of feature } f \text{ in positive documents}}{\text{ratio of feature } f \text{ in negative documents}}$

where ratio of feature $f$ in positive documents is the number of times a positive document has a feature divided by the number of positive documents.

In [138]:
movie_reviews.y.classes

['negative', 'positive']

In [0]:
x = trn_term_doc
y = movie_reviews.train.y
val_y = movie_reviews.valid.y

In [141]:
positive = y.c2i['positive']
negative = y.c2i['negative']

positive, negative

(1, 0)

0