# Neural Network Basics

* Feedforward neural network
* Training neural networks effectively

In [None]:
#%tensorflow_version 1.x

import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import Image, display_png
from gensim.models import word2vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, Dropout, Flatten, GlobalAveragePooling1D
from keras.utils import to_categorical, plot_model, pad_sequences
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

# Feedforward Neural Network

## 1. Load up data

In [None]:
!gdown --id 14l7wuSNFg0KEberTf-LoniK1e4k2bQMa  # wongnai-data.zip
!unzip -o wongnai-data.zip

Downloading...
From: https://drive.google.com/uc?id=14l7wuSNFg0KEberTf-LoniK1e4k2bQMa
To: /content/wongnai-data.zip
100% 15.1M/15.1M [00:00<00:00, 306MB/s]
Archive:  wongnai-data.zip
replace wongnai-dev.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: wongnai-dev.csv         
  inflating: wongnai-test.csv        
  inflating: wongnai-train.csv       


In [None]:
# Read the train / dev splits of the pre-tokenized Wongnai reviews.
train = pd.read_csv(
    'wongnai-train.csv',
    encoding='utf-8',
)
dev = pd.read_csv(
    'wongnai-dev.csv',
    encoding='utf-8',
)

# Preview the training split.
train

Unnamed: 0,star,tokenized
0,5,มา|ตามคำแนะนำ|จาก|เพจ| |kin| |dee| |by| |praew...
1,5,บรรยากาศ|ร้าน|น่า|นั่ง|สไตล์|ญี่ปุ่น| |อาหาร|อ...
2,5,หน้า|ร้าน|เล็ก|ๆ| |แต่|เข้าไป|มี|พื้นที่|พอสมค...
3,4,It|'|s| |really| |good| |that| |After| |You| |...
4,3,มา|ทาน|ร้าน|นี้|ตั้งแต่|ร้าน|ยัง|ไม่|มี|ห้อง|แ...
...,...,...
23995,3,สาขา|ฟิวเจอร์|รังสิต| |วันอาทิตย์|คน|เยอะ|มาก|...
23996,4,ติ๋ม|ซำ|หลากหลาย|หน้า| |ชิ้น|ค่อนข้าง|เล็ก| |ร...
23997,5,ขับ|มา|ระหว่างทาง|จาก|ระยอง|จะ|ไป|จังหวัด|จันท...
23998,4,ร้าน|นี้|ตั้งอยู่|ริมถนน|พระราม| |3| |จะ|อยู่...


In [None]:
# Add a "length" column to both splits: the number of '|' separators in the
# tokenized text (i.e. the token count minus one).
for frame in (train, dev):
    frame['length'] = frame['tokenized'].map(lambda text: text.count('|'))

# Preview the dev split with the new column.
dev

Unnamed: 0,star,tokenized,length
0,4,ร้าน|เล็ก|ๆ|ตกแต่ง|น่ารัก|สไตล์|ญี่ปุ่น| |อยู่...,65
1,4,แวะ|ไป|ชม|งาน|ครบรอบ| |150| |ปี| |สมเด็จ|พระพั...,466
2,3,ร้าน|ข้าวหมูแดง|สี|มรกต|เจ้าเก่า|จาก|เยาวราช| ...,231
3,3,ร้าน| |Food| |Truck| |อาหาร|ญี่ปุ่น|ชื่อดัง| |...,123
4,3,ร้าน|นี้|เส้น|เขา|อร่อย|มาก|เลย|คะ| |นุ่ม|หนึบ...,236
...,...,...,...
7995,2,วันนี้|ก็|เหมือนเคย| |กะ|ไป|ทานข้าว|เจ้าประจำ|...,151
7996,3,รีวิว|อัน|แรก|ก็|ต้อง|ด่า|ซะ|แล้ว| |\n|สำหรับ|...,175
7997,4,ร้าน|laduree| |ใน|london| |มี|ด้วยกัน|2| |สาขา...,696
7998,3,มื้อ|กลางวัน|วันนี้| |เรา|แวะ|มา|ทาน|ที่| |Je...,153


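รีวิวส่วนใหญ่มีความยาวเพียงประมาณ 100 คำ จึงไม่จำเป็นต้องใช้ทุกคำในรีวิว (Most reviews are only about 100 tokens long — the median is 110 — so we do not need to keep every token of every review.)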

In [None]:
# Summary statistics of the per-review length in the training split.
train['length'].describe()

count    24000.000000
mean       161.704833
std        165.658685
min          7.000000
25%         74.000000
50%        110.000000
75%        183.000000
max       5486.000000
Name: length, dtype: float64

In [None]:
# Fraction of training reviews whose length is below 1000.
is_short = train['length'] < 1000
is_short.value_counts(normalize=True)

True     0.99425
False    0.00575
Name: length, dtype: float64

In [None]:
# Summary statistics of the per-review length in the dev split.
dev['length'].describe()

count    8000.000000
mean      161.084125
std       156.850654
min        14.000000
25%        74.000000
50%       111.000000
75%       184.000000
max      2364.000000
Name: length, dtype: float64

## 2. Load up the pre-trained word embeddings

In [None]:
!gdown --id 14bv_aTSP-8rs_Bkudvpp8zcU3UpyRen6

Downloading...
From: https://drive.google.com/uc?id=14bv_aTSP-8rs_Bkudvpp8zcU3UpyRen6
To: /content/TNC_embeddings-100.bin
100% 25.9M/25.9M [00:00<00:00, 201MB/s]


In [None]:
# 50-unit version
#!gdown 1ZnBqhQcb4u_-OTkGX8QX0BweeJDzGa27
# 200-unit version
#!gdown 1771yA8EZro3pYM-4cUuTUgCqowDTSH_A

In [None]:
# Load the pre-trained embeddings from the binary word2vec file;
# undecodable bytes in the stored words are ignored rather than raising.
w2v_model = KeyedVectors.load_word2vec_format(
    'TNC_embeddings-100.bin',
    binary=True,
    unicode_errors='ignore',
)



In [None]:
# vocabulary size of pre-trained model
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `vocab` so the cell still runs on gensim 3.x.
w2v_vocab = w2v_model.key_to_index if hasattr(w2v_model, 'key_to_index') else w2v_model.vocab
vocab_size = len(w2v_vocab)
print('vocab size:', vocab_size)

# vector dimension, read from the model itself instead of probing a specific
# word that would have to exist in the vocabulary
vector_dim = w2v_model.vector_size
print('vector dimension:', vector_dim)



vocab size: 61658
vector dimension: 100


In [None]:

# Build the embedding weight matrix and the word -> row-index lookup.
# Row layout: 0 = padding, 1..vocab_size = pre-trained words, vocab_size + 1 = unknown word.
# The padding and unknown rows stay all-zero.

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `vocab` so the cell still runs on gensim 3.x.
vocab_words = list(
    w2v_model.key_to_index if hasattr(w2v_model, 'key_to_index') else w2v_model.vocab
)

# vocab size + 2 rows (for padding and unknown); np.zeros already zeroes row 0
embedding_matrix = np.zeros((vocab_size + 2, vector_dim), dtype="float32")

# word to index dictionary: pre-trained words are shifted by one to free index 0
word_to_index = {word: i + 1 for i, word in enumerate(vocab_words)}
word_to_index['PADDING'] = 0
# Use the explicit last row instead of len(word_to_index), which would collide with
# the last real word if 'PADDING' were already a vocabulary entry.
word_to_index['<UNK>'] = vocab_size + 1

# copy each pre-trained vector into its (shifted) row
for i, word in enumerate(vocab_words):
    embedding_matrix[i + 1] = w2v_model[word]

In [None]:
# example: look up the index assigned to the word 'จะ' in word_to_index
word_to_index['จะ']

8

## 3. Convert words into indices and pad + truncate sequences

## 4. Mapping labels

In [None]:
def get_label(df):
    """Map star ratings to 3-class sentiment labels and one-hot encode them.

    Stars 1-2 become class 0, star 3 becomes class 1 and stars 4-5 become
    class 2. As a side effect the integer class ids are stored in a new
    ``df['label']`` column.

    Args:
        df: DataFrame with a ``star`` column holding ratings from 1 to 5.

    Returns:
        The result of ``to_categorical(label, num_classes=3)``: one one-hot
        row per row of ``df``.

    Raises:
        ValueError: if ``star`` contains a missing value or a rating outside
            1-5. (``Series.replace`` would let such values through unchanged,
            to be silently treated as class ids.)
    """
    star_to_label = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}

    # `map` yields NaN for every rating without an entry in the mapping,
    # which makes invalid input detectable.
    mapped = df['star'].map(star_to_label)
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = df.loc[unmapped, 'star'].unique().tolist()
        raise ValueError(f"unexpected star rating(s): {bad_values}")

    label = mapped.to_numpy(dtype='int64')
    df['label'] = label
    return to_categorical(label, num_classes=3)

# One-hot label matrices for the train and dev splits (computed in that order).
train_y, dev_y = (get_label(split) for split in (train, dev))

In [None]:
# Peek at the first ten one-hot label rows.
train_y[:10]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

## 5. Train the model

In [None]:
# Sanity check: the matrix was allocated as (vocab_size + 2, vector_dim).
embedding_matrix.shape

(61660, 100)

## 6. Evaluate the model