In [2]:
# Load yelp_labelled.txt as a tab-separated table.
# Because names= is passed, pandas treats the first line as data, not as a header.
import pandas as pd

df = pd.read_csv(
    'yelp_labelled.txt',
    sep='\t',
    names=['sentences', 'label'],
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,sentences,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [21]:
# Split the data into training and test sets (test_size=0.2 -> 20% held out).
from sklearn.model_selection import train_test_split

kalimat = df['sentences'].values  # values of the 'sentences' column
y = df['label'].values            # values of the 'label' column

# A fixed random_state makes the shuffle deterministic, so the same rows land in
# train/test on every Restart & Run All and the reported metrics are comparable
# between runs. Without it each run trains and validates on a different split.
x_train, x_test, y_train, y_test = train_test_split(
    kalimat, y, test_size=0.2, random_state=42
)


In [22]:
# import the tokenizer and the padding utility
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# Tokenize the text and pad the resulting sequences.

# Tokenization: num_words=250 caps the vocabulary used when converting text to
# sequences; words outside it are replaced by the out-of-vocabulary token 'x'.
tokenizer = Tokenizer(num_words=250, oov_token='x')
# Fit the vocabulary on the training sentences only. Fitting on x_test as well
# would leak test-set word statistics into preprocessing and make the
# validation scores optimistic; unseen test words map to the OOV token instead.
tokenizer.fit_on_texts(x_train)

# Convert each sentence into a sequence of integer word indices.
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad the sequences so they all have the same length.
train_padded = pad_sequences(train_sequences)
# Without maxlen, pad_sequences pads to the longest sequence of the list it is
# given, so train and test could end up with different widths. Reuse the
# training width so both arrays have the same second dimension.
test_padded = pad_sequences(test_sequences, maxlen=train_padded.shape[1])

In [24]:
# An embedding layer is needed to turn the padded sequences into vectors,
# whose similarity can then be computed.
# An embedding is a representation of an object as a continuous vector; the object can be
# a sentence, a film, a person, etc. -- it is not limited to text.
import tensorflow as tf

model = tf.keras.models.Sequential([
    # tf.keras.layers.Embedding(vocabulary size, embedding dimension)
    # The vocabulary size (250) matches Tokenizer(num_words=250) used for the
    # sequences. No input_length is given: the padded width comes from
    # pad_sequences (called without maxlen), so a hard-coded 20 is not
    # guaranteed to match it, and the argument is deprecated in Keras 3.
    # The sequence length is taken from the data at fit time instead.
    tf.keras.layers.Embedding(250, 16),
    # Average the word vectors over the sequence axis -> one vector per sentence.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: binary classification output.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [25]:
# Train the embedding model for 30 epochs; (test_padded, y_test) is evaluated
# after every epoch as validation data.
history = model.fit(
    x=train_padded,
    y=y_train,
    epochs=30,
    validation_data=(test_padded, y_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
# An LSTM (long short-term memory) layer can be used to classify the sentences better.
# An LSTM takes the order of the words in a sentence into account and processes
# them in sequence, so it can predict which class the sentence belongs to.

model2 = tf.keras.models.Sequential([
    # tf.keras.layers.Embedding(vocabulary size, embedding dimension)
    # input_dim is 250 to match Tokenizer(num_words=250), the tokenizer that
    # produced train_padded/test_padded: its sequences only contain indices
    # below num_words, so a larger table (previously 5000) just adds embedding
    # rows that are never looked up or trained.
    tf.keras.layers.Embedding(input_dim=250, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: binary classification output.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model2.compile(optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [30]:
# Train the LSTM model for 30 epochs on the same padded data.
# NOTE: this rebinds `history`, replacing the first model's training history.
history = model2.fit(
    x=train_padded,
    y=y_train,
    epochs=30,
    validation_data=(test_padded, y_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
