Submission 1 Membuat Model NLP dengan TensorFlow

BBC Text Classification

# Download dataset

In [None]:
!wget --no-check-certificate \
  https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv \
  -O /tmp/bbc-text.csv

--2021-07-10 03:51:26--  https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.153.128, 74.125.128.128, 74.125.143.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.153.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/tmp/bbc-text.csv’


2021-07-10 03:51:26 (152 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



# Read dataset

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/tmp/bbc-text.csv')

In [None]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

# One-hot encoding

In [None]:
category = pd.get_dummies(df.category)
df_new = pd.concat([df, category], axis=1)
df_new = df_new.drop(columns='category')
df_new

Unnamed: 0,text,business,entertainment,politics,sport,tech
0,tv future in the hands of viewers with home th...,0,0,0,0,1
1,worldcom boss left books alone former worldc...,1,0,0,0,0
2,tigers wary of farrell gamble leicester say ...,0,0,0,1,0
3,yeading face newcastle in fa cup premiership s...,0,0,0,1,0
4,ocean s twelve raids box office ocean s twelve...,0,1,0,0,0
...,...,...,...,...,...,...
2220,cars pull down us retail figures us retail sal...,1,0,0,0,0
2221,kilroy unveils immigration policy ex-chatshow ...,0,0,1,0,0
2222,rem announce new glasgow concert us band rem h...,0,1,0,0,0
2223,how political squabbles snowball it s become c...,0,0,1,0,0


# Data Preparation

In [None]:
# Converting to numpy array
text = df_new['text'].values
label = df_new[['business', 'entertainment', 'politics', 'sport', 'tech']].values

In [None]:
# Split the dataset, where validation set is equal to 0.2
from sklearn.model_selection import train_test_split
text_train, text_test, label_train, label_test = train_test_split(text, label, test_size=0.2)

In [None]:
# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token='-')
tokenizer.fit_on_texts(text_train) 
tokenizer.fit_on_texts(text_test)

In [None]:
# Sequence and Padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
sequence_train= tokenizer.texts_to_sequences(text_train)
sequence_test = tokenizer.texts_to_sequences(text_test)
 
padded_train = pad_sequences(sequence_train)
padded_test = pad_sequences(sequence_test)

# Building Model

In [None]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('val_accuracy')>0.9):
      print("\nAccuracy >90%!")
      self.model.stop_training = True
callbacks = myCallback()

In [22]:
history = model.fit(padded_train, label_train, epochs=50, callbacks=[callbacks],
                    validation_data=(padded_test, label_test), verbose=2)

Epoch 1/50
56/56 - 162s - loss: 1.6002 - accuracy: 0.2449 - val_loss: 1.5887 - val_accuracy: 0.2944
Epoch 2/50
56/56 - 160s - loss: 1.4061 - accuracy: 0.3978 - val_loss: 1.3405 - val_accuracy: 0.4157
Epoch 3/50
56/56 - 161s - loss: 1.0290 - accuracy: 0.5202 - val_loss: 0.9701 - val_accuracy: 0.6112
Epoch 4/50
56/56 - 160s - loss: 0.6596 - accuracy: 0.7247 - val_loss: 1.5267 - val_accuracy: 0.6202
Epoch 5/50
56/56 - 161s - loss: 0.5921 - accuracy: 0.7848 - val_loss: 0.8306 - val_accuracy: 0.7169
Epoch 6/50
56/56 - 162s - loss: 0.2090 - accuracy: 0.9315 - val_loss: 0.8569 - val_accuracy: 0.7551
Epoch 7/50
56/56 - 161s - loss: 0.1092 - accuracy: 0.9691 - val_loss: 0.9444 - val_accuracy: 0.7753
Epoch 8/50
56/56 - 162s - loss: 0.0423 - accuracy: 0.9860 - val_loss: 1.1677 - val_accuracy: 0.7528
Epoch 9/50
56/56 - 177s - loss: 0.0129 - accuracy: 0.9983 - val_loss: 1.0605 - val_accuracy: 0.7910
Epoch 10/50
56/56 - 164s - loss: 0.0049 - accuracy: 0.9989 - val_loss: 1.0733 - val_accuracy: 0.7843