In [None]:
!pip install deep-translator

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator
from googletrans import Translator
import re
import string
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources used later: the stop-word lists and the Punkt
# sentence-tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data from the 'punkt_tab' package
# instead of 'punkt'; download both so word_tokenize works on either version.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import random
from tensorflow.keras import layers
from keras.layers import InputLayer, MaxPooling2D, Flatten, Dense, Conv2D, Dropout, BatchNormalization
import tensorflow as tf
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/gdrive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
% cd /content/gdrive/My Drive/Final Project

In [None]:
# Load the three dataset splits from the current working directory.
train, test, valid = (
    pd.read_csv(csv_name)
    for csv_name in ('training.csv', 'test.csv', 'validation.csv')
)

In [None]:
# Peek at the first training text.
train['text'][0]

In [None]:
# Row index of the raw training frame (idx is reassigned from train2 further down).
idx=train.index

In [None]:
# Column dtypes and non-null counts for each of the three splits.
test.info()

In [None]:
valid.info()

In [None]:
train.info()

In [None]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
train2 = (
    train
    .sample(frac=1, random_state=913)
    .reset_index(drop=True)
)
train2

In [None]:
#define pipeline translator
# Translate into Indonesian ('id') and let the service detect the input
# language. GoogleTranslator names its language arguments `source` and `target`.
indo=GoogleTranslator(source='auto',target='id')

In [None]:
# Row labels of the shuffled frame; the translation loop iterates over them.
idx=train2.index

In [None]:
# Translate each text in idx order, one request per row, collecting the results.
hasil = [indo.translate(text) for text in train2.loc[idx, 'text']]

In [None]:
# Attach the translations as a new 'indo' column.
train2['indo']=hasil

In [None]:
# Keep only the translated text and its label.
train_indo=train2[['indo','label']]

In [None]:
# Spot-check ten random translated rows.
train_indo.sample(10)

# Data Preprocessing

In [None]:
# Load the text data from data_1.csv; this rebinds train_indo, replacing the
# frame built in the translation section.
train_indo=pd.read_csv('data_1.csv')

In [None]:
# Attach the labels stored in train_shuffled.csv and give the text column
# (named '0' in the loaded frame) the name 'original'.
labels = pd.read_csv('train_shuffled.csv')
train_indo = (
    train_indo
    .assign(label=labels['label'])
    .rename(columns={'0': 'original'})
)
train_indo.head()

In [None]:
# Store the text column with the generic object dtype.
train_indo['original']=train_indo['original'].astype('object')

In [None]:
# Lowercase every text entry.
train_indo['original'] = train_indo['original'].map(str.lower)

In [None]:
# Row labels of train_indo, displayed for inspection.
idx2=train_indo.index
idx2

In [None]:
#remove numbers and url
# Strip digit runs first, then anything starting with 'http' up to the next
# whitespace. The vectorized .str.replace assigns the whole column at once:
# writing cell by cell through df[col][i] = ... is chained assignment, which
# pandas warns about and which does not update the frame under copy-on-write.
train_indo['original'] = (
    train_indo['original']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'http\S+', '', regex=True)
)

In [None]:
#remove punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in string.punctuation dropped."""
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return "".join(kept)

# Store the punctuation-free text back in the 'original' column.
train_indo['original'] = train_indo['original'].apply(remove_punctuation)

In [None]:
# Alias for the cleaned text column.
sentences=train_indo['original']

In [None]:
# NLTK's Indonesian stop-word list, held as a set for membership tests.
stop_words=set(stopwords.words('indonesian'))

In [None]:
#tokenization
# Split every cleaned text into a list of tokens with NLTK. apply() builds the
# whole column in one assignment; storing lists cell by cell through
# df[col][i] = ... is chained assignment and is not guaranteed to reach the frame.
train_indo['indo_token'] = train_indo['original'].apply(word_tokenize)

In [None]:
# Display the frame to inspect the tokenization result.
train_indo

In [None]:
#remove stop words
stop_words = set(stopwords.words('indonesian'))

def remove_stopwords(text):
    """Return the items of `text` that are not in the module-level stop_words set."""
    kept = []
    for token in text:
        if token not in stop_words:
            kept.append(token)
    return kept

# Filter every token list and keep the result in a new 'indo_stop' column.
train_indo['indo_stop'] = train_indo['indo_token'].apply(remove_stopwords)

In [None]:
# Display the frame to inspect the stop-word filtering result.
train_indo

In [None]:
# Quick check of how word_tokenize splits quotes and punctuation.
sentence = 'He said "hey Bill!"'
word_tokenize(sentence)

In [None]:
# Persist the preprocessed frame. index=False keeps the row index out of the
# file, so reading it back does not add a spurious 'Unnamed: 0' column.
# NOTE: CSV stores the list-valued columns (indo_token, indo_stop) as text.
train_indo.to_csv('16k_data.csv', index=False)

In [None]:
# Read the saved frame back. Values that were Python lists when written are
# plain strings after the CSV round trip.
data_train=pd.read_csv('16k_data.csv')

In [None]:
# Display the reloaded frame.
data_train

# Modelling

In [None]:
# Labels used to stratify the first split.
y=train_indo['label']

In [None]:
# Hold out 15% of the rows, stratified by label, for test + validation.
train_set, temp_df = train_test_split(train_indo, test_size=0.15,random_state=123,stratify=y)

In [None]:
# Labels of the held-out rows, used to stratify the second split.
y2=temp_df['label']

In [None]:
# Split the held-out rows in half: test_set and valid_set.
test_set, valid_set = train_test_split(temp_df, test_size=0.5,random_state=123,stratify=y2)

In [None]:
# 95/5 stratified split of the reloaded frame. Stratify on data_train's own
# label column so the split does not depend on a Series taken from another
# DataFrame happening to have the same rows in the same order.
train_df, test_df = train_test_split(data_train, test_size=0.05,random_state=123,stratify=data_train['label'])

In [None]:
# Features are the stop-word-filtered text; targets are the class labels.
X_train, X_test = train_df.indo_stop, test_df.indo_stop
y_train, y_test = train_df.label, test_df.label

def prepare_target(y_train, y_test):
  """One-hot encode the training and test label vectors with to_categorical."""
  return to_categorical(y_train), to_categorical(y_test)

y_train_array, y_test_array = prepare_target(y_train, y_test)


In [None]:
def prepare_target(y_train, y_test,y_valid):
  """One-hot encode the train, test and validation label vectors.

  Returns the three encoded arrays in the same order as the arguments.
  """
  y_train_enc = to_categorical(y_train)
  y_test_enc = to_categorical(y_test)
  y_valid_enc = to_categorical(y_valid)
  return y_train_enc, y_test_enc,y_valid_enc


def _tokens_to_text(tokens):
  """Return a token list as one space-joined string; strings pass through unchanged."""
  return tokens if isinstance(tokens, str) else " ".join(tokens)


# Take all three partitions from the SAME stratified split of train_indo
# (train_set / test_set / valid_set). Fitting on train_df, which is an
# independent 95% sample of the same rows, would put most test rows in the
# training data and inflate every evaluation score.
# The token lists are joined into strings because the vectorizer consumes text.
X_train, y_train = train_set.indo_stop.map(_tokens_to_text), train_set.label
X_test, y_test = test_set.indo_stop.map(_tokens_to_text), test_set.label
X_valid, y_valid = valid_set.indo_stop.map(_tokens_to_text), valid_set.label

y_train_array, y_test_array,y_valid_array = prepare_target(y_train, y_test,y_valid)

# Hold out 20% of the training partition for per-epoch validation during fit;
# the seed makes the split (and therefore the training run) repeatable.
train_sent, val_sent, train_labels, val_labels = train_test_split(X_train.to_numpy(),
                                                                   y_train_array,
                                                                   test_size=0.2,
                                                                   random_state=123)


In [None]:
# First ten training sentences next to their one-hot labels.
train_sent[:10], train_labels[:10]

In [None]:
# Vocabulary is capped at 4000 tokens; every text becomes exactly 20 token ids.
max_vocab_length = 4000
max_length = 20

text_vectorization = TextVectorization(
    output_mode='int',
    output_sequence_length=max_length,
    max_tokens=max_vocab_length,
    split="whitespace",
    standardize="lower_and_strip_punctuation",
    ngrams=None,
)

In [None]:
# Text vectorization: build the vocabulary from the training sentences.
text_vectorization.adapt(train_sent)

In [None]:
# New sample message, vectorized to check the mapping.
sample = "aku merasa bahagia, pintar, cerdas, senang"
text_vectorization([sample])

In [None]:
# Look up the vocabulary entry stored at index 78.
text_vectorization.get_vocabulary()[78]

In [None]:
# Draw one training text at random and vectorize it.
# random.choice picks a position in range(len(seq)) and indexes with it; on a
# pandas Series that integer is looked up as an index label, and X_train keeps
# only a subset of the original labels, so sample from the underlying array.
random_text = random.choice(X_train.to_numpy())
print(f"Teks Asli : {random_text}\n")
text_vectorization([random_text])

# Create Model

In [None]:
# Embedding layer mapping each of the max_vocab_length token ids to a 128-d vector.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
    embeddings_initializer="uniform",
)

embedding

In [None]:
# Pick a random training sentence and show the shape of its embedded form.
random_text = random.choice(train_sent)
print(f"Teks Asli : {random_text}\n")
# Not the last expression of the cell, so this result is not displayed.
text_vectorization([random_text])

sample_embedded = embedding(text_vectorization([random_text]))
sample_embedded.shape

In [None]:
# Fully connected classifier: raw string -> token ids -> embeddings ->
# mean over the sequence -> dropout -> Dense/BatchNorm tower -> 6-way softmax.
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorization(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
# Five ReLU layers of shrinking width, each followed by batch normalization.
for depth, units in enumerate((500, 250, 125, 63, 32), start=1):
    x = Dense(units,
              activation='relu',
              name=f'hl{depth}',
              kernel_initializer=initializers.GlorotNormal(seed=123))(x)
    x = BatchNormalization(axis=1)(x)
outputs = layers.Dense(6, activation='softmax')(x)
model_1 = tf.keras.Model(inputs, outputs, name='model_nlp_fc')

In [None]:
# LSTM classifier: raw string -> token ids -> embeddings -> LSTM(64) ->
# dropout -> Dense/BatchNorm tower -> 6-way softmax.
#
# The model gets its own embedding layer. A Keras layer instance owns a single
# set of weights, so reusing the module-level `embedding` object here would tie
# this model's embeddings to every other model built on that object: training
# one would silently change the other's predictions.
embedding_lstm = layers.Embedding(input_dim=max_vocab_length,
                                  output_dim=128,
                                  embeddings_initializer="uniform",
                                  input_length=max_length)

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorization(inputs)
x = embedding_lstm(x)
x = layers.LSTM(64)(x)
x = Dropout(0.2)(x)
# Eight ReLU layers of shrinking width, each followed by batch normalization.
for depth, units in enumerate((500, 400, 300, 200, 100, 60, 30, 10), start=1):
    x = Dense(units,
              activation='relu',
              name=f'hl{depth}',
              kernel_initializer=initializers.GlorotNormal(seed=123))(x)
    x = BatchNormalization(axis=1)(x)
outputs = layers.Dense(6, activation='softmax')(x)
# Distinct model name so summaries and saved artifacts are not confused with model_1.
model_lstm = tf.keras.Model(inputs, outputs, name='model_nlp_lstm')

In [None]:
# Categorical cross-entropy on one-hot labels, Adam with default settings.
# Recall and Precision are given explicit names: metrics created from a bare
# string receive auto-generated names that gain a numeric suffix once the same
# metric has been created before in the session, which changes the keys of
# History.history ('recall' vs 'recall_1') from run to run.
model_1.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy",
                         tf.keras.metrics.Recall(name='recall'),
                         tf.keras.metrics.Precision(name='precision')])

In [None]:
# Categorical cross-entropy on one-hot labels, Adam with default settings.
# Recall and Precision are given explicit names: metrics created from a bare
# string receive auto-generated names that gain a numeric suffix once the same
# metric has been created before in the session, which changes the keys of
# History.history ('recall' vs 'recall_1') from run to run.
model_lstm.compile(loss='categorical_crossentropy',
                   optimizer=tf.keras.optimizers.Adam(),
                   metrics=["accuracy",
                            tf.keras.metrics.Recall(name='recall'),
                            tf.keras.metrics.Precision(name='precision')])

In [None]:
# Print the layer table, then train for 50 epochs while evaluating on the
# held-out validation split after every epoch.
model_1.summary()

# validation_data is the (inputs, targets) pair; verbose is a separate fit()
# argument and must sit outside that tuple.
model_1_hist = model_1.fit(train_sent,
                           train_labels,
                           epochs=50,
                           validation_data=(val_sent, val_labels),
                           verbose=1)

In [None]:
# Train the LSTM model for 50 epochs, evaluating on the validation split each epoch.
model_lstm_hist = model_lstm.fit(
    x=train_sent,
    y=train_labels,
    validation_data=(val_sent, val_labels),
    epochs=50,
)

In [None]:
# Model outputs for the test texts, one row of 6 softmax scores per text.
results_lstm=model_lstm.predict(X_test)
# Predicted class = position of the largest score in each row.
result_class_lstm=results_lstm.argmax(axis=1)
# Text report comparing the predicted classes with the test labels.
print(classification_report(y_test, result_class_lstm))

In [None]:
# visualize the chart
# One figure per metric, each showing the training curve and its validation twin.
lstm_history = model_lstm_hist.history
# Recall may have been recorded under a suffixed name such as 'recall_1';
# the first key starting with 'recall' is the training-side entry.
lstm_recall_key = next(key for key in lstm_history if key.startswith('recall'))

for history_key, label in (('accuracy', 'accuracy'),
                           ('loss', 'loss'),
                           (lstm_recall_key, 'recall')):
    ax = pd.DataFrame({label: lstm_history[history_key],
                       f'val_{label}': lstm_history[f'val_{history_key}']}).plot(figsize=(8, 5))
    ax.grid(True)
    ax.set_xlabel('epoch')
    ax.set_title(f'model_lstm {label}')
    if label == 'loss':
        # Cross-entropy is not bounded by 1; only anchor the axis at zero so
        # large loss values are not cut off.
        ax.set_ylim(bottom=0)
    else:
        ax.set_ylim(0, 1)
plt.show()

In [None]:
# Render the model_1 layer graph left-to-right, annotated with tensor shapes.
tf.keras.utils.plot_model(model_1, show_shapes=True, rankdir="LR")

In [None]:
# Model outputs for the test texts, one row of 6 softmax scores per text.
results=model_1.predict(X_test)

In [None]:
# Predicted class = position of the largest score in each row.
result_class=results.argmax(axis=1)

In [None]:
# Text report comparing the predicted classes with the test labels.
print(classification_report(y_test, result_class))

In [None]:
# visualize the chart
# One figure per metric, each showing the training curve and its validation twin.
fc_history = model_1_hist.history
# Recall may have been recorded under a suffixed name such as 'recall_1';
# the first key starting with 'recall' is the training-side entry.
fc_recall_key = next(key for key in fc_history if key.startswith('recall'))

for history_key, label in (('accuracy', 'accuracy'),
                           ('loss', 'loss'),
                           (fc_recall_key, 'recall')):
    ax = pd.DataFrame({label: fc_history[history_key],
                       f'val_{label}': fc_history[f'val_{history_key}']}).plot(figsize=(8, 5))
    ax.grid(True)
    ax.set_xlabel('epoch')
    ax.set_title(f'model_1 {label}')
    if label == 'loss':
        # Cross-entropy is not bounded by 1; only anchor the axis at zero so
        # large loss values are not cut off.
        ax.set_ylim(bottom=0)
    else:
        ax.set_ylim(0, 1)
plt.show()