## Import Libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
#from model_persistance import ModelPersistance
from joblib import dump, load
#from evaluate_classification import EvaluateBinaryClassification

## Initialise random seeds and TensorBoard

In [2]:
# One seed shared by every random number generator this notebook imports, so
# that repeated runs start from the same pseudo-random state.
SEED = 123
rn.seed(SEED)             # Python's built-in `random` module (imported as `rn`)
np.random.seed(SEED)      # NumPy's legacy global generator
tf.random.set_seed(SEED)  # TensorFlow's global seed

In [3]:
# Read the test and train CSV files (paths are relative to the working directory).
df_test, df_train = (
    pd.read_csv(csv_path)
    for csv_path in ('data_test_clean.csv', 'data_train_clean.csv')
)

In [4]:
# Training texts and their `HS` labels as NumPy arrays.
# A tweet that is empty in the CSV is read back by pandas as NaN (a float),
# which would break text tokenisation later on; map such rows to '' and make
# sure every entry is a str. Rows that already hold text are left unchanged.
X_train = df_train['Tweet_Parsed'].fillna('').astype(str).values
y_train = df_train['HS'].values

In [5]:
# Test texts and their `HS` labels as NumPy arrays.
# A tweet that is empty in the CSV is read back by pandas as NaN (a float),
# which would break text tokenisation later on; map such rows to '' and make
# sure every entry is a str. Rows that already hold text are left unchanged.
X_test = df_test['Tweet_Parsed'].fillna('').astype(str).values
y_test = df_test['HS'].values

## Transforming the data into a format suitable for the model

In [6]:
# Both helpers are imported from the same `tensorflow.keras` namespace so the
# cell does not depend on two differently-rooted Keras packages agreeing.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Upper bound on the vocabulary size passed to the tokenizer; the same value
# is used later as the number of rows of the embedding table.
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
# The vocabulary is fitted on the training texts only; the test texts are
# merely converted with it below.
tokenizer.fit_on_texts(X_train)

# Training texts -> lists of integer word indices.
xtrain = tokenizer.texts_to_sequences(X_train)
# Length of the longest training sequence; every sequence (train and test)
# is padded/truncated to this length.
maxlen = max(map(len, xtrain))
xtrain = pad_sequences(xtrain, maxlen=maxlen)

# Test texts go through the same tokenizer and the same target length.
xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

## Loading the word embeddings and mapping the vocabulary to them

In [7]:
# NOTE(review): disabled earlier attempt at building the embedding matrix,
# kept for reference only. As written it could not run:
#   * `FastText` is used but only `KeyedVectors` is imported (this matches the
#     NameError recorded below this cell);
#   * `W2V_BASE` would hold the loaded model, not a path prefix, so
#     `W2V_BASE+'vectors.txt'` is not a valid file name;
#   * the matrix is 200 columns wide, whereas the active cell that follows
#     allocates 300 columns for the same fastText file.
# from gensim.models import KeyedVectors
# W2V_BASE = FastText.load_fasttext_format('../Test Feature Expansion/fasttext/cc.id.300.bin')
# model_ug_cbow = KeyedVectors.load(W2V_BASE+'vectors.txt')

# embeddings_index = {}
# for w in model_ug_cbow.wv.vocab.keys():
#     embeddings_index[w] = model_ug_cbow.wv[w]

# embedding_matrix = np.zeros((num_words, 200))
# for word, i in tokenizer.word_index.items():
#     if i >= num_words:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

NameError: name 'FastText' is not defined

In [8]:
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model

# fastText binary model file; the path is relative to the working directory.
model_path = '../Test Feature Expansion/fasttext/cc.id.300.bin'
# `FastText.load_fasttext_format` is deprecated in gensim; `load_facebook_model`
# is its documented replacement and likewise returns a FastText model whose
# vectors are reachable through `.wv`.
model = load_facebook_model(model_path)
word_vectors = model.wv

# Load the tokenizer and set num_words as required
# tokenizer = ...

# Initialise embedding_matrix: row i will hold the vector of the word whose
# tokenizer index is i. The width is taken from the loaded model so the matrix
# always matches the vectors being copied in. Rows for which no vector is
# found stay all-zero.
embedding_matrix = np.zeros((num_words, word_vectors.vector_size))

for word, i in tokenizer.word_index.items():
    # Indices at or beyond num_words have no row in the matrix.
    if i >= num_words:
        continue
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# NOTE(review): the training cell further down calls create_lstm_model()
# without passing this matrix, so these vectors do not reach the model there.

  model = FastText.load_fasttext_format(model_path)


## Creating the LSTM model and training it for 3 epochs

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Input, Activation
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from keras.models import Model

def create_lstm_model(embedding_weights=None, embedding_dim=300, lstm_units=64,
                      dropout_rate=0.5, trainable_embeddings=True):
    """Build and compile the tweet classifier: Embedding -> Dropout -> LSTM -> Dropout -> Dense(1) -> sigmoid.

    Every parameter is optional. Calling ``create_lstm_model()`` with no
    arguments builds the same network as before: a default-initialised,
    trainable 300-wide embedding, a 64-unit LSTM and dropout of 0.5.

    The function reads the notebook globals ``maxlen`` (input sequence length)
    and ``num_words`` (number of rows of the embedding table).

    Args:
        embedding_weights: optional 2-D array of shape ``(num_words, dim)``
            used as the initial embedding table. When given, ``dim`` overrides
            ``embedding_dim``.
        embedding_dim: width of the embedding vectors when no weights are given.
        lstm_units: number of units in the LSTM layer.
        dropout_rate: rate used by both Dropout layers.
        trainable_embeddings: whether the embedding table is updated in training.

    Returns:
        A compiled Keras ``Model`` taking int32 sequences of length ``maxlen``
        and producing one sigmoid output per sample (binary cross-entropy loss,
        Adam optimiser, accuracy metric).

    Raises:
        ValueError: if ``embedding_weights`` is not 2-D with ``num_words`` rows.
    """
    embedding_options = {'input_length': maxlen, 'trainable': trainable_embeddings}
    if embedding_weights is not None:
        embedding_weights = np.asarray(embedding_weights, dtype='float32')
        if embedding_weights.ndim != 2 or embedding_weights.shape[0] != num_words:
            raise ValueError(
                'embedding_weights must have shape (num_words, dim); got %r'
                % (embedding_weights.shape,)
            )
        # The table width is dictated by the supplied vectors.
        embedding_dim = embedding_weights.shape[1]
        embedding_options['embeddings_initializer'] = (
            tf.keras.initializers.Constant(embedding_weights)
        )

    tweet_input = Input(shape=(maxlen,), dtype='int32')
    tweet_encoder = Embedding(num_words, embedding_dim, **embedding_options)(tweet_input)
    tweet_encoder = Dropout(dropout_rate)(tweet_encoder)
    merged = LSTM(lstm_units)(tweet_encoder)
    merged = Dropout(dropout_rate)(merged)
    merged = Dense(1)(merged)
    output = Activation('sigmoid')(merged)
    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 3
BATCH_SIZE = 32

lstm_model = create_lstm_model()
# Keep the History object returned by fit() so the per-epoch metrics remain
# available after training; assigning it also stops the bare object repr from
# being shown as the cell output.
history = lstm_model.fit(xtrain, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1e10afae450>

## Evaluating the model on the test dataset

In [10]:
# NOTE(review): disabled evaluation path, kept for reference only. It relies on
# EvaluateBinaryClassification, whose import is also commented out in the
# imports cell at the top; the cell that follows evaluates the model with
# sklearn's classification_report instead.
# p = lstm_model.predict(xtest,verbose=1)
# predicted = [int(round(x[0])) for x in p]
# actual = y_test

# ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
# print(ebc.get_full_report())

In [11]:
from sklearn.metrics import classification_report

# Model outputs for the padded test sequences; the first column of each row is
# the sigmoid score.
p = lstm_model.predict(xtest, verbose=1)
# Round each score to the nearest integer to obtain the predicted class label.
predicted = list(map(lambda score_row: int(round(score_row[0])), p))
actual = y_test

print('\nClassification Report\n')
print(
    classification_report(
        actual,
        predicted,
        target_names=['0', '1'],
    )
)


Classification Report

              precision    recall  f1-score   support

           0       0.83      0.84      0.83      2204
           1       0.75      0.75      0.75      1476

    accuracy                           0.80      3680
   macro avg       0.79      0.79      0.79      3680
weighted avg       0.80      0.80      0.80      3680

