## Import Libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
#from model_persistance import ModelPersistance
from joblib import dump, load
#from evaluate_classification import EvaluateBinaryClassification

## Initialise random seeds and load the data

In [2]:
# Seed every random number generator this notebook imports so that repeated
# runs start from the same random state.
SEED = 101
rn.seed(SEED)             # Python's built-in `random` module (imported as `rn`)
np.random.seed(SEED)      # NumPy's global generator
tf.random.set_seed(SEED)  # TensorFlow's global seed

In [3]:
# Read the train and test splits from CSV files in the working directory
# (the `_clean` suffix suggests they were preprocessed beforehand).
df_train = pd.read_csv('data_train_clean.csv')
df_test = pd.read_csv('data_test_clean.csv')

In [4]:
# Training inputs come from the `Tweet_Parsed` column, labels from `HS`.
X_train = df_train['Tweet_Parsed'].values
y_train = df_train['HS'].values

In [5]:
# Test inputs come from the `Tweet_Parsed` column, labels from `HS`.
X_test = df_test['Tweet_Parsed'].values
y_test = df_test['HS'].values

## Transforming the data into a model-ready format

In [6]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary size limit handed to the tokenizer.
num_words = 300000

# The word index is fitted on the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Encode the training texts as integer sequences, then pad every sequence to
# the length of the longest training sequence.
train_sequences = tokenizer.texts_to_sequences(X_train)
maxlen = max(len(sequence) for sequence in train_sequences)
xtrain = pad_sequences(train_sequences, maxlen=maxlen)

# The test texts reuse the same tokenizer and the same padded length.
test_sequences = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(test_sequences, maxlen=maxlen)

## Loading the word embeddings and mapping the vocabulary onto them

In [7]:
# NOTE: disabled earlier attempt at building the embedding matrix (it used
# 200-dimensional rows). Nothing in this cell executes; the active version is
# the cell that follows.
# from gensim.models import KeyedVectors
# W2V_BASE = FastText.load_fasttext_format('../Test Feature Expansion/fasttext/cc.id.300.bin')
# model_ug_cbow = KeyedVectors.load(W2V_BASE+'vectors.txt')

# embeddings_index = {}
# for w in model_ug_cbow.wv.vocab.keys():
#     embeddings_index[w] = model_ug_cbow.wv[w]

# embedding_matrix = np.zeros((num_words, 200))
# for word, i in tokenizer.word_index.items():
#     if i >= num_words:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [8]:
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model

# Pre-trained fastText binary (the file name suggests 300-dimensional vectors).
model_path = '../Test Feature Expansion/fasttext/cc.id.300.bin'
# `FastText.load_fasttext_format` is deprecated in gensim. `load_facebook_model`
# is its documented replacement and returns the same kind of model object, so
# `model.wv` keeps working for any later cell that uses it.
model = load_facebook_model(model_path)

# Load the tokenizer and set num_words as needed
# tokenizer = ...

# Initialise embedding_matrix: one 300-dimensional row per token index. Rows
# that are never assigned in the loop below stay all-zero.
embedding_matrix = np.zeros((num_words, 300))

word_vectors = model.wv
for word, i in tokenizer.word_index.items():
    # Token indices at or above num_words have no row in the matrix.
    if i >= num_words:
        continue
    # Only copy a vector when the fastText model reports it can provide one.
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

  model = FastText.load_fasttext_format(model_path)


## Creating the CNN model and training it for 4 epochs

In [15]:
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Input, concatenate, Activation
from keras.models import Model

def create_cnn_model(embedding_weights=None):
    """Build and compile the multi-branch 1-D CNN binary classifier.

    Each padded token sequence is embedded, passed through parallel
    Conv1D + global-max-pooling branches, concatenated, and reduced to a
    single sigmoid unit. The function reads the notebook-level variables
    `maxlen` and `num_words`.

    Parameters
    ----------
    embedding_weights : array-like of shape (num_words, 300), optional
        Initial values for the embedding layer, e.g. the notebook's
        `embedding_matrix`. When omitted (the default) the embedding layer
        keeps Keras' default initialiser, exactly as before this parameter
        existed.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        an accuracy metric.

    Raises
    ------
    ValueError
        If `embedding_weights` is given but its shape is not
        (num_words, 300).
    """
    embedding_dim = 300
    tweet_input = Input(shape=(maxlen,), dtype='int64')

    print('loading word vectors')
    # Only override the initialiser when weights are supplied, so the default
    # call builds the same randomly initialised layer as before.
    embedding_kwargs = {}
    if embedding_weights is not None:
        from keras.initializers import Constant

        weights = np.asarray(embedding_weights)
        expected_shape = (num_words, embedding_dim)
        if weights.shape != expected_shape:
            raise ValueError(
                'embedding_weights must have shape %s, got %s'
                % (expected_shape, weights.shape)
            )
        embedding_kwargs['embeddings_initializer'] = Constant(weights)
    tweet_encoder = Embedding(num_words, embedding_dim, input_length=maxlen,
                              trainable=True, **embedding_kwargs)(tweet_input)
    tweet_encoder = Dropout(0.5)(tweet_encoder)

    # NOTE: the branch names lag the kernel widths by one
    # ("bigram" -> 3, "trigram" -> 4, "fourgram" -> 5).
    # The "bigram" branch is built but is not part of `merged` below; it is
    # only referenced by the commented-out three-way concatenate.
    bigram_branch = Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
    bigram_branch = GlobalMaxPooling1D()(bigram_branch)
    bigram_branch = Dropout(0.5)(bigram_branch)

    trigram_branch = Conv1D(filters=256, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
    trigram_branch = GlobalMaxPooling1D()(trigram_branch)
    trigram_branch = Dropout(0.2)(trigram_branch)

    fourgram_branch = Conv1D(filters=512, kernel_size=5, padding='valid', activation='relu', strides=1)(tweet_encoder)
    fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
    fourgram_branch = Dropout(0.2)(fourgram_branch)

    #merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)
    merged = concatenate([trigram_branch, fourgram_branch], axis=1)
    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.5)(merged)

    # Single logit followed by a sigmoid gives one probability per sample.
    merged = Dense(1)(merged)
    output = Activation('sigmoid')(merged)

    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.summary()
    return model

# Build a fresh model and train it on the padded training sequences.
cnn_model = create_cnn_model()
cnn_model.fit(x=xtrain, y=y_train, batch_size=32, epochs=4, verbose=1)

loading word vectors
Epoch 1/4


TypeError: in user code:

    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\losses.py", line 160, in __call__
        return losses_utils.compute_weighted_loss(
    File "C:\Users\gilan\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\utils\losses_utils.py", line 328, in compute_weighted_loss
        losses = tf.convert_to_tensor(losses)

    TypeError: Failed to convert elements of <keras.losses.SparseCategoricalCrossentropy object at 0x0000021A4E2BFB10> to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


## Evaluating the model with test dataset

In [11]:
# NOTE: disabled evaluation of an LSTM model. It refers to `lstm_model` and
# `EvaluateBinaryClassification`, neither of which is defined or imported in
# the visible notebook code (the import of the latter is commented out).
# p = lstm_model.predict(xtest,verbose=1)
# predicted = [int(round(x[0])) for x in p]
# actual = y_test

# ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
# print(ebc.get_full_report())

In [12]:
from sklearn.metrics import classification_report

# Model outputs for the test set; the first element of each row is the
# predicted probability.
p = cnn_model.predict(xtest, verbose=1)
# Round each probability to the nearest integer to obtain a hard 0/1 label.
predicted = [int(round(row[0])) for row in p]
actual = y_test

print('\nClassification Report\n')
report = classification_report(actual, predicted, target_names=['0', '1'])
print(report)


Classification Report

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      2204
           1       0.75      0.74      0.74      1476

    accuracy                           0.80      3680
   macro avg       0.79      0.79      0.79      3680
weighted avg       0.80      0.80      0.80      3680

