In [1]:
import os
import random
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, Flatten, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lol19\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lol19\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lol19\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Global settings
def estimate_accuracy(X_test, y_test, y_pred):
    loss, accuracy = model.evaluate(X_test, y_test, verbose = 0)
    print(f"\nTest loss:          {loss}")
    print(f"Test accuracy:      {accuracy}\n")

    # Convert probabilities to class labels
    y_pred_labels = np.argmax(y_pred, axis = 1)
    print(classification_report(y_test, y_pred_labels))
    print(confusion_matrix(y_test, y_pred_labels))


random_state = 2291
pd.set_option("display.max_colwidth", 200)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # Disable warnings
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)

In [3]:
# RECURRENT NEURAL NETWORK (RNN)
# Load dataset:     https://www.kaggle.com/datasets/lakshmi25npathi/images
file_dataset = "input/Youtube02-KatyPerry.csv"
data = pd.read_csv(file_dataset, encoding = "latin-1")
data = data.drop("COMMENT_ID", axis = 1)
data = data.drop("AUTHOR", axis = 1)
data = data.drop("DATE", axis = 1)
feature_names = {
    "CONTENT": "Content",
    "CLASS": "Category"
}
data = data.rename(columns = feature_names)
data.info()
display(data)
unique_vals = data["Category"].unique()
print(unique_vals)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Content   350 non-null    object
 1   Category  350 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.6+ KB


Unnamed: 0,Content,Category
0,i love this so much. AND also I Generate Free Leads on Auto Pilot &amp; You Can Too! http://www.MyLeaderGate.com/moretrafficï»¿,1
1,http://www.billboard.com/articles/columns/pop-shop/6174122/fan-army-face-off-round-3 Vote for SONES please....we're against vips....please help us.. &gt;.&lt;ï»¿,1
2,Hey guys! Please join me in my fight to help abused/mistreated animals! All fund will go to helping pay for vet bills/and or helping them find homes! I will place an extra emphasis on helping di...,1
3,http://psnboss.com/?ref=2tGgp3pV6L this is the songï»¿,1
4,Hey everyone. Watch this trailer!!!!!!!! http://believemefilm.com?hlr=h2hQBUVBï»¿,1
...,...,...
345,This song means so much to me thank you soooooooooooooooooooooooooooooooooooooooo much:-) Xxxï»¿,0
346,&lt;3ï»¿,0
347,"KATY PERRY, I AM THE ""DÃCIO CABELO"", ""DECIO HAIR"". I AM 60 YEARS OF AGE. I DON""T HAVE FAMILY. I""M SINGLE. ALONE. HOMELESS. I WAS AN ALCOHOLIC: 15 AT THE AGE OF 46. I AM AN INVISIBLE COMPOSER. M...",1
348,Honestly speaking except taylor swift and adele i don't lile any of the modern day singers. But i must say whenever i hear this song i feel goosebumps. Its quite inspiring!! Thanks miss Perry!ï»¿,0


[1 0]


In [4]:
# Preprocess texts
def clean_text(text):
    tokens = word_tokenize(text)  # Split text into tokens

    # Remove punctuation, convert to lower case, clean stop words
    stop_words = stopwords.words("english")
    words = []
    for token in tokens:
        if token.isalpha():
            word = token.lower()
            if word not in stop_words:
                words.append(word)

    # Perform stemming
    stemmer = PorterStemmer()
    stemmed_words = []
    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_words.append(stemmed_word)
    stemmed_text = " ".join(stemmed_words)

    return stemmed_text


data_before = data.copy()
data["Content"] = data["Content"].apply(clean_text)
max_length = data["Content"].apply(len).max()
display(data_before.head(6))
display(data.head(6))

Unnamed: 0,Content,Category
0,i love this so much. AND also I Generate Free Leads on Auto Pilot &amp; You Can Too! http://www.MyLeaderGate.com/moretrafficï»¿,1
1,http://www.billboard.com/articles/columns/pop-shop/6174122/fan-army-face-off-round-3 Vote for SONES please....we're against vips....please help us.. &gt;.&lt;ï»¿,1
2,Hey guys! Please join me in my fight to help abused/mistreated animals! All fund will go to helping pay for vet bills/and or helping them find homes! I will place an extra emphasis on helping di...,1
3,http://psnboss.com/?ref=2tGgp3pV6L this is the songï»¿,1
4,Hey everyone. Watch this trailer!!!!!!!! http://believemefilm.com?hlr=h2hQBUVBï»¿,1
5,check out my rapping hope you guys like it https://soundcloud.com/nereboy/call-of-the-lostproduce-by-atlastatlas-rapper-jkork and follow and message me :)ï»¿,1


Unnamed: 0,Content,Category
0,love much also gener free lead auto pilot amp http,1
1,http vote sone pleas vip pleas help us gt lt ï,1
2,hey guy pleas join fight help anim fund go help pay vet help find home place extra emphasi help disabl anim one otherwis would put sleep anim organ donat pleas http,1
3,http songï,1
4,hey everyon watch trailer http,1
5,check rap hope guy like http follow messag ï,1


In [5]:
# Split dataset into training set and test set
X = data["Content"]
y = data["Category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts = X_train)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size:    {vocab_size}")
print(f"Max text length:    {max_length}\n")

Vocabulary size:    895
Max text length:    320



In [6]:
# Convert texts to sequences of indexes. Then add padding
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print("Text to integer sequences:\n"
      f"  (preprocessed) -> {preprocessed_text}\n"
      f"  (tokenized)    -> {X_train[0]}")
X_train = pad_sequences(X_train, maxlen = max_length)
X_test = pad_sequences(X_test, maxlen = max_length)

Text to integer sequences:
  (preprocessed) -> hey guy go check video name growtopia stori ï
  (tokenized)    -> [17, 13, 21, 11, 5, 185, 308, 309, 1]


In [7]:
# Create RNN model
model = Sequential(
    [
        Embedding(input_dim = vocab_size, input_length = max_length, output_dim = 32),
        LSTM(units = 64, return_sequences = True),
        Dropout(0.2),
        LSTM(units = 64),
        Dropout(0.3),
        Dense(units = len(unique_vals), activation = "softmax")
    ]
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 320, 32)           28640     
                                                                 
 lstm (LSTM)                 (None, 320, 64)           24832     
                                                                 
 dropout (Dropout)           (None, 320, 64)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 86626 (338.38 KB)
Trainable params: 86626

In [8]:
# Compile and fit RNN
model.compile(
    optimizer = "adam",
    loss = "sparse_categorical_crossentropy",
    metrics = ["accuracy"]
)
model.fit(X_train, y_train, batch_size = 128, epochs = 35, validation_split = 0.1)
y_pred = model.predict(X_test)
estimate_accuracy(X_test, y_test, y_pred)


Epoch 1/35


Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35

Test loss:          0.2910350263118744
Test accuracy:      0.8714285492897034

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        40
           1       0.86      0.83      0.85        30

    accuracy                           0.87        70
   macro avg       0.87      0.87      0.87        70
weighted avg       0.87      0.87      0.87        70

[[36  4]
 [ 5 25]]
