In [1]:
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential

[nltk_data] Downloading package punkt to /home/benjamin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-07-09 09:30:29.669964: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv(
    '/home/benjamin/Documents/OpenClassroomsDatasets/sentiment/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding = "ISO-8859-1",
    names=["target", "id", "date", "flag", "user", "text"]
)

data = data.drop(columns=["id", "date", "flag", "user"])

data.target = data.target.map(
    {
        0: 0.0,
        2: 0.0,
        4: 1.0,
    }
)

def sample_equal_classes(df, n_pos=100000, n_neg=100000):
    df_pos = df[df["target"] == 1.0].sample(n=n_pos)
    df_neg = df[df["target"] == 0.0].sample(n=n_neg)
    
    return pd.concat([df_pos, df_neg]).reset_index(drop=True)
sampled_df = sample_equal_classes(data)

In [3]:
def tweeter(sentence):
    stemmer = PorterStemmer()
    tk = TweetTokenizer(preserve_case=False, reduce_len=True)
    tok_sent = tk.tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    text = [stemmer.stem(word.lower()) 
            for word in tok_sent 
            if word not in stop_words
            and word.isalpha()==True]
    sent=""
    for word in text:
        sent+=word+" "
    return sent[:-1]

sampled_df["text"] = sampled_df["text"].apply(lambda x:tweeter(x))
sampled_df.head()

Unnamed: 0,target,text
0,1.0,got everyth set need wait hour minut
1,1.0,see juli saw palladium worcest like month ago ...
2,1.0,realli today better good realli hot much coole...
3,1.0,join lvatt fun gotta problem yet ph
4,1.0,watch lita wwe diva tribut omg favorit diva lo...


In [4]:
tf = TfidfVectorizer(min_df=0.001)

X = tf.fit_transform(sampled_df['text']).toarray()
y = sampled_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [5]:
from tensorflow.keras.preprocessing import sequence
print("init model")
model = Sequential()
print("initiated")
print("Adding SimpleRNN layer")
model.add(SimpleRNN(128, input_shape=(1, X_train.shape[2]), activation='relu'))
print("Added")
print("Adding Dense layer")
model.add(Dense(1, activation='sigmoid')) 
print("Added")
print("compiling")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("Compiled")

print("Training")
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=4, validation_data=(X_test, y_test))

# Save the model and vectorizer
model.save('model/rnn_model.h5')

init model
initiated
Adding SimpleRNN layer


2024-07-09 09:52:01.032376: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Added
Adding Dense layer
Added
compiling
Compiled
Training
Epoch 1/10
    1/40000 [..............................] - ETA: 9:59:05 - loss: 0.6752 - accuracy: 1.0000

2024-07-09 09:52:02.857868: W tensorflow/core/grappler/utils/graph_view.cc:849] No registered '' OpKernel for CPU devices compatible with node {{node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu}}
	.  Registered:  <no registered kernels>

2024-07-09 09:52:02.879297: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-toposort,tfg-shape-inference{graph-version=0},tfg-prepare-attrs-export)} failed: INVALID_ARGUMENT: Node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu has an empty op name
	when importing GraphDef to MLIR module in GrapplerHook
2024-07-09 09:52:02.902166: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-functional-to-region,tfg.func(tfg-cf-sink),tfg-region-to-functional{force-control-capture=true},tfg-lift-legacy-call,symbol-privatize{},symbol-dce,tfg-prepare-attrs-export)} failed:



2024-07-09 09:55:57.089218: W tensorflow/core/grappler/utils/graph_view.cc:849] No registered '' OpKernel for CPU devices compatible with node {{node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu}}
	.  Registered:  <no registered kernels>

2024-07-09 09:55:57.095764: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-toposort,tfg-shape-inference{graph-version=0},tfg-prepare-attrs-export)} failed: INVALID_ARGUMENT: Node sequential/simple_rnn/while/body/_1/sequential/simple_rnn/while/simple_rnn_cell/Relu has an empty op name
	when importing GraphDef to MLIR module in GrapplerHook
2024-07-09 09:55:57.103832: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-functional-to-region,tfg.func(tfg-cf-sink),tfg-region-to-functional{force-control-capture=true},tfg-lift-legacy-call,symbol-privatize{},symbol-dce,tfg-prepare-attrs-export)} failed:

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
model.save('model/rnn_model.h5')

In [8]:
def make_decision(predictions, threshold=0.5):
    label=[]
    for prediction in predictions:
        label.append([1]) if prediction > threshold else label.append([0])
    return label

In [9]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

y_pred = model.predict(X_test)
decisions = make_decision(y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, decisions))
print("Confusion Matrix:\n", confusion_matrix(y_test, decisions))
print("Classification Report:\n", classification_report(y_test, decisions))

Accuracy: 0.74395
Confusion Matrix:
 [[14644  5363]
 [ 4879 15114]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.73      0.74     20007
         1.0       0.74      0.76      0.75     19993

    accuracy                           0.74     40000
   macro avg       0.74      0.74      0.74     40000
weighted avg       0.74      0.74      0.74     40000

