In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

2024-07-08 12:26:23.230544: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/benjamin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv(
    '/home/benjamin/Documents/OpenClassroomsDatasets/sentiment/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding = "ISO-8859-1",
    names=["target", "id", "date", "flag", "user", "text"]
)
data = data.drop(columns=["id", "date", "flag", "user"])

data.target = data.target.map(
    {
        0: 0.0, #Negative
        2: 0.0, #Neutral
        4: 1.0, #Positive
    }
)

def sample_equal_classes(df, n_pos=100000, n_neg=100000):
    df_pos = df[df["target"] == 1.0].sample(n=n_pos)
    df_neg = df[df["target"] == 0.0].sample(n=n_neg)
    
    return pd.concat([df_pos, df_neg]).reset_index(drop=True)
sampled_df = sample_equal_classes(data)

In [3]:
def tweeter(sentence):
    stemmer = PorterStemmer()
    tk = TweetTokenizer(preserve_case=False, reduce_len=True)
    tok_sent = tk.tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    text = [stemmer.stem(word.lower()) 
            for word in tok_sent 
            if word not in stop_words
            and word.isalpha()==True]
    sent=""
    for word in text:
        sent+=word+" "
    return sent[:-1]

sampled_df["text"] = sampled_df["text"].apply(lambda x:tweeter(x))
sampled_df.head()

Unnamed: 0,target,text
0,1.0,bet ur missus
1,1.0,worri thing life dont pretend person
2,1.0,cant wait til sister get hous dead wii
3,1.0,hmm one seem get wear flatter way
4,1.0,use iphon


In [4]:
vectorizer = TfidfVectorizer(min_df=0.01)
X = vectorizer.fit_transform(sampled_df["text"]).toarray()
y = sampled_df["target"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train)
print(y_train)

# Create a simple neural network model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=4, validation_data=(X_test, y_test))

# Save the model and vectorizer
model.save('model/neural_network_model.h5')
pickle_out = open("model/tfidf_vectorizer.pkl","wb")
pickle.dump(vectorizer, pickle_out)
pickle_out.close()

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.44081815 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
153248    0.0
67802     1.0
148889    0.0
103093    0.0
104681    0.0
         ... 
119879    0.0
103694    0.0
131932    0.0
146867    0.0
121958    0.0
Name: target, Length: 160000, dtype: float64
Epoch 1/10


2024-07-08 12:27:30.564723: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
def make_decision(predictions, threshold=0.5):
    label=[]
    for prediction in predictions:
        label.append([1]) if prediction > threshold else label.append([0])
    return label

In [12]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

y_pred = model.predict(X_test)

decisions = make_decision(y_pred)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, decisions))
print("Confusion Matrix:\n", confusion_matrix(y_test, decisions))
print("Classification Report:\n", classification_report(y_test, decisions))

Accuracy: 0.6579
Confusion Matrix:
 [[10948  9059]
 [ 4625 15368]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.55      0.62     20007
         1.0       0.63      0.77      0.69     19993

    accuracy                           0.66     40000
   macro avg       0.67      0.66      0.65     40000
weighted avg       0.67      0.66      0.65     40000

