In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [3]:
# Charger le dataset
dataset = pd.read_csv("C:/Users/flavi/Downloads/fifa_world_cup_2022_tweets.csv")

# print dataset info to see how the dataset is structured
print(dataset[['Sentiment']])

      Sentiment
0       neutral
1      positive
2      positive
3      positive
4      negative
...         ...
22519  positive
22520   neutral
22521  positive
22522  positive
22523   neutral

[22524 rows x 1 columns]


In [3]:
# # separate tweets and sentiments
# tweets = dataset['Tweet']
# labels = dataset['Sentiment']
# print(tweets)

0        What are we drinking today @TucanTribe \n@MadB...
1        Amazing @CanadaSoccerEN  #WorldCup2022 launch ...
2        Worth reading while watching #WorldCup2022 htt...
3        Golden Maknae shinning bright\n\nhttps://t.co/...
4        If the BBC cares so much about human rights, h...
                               ...                        
22519              Here We go World cup 2022 #WorldCup2022
22520    Anderlecht confirms former Viborg FF's Jesper ...
22521    Great thread to read before the start of #Worl...
22522    Raphinha wants Brazil to be united at the #Wor...
22523    How to buy $SOT on PinkSale?🤔\n\nHave you been...
Name: Tweet, Length: 22524, dtype: object


In [4]:
# Preprocessing

# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Function to preprocess tweets
def preprocess_tweet(tweet):
    tweet = tweet.replace('\n', ' ')  # Replace newline characters with spaces
    tweet = tweet.replace('\r', ' ')  # Replace carriage returns with spaces
    tweet = tweet.replace('http', '')  # Remove URLs starting with 'http'
    tweet = tweet.replace('www', '')  # Remove URLs starting with 'www'
    tweet = tweet.replace('https', '')  # Remove URLs starting with 'https'
    tweet = tweet.replace('@', '')  # Remove '@' symbols
    tweet = tweet.replace('#', '')  # Remove '#' symbols
    tweet = ''.join([char for char in tweet if char.isalnum() or char.isspace()])  # Remove special characters
    tweet = tweet.lower()  # Convert to lowercase
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])  # Remove stopwords
    return tweet

tweets_df = dataset['Tweet'].apply(preprocess_tweet)
    

print(tweets_df)

0        drinking today tucantribe madbears lkincalgo a...
1        amazing canadasocceren worldcup2022 launch vid...
2        worth reading watching worldcup2022 stco1sqrna...
3        golden maknae shinning bright stco4ayzbzgtx4 j...
4        bbc cares much human rights homosexual rights ...
                               ...                        
22519                       go world cup 2022 worldcup2022
22520    anderlecht confirms former viborg ffs jesper f...
22521    great thread read start worldcup2022 stcovp62j...
22522    raphinha wants brazil united worldcup2022 stco...
22523    buy sot pinksale confused buy tokens pinksale ...
Name: Tweet, Length: 22524, dtype: object


In [5]:
# Tokenization

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets_df)
X = tokenizer.texts_to_sequences(tweets_df)

# Padding sequences
X = pad_sequences(X, maxlen=100)

# Convert sentiment labels to numerical values
dataset['Sentiment'] = dataset['Sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
y = dataset['Sentiment'].values

In [6]:
## Building the custom LSTM

class CustomLSTMModel:
    def __init__(self, vocab_size, embedding_dim, input_length, hidden_dim, output_size):
        self.model = Sequential()
        self.model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
        self.model.add(SpatialDropout1D(0.2))
        self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
        self.model.add(LSTM(hidden_dim, dropout=0.2, recurrent_dropout=0.2))
        self.model.add(Dense(output_size, activation='softmax'))
        
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    def fit(self, X_train, y_train, validation_data, epochs=5, batch_size=64):
        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data, verbose=1)
        return history

    def evaluate(self, X_test, y_test):
        loss, accuracy = self.model.evaluate(X_test, y_test, verbose=1)
        return loss, accuracy

    def predict(self, X_test):
        return self.model.predict(X_test)


# Ensure the data is a numpy array
X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the custom LSTM model
vocab_size = 5000  # This can be adjusted based on the tokenizer settings
embedding_dim = 128
input_length = 100
hidden_dim = 100
output_size = 3  # For three classes: negative, neutral, positive

custom_lstm_model = CustomLSTMModel(vocab_size, embedding_dim, input_length, hidden_dim, output_size)
custom_lstm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x236d6738150>

In [7]:
# EVALUATION

# Evaluate the model
loss, accuracy = custom_lstm_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss} | Test Accuracy: {accuracy}')

# Predict and print classification report
y_pred = np.argmax(custom_lstm_model.predict(X_test), axis=-1)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Test Loss: 0.7529652118682861 | Test Accuracy: 0.7147613763809204
Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.74      0.72      1149
     neutral       0.68      0.65      0.67      1648
    positive       0.75      0.76      0.75      1708

    accuracy                           0.71      4505
   macro avg       0.71      0.72      0.71      4505
weighted avg       0.71      0.71      0.71      4505



In [9]:
# PAS SUR QUON LE LAISSE MAIS PEUT ETRE INTERESSE POUR MONTRER QUON A TENTE PLUSIEURS MODELES
# 3. Model Building : simple regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


Accuracy: 0.39200887902330744
              precision    recall  f1-score   support

    negative       0.38      0.03      0.06      1149
     neutral       0.38      0.27      0.32      1648
    positive       0.40      0.75      0.52      1708

    accuracy                           0.39      4505
   macro avg       0.39      0.35      0.30      4505
weighted avg       0.39      0.39      0.33      4505



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
