In [1]:
!pip install -r requirements.txt



In [2]:
# Importing packages used in the code

import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score

### Pretreatment of the data

* We start by loading the data from a CSV and display the informations of the dataset to understand its structure. 
* Then, we define a function of pretreatment to clean tweets by removing special characters, URLs, mentions, tagging and hashtags, and deleting the stopwords.


In [3]:
# Importing data
current_dir = os.getcwd()
dataset_name = "fifa_world_cup_2022_tweets.csv"
file_path = os.path.join(current_dir, dataset_name)

dataset = pd.read_csv(file_path)
print(dataset.head())

   Unnamed: 0               Date Created  Number of Likes  \
0           0  2022-11-20 23:59:21+00:00                4   
1           1  2022-11-20 23:59:01+00:00                3   
2           2  2022-11-20 23:58:41+00:00                1   
3           3  2022-11-20 23:58:33+00:00                1   
4           4  2022-11-20 23:58:28+00:00                0   

       Source of Tweet                                              Tweet  \
0      Twitter Web App  What are we drinking today @TucanTribe \n@MadB...   
1   Twitter for iPhone  Amazing @CanadaSoccerEN  #WorldCup2022 launch ...   
2   Twitter for iPhone  Worth reading while watching #WorldCup2022 htt...   
3      Twitter Web App  Golden Maknae shinning bright\n\nhttps://t.co/...   
4  Twitter for Android  If the BBC cares so much about human rights, h...   

  Sentiment  
0   neutral  
1  positive  
2  positive  
3  positive  
4  negative  


In [4]:
# Keeping the tweets and sentiment columns only
dataset = dataset[['Tweet','Sentiment']]
dataset

Unnamed: 0,Tweet,Sentiment
0,What are we drinking today @TucanTribe \n@MadB...,neutral
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,Worth reading while watching #WorldCup2022 htt...,positive
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,"If the BBC cares so much about human rights, h...",negative
...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral
22521,Great thread to read before the start of #Worl...,positive
22522,Raphinha wants Brazil to be united at the #Wor...,positive


In [5]:
# Downloading the stopwords from NLTK 
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess and clean the tweets
def preprocess_tweet(tweet):
    # Replacing the linebreaks by spaces
    tweet = re.sub(r'\n', ' ', tweet)
    # Replace carriage returns with spaces
    tweet = re.sub(r'\r', ' ', tweet)
    # Removing urls
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    # Removing mentions (@)
    tweet = re.sub(r'@\w+', '', tweet)
    # Removing hashtags
    tweet = re.sub(r'#\w+', '', tweet)
    # Removing special characters and keeping only the alphanumericals and spaces
    tweet = ''.join([char for char in tweet if char.isalnum() or char.isspace()])
    # Convert the tweets in lowercase
    tweet = tweet.lower()
    # Removing stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    return tweet

# Apply pretreatment to each tweet
tweets_df = dataset['Tweet'].apply(preprocess_tweet)

# Print pretreated tweets
print(tweets_df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\flavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                                           drinking today
1        amazing launch video shows much face canada me...
2                                   worth reading watching
3                            golden maknae shinning bright
4        bbc cares much human rights homosexual rights ...
                               ...                        
22519                                    go world cup 2022
22520    anderlecht confirms former viborg ffs jesper f...
22521                              great thread read start
22522                         raphinha wants brazil united
22523    buy sot pinksale confused buy tokens pinksale ...
Name: Tweet, Length: 22524, dtype: object


### Tokenization and preparation of sequences

* We use Tokenizer from Keras to transform our tweets into sequences of tokens. The size of the vocabulary is increased to 1000 words. 
* The sequences are then filled (padding) with a fixed length of 120 tokens.

In [6]:
# Tokenisation : Transforming the tweets into sequences of tokens
tokenizer = Tokenizer(num_words=10000)  
tokenizer.fit_on_texts(tweets_df)
sequences = tokenizer.texts_to_sequences(tweets_df)

# Padding sequences : Normalizing the sequences to a fixed length of 60
padded_sequences = pad_sequences(sequences, maxlen=60)  # Increases the max length of sequences

# Converting sentiment labels into numerical values
dataset['Sentiment'] = dataset['Sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
y = dataset['Sentiment'].values
print(y)

[1 2 2 ... 2 2 1]


In [7]:
print(dataset['Sentiment'].unique())

[1 2 0]


### Definition and training of the LSTM model

* We define a LSTM bidirectional model with dropout layers to adjust the data and avoid overfitting.
* The model is compiled with the loss sparse_categorical_crossentropy and optimizer Adam.
* We train our model on the train data and then validate the model on the test data.

In [8]:
# Define a class for the customized LSTM model
class CustomLSTMModel:
    def __init__(self, vocab_size, embedding_dim, input_length, hidden_dim, output_size):
        # Initializing the sequential model
        self.model = Sequential()
        # Adding a layer of embedding
        self.model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
        # Adding a SpatialDropout to regularize embeddings
        self.model.add(SpatialDropout1D(0.3)) 
        # Adding a layer of bidirectional LSTM with dropout
        self.model.add(Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)))
        # Adding a layer of LSTM with dropout
        self.model.add(LSTM(hidden_dim, dropout=0.3, recurrent_dropout=0.3))
        # Adding a dense layer with softmax activation for classification
        self.model.add(Dense(output_size, activation='softmax'))
        
        # Compiling the model with the loss 'sparse_categorical_crossentropy' and optimizer Adam
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    # Function to train the model 
    def fit(self, X_train, y_train, validation_data, epochs=10, batch_size=64): 
        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=validation_data, verbose=1)
        return history

    # Function to evaluate the model
    def evaluate(self, X_test, y_test):
        loss, accuracy = self.model.evaluate(X_test, y_test, verbose=1)
        return loss, accuracy

    # Function to predict classes of the new data
    def predict(self, X_test):
        return self.model.predict(X_test)

# Making sure all the data are numpy arrays
X = np.array(padded_sequences)
y = np.array(y)

# Splitting the data into training data and testing data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

In [9]:
# Initializing and training the custom LSTM model

# Size of vocab adjusted
vocab_size = 10000 

# Dimension of embeddings 
embedding_dim = 50 #128 #50 

# Max length for sequences
input_length = 60 #120 #60

# Number of neurones in the hidden LSTM layer
hidden_dim = 32 #128 #32

# For 3 classes : negative, neutral, positive
output_size = 3  

# Number of epochs for training
epochs= 4 #10 #4

# Size of batches
batch_size = 16 #64 #16


custom_lstm_model = CustomLSTMModel(vocab_size, embedding_dim, input_length, hidden_dim, output_size)
custom_lstm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2a7e278ff90>

Interpretation :

* The loss in training decreases well, reaching 0.4151, and the accuracy of training increases to 83.39%.
* The loss in validation keeps rising slighlty, reaching 0.7056, which reinforces the overfitting indicator.
* The validation accuracy stays stable at around 70.72%, indicating that despite the increase in performance on the training data, the model does not significatively improve on the validation data.

### Evaluation and Prediction

* We evaluate the model on the test data to obtain the loss and precision.
* We predict the sentiments in the test tweets and display a classification report in order to evaluate the model performance.
* For comparaison, we also train a logistic regression model and compare its performances with our custom LSTM model.


In [12]:
# Evaluating the model
loss, accuracy = custom_lstm_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss} | Test Accuracy: {accuracy}')

# Prediction and displaying the classification report
y_pred = np.argmax(custom_lstm_model.predict(X_test), axis=-1)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

# Logistic regression model to use as comparison
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
x_test_scaled = scaler.transform(X_test)

# Training the logistic regression model
logistic_model = LogisticRegression(max_iter=500)
logistic_model.fit(x_train_scaled, y_train)

# Predicting and evaluating 
y_pred_logistic = logistic_model.predict(x_test_scaled)
print("----------------------------------------------------------------------------")
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))
print(classification_report(y_test, y_pred_logistic, target_names=['negative', 'neutral', 'positive']))


Test Loss: 0.7056096792221069 | Test Accuracy: 0.7072141766548157
Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.70      0.71      1149
     neutral       0.65      0.68      0.67      1648
    positive       0.75      0.74      0.74      1708

    accuracy                           0.71      4505
   macro avg       0.71      0.71      0.71      4505
weighted avg       0.71      0.71      0.71      4505

----------------------------------------------------------------------------
Logistic Regression Accuracy: 0.3849056603773585
              precision    recall  f1-score   support

    negative       0.24      0.03      0.05      1149
     neutral       0.37      0.28      0.32      1648
    positive       0.40      0.73      0.51      1708

    accuracy                           0.38      4505
   macro avg       0.34      0.34      0.29      4505
weighted avg       0.35      0.38      0.32      4505



### Interpretation

<u>LSTM model</u>:

* The accuracy of the test of 70.72% is coherent with the validation accuracy observed during training, suggesting that the model generalizes well our test data.
  * Test Loss : 70.56%
  * Test Accuracy : 70.72%

* The 'positive' class seems to be more precise overall.

<u>Logistic regression model</u>:

* The accuracy of the logistic regression model is inferior to the accuracy of the LSTM model, indicating that the logistic regression seems to struggle capturing the complex relationships between the tweets and sentiments.

* The performance of the logistic regression model is especially poor for the 'negative' class, with a recall of only 3%. These results suggest that the logistic regression model cannot capture the specificities of the negative tweets.


The accuracy of the linear regression model is extremely inferior to the accuracy of the custom LSTM model, indicating again that the logistic regression struggles to capture the relation between the tweets and sentiments.
* Test Accuracy Logistic Regression: 38.49%
* Test Accuracy LSTM model: 70.72%