In [1]:
pip install tensorflow




In [2]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing necessary packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report


In [4]:
# Load the dataframe
data = pd.read_csv('Data/Preprocessed_data_.csv')

# Create a copy of the dataframe
pre_df = data.copy()

# Handling missing values in the 'text' column
pre_df['text'] = pre_df['text'].fillna('')

# Drop rows with empty strings in the 'text' column
pre_df = pre_df[pre_df['text'] != '']

# Reset the index of the DataFrame
pre_df.reset_index(drop=True, inplace=True)


In [5]:
pre_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55868 entries, 0 to 55867
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       55868 non-null  int64 
 1   text             55868 non-null  object
 2   original_text    55868 non-null  object
 3   listed_emotions  55868 non-null  object
 4   emotion_count    55868 non-null  int64 
 5   labels           55868 non-null  object
 6   encoded_labels   55868 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 3.0+ MB


In [6]:
# Tokenization
tokenizer = Tokenizer()
texts = pre_df['text']
tokenizer.fit_on_texts(texts)  # 'texts' represents the text data
num_words = len(tokenizer.word_index) + 1  # Add 1 for the reserved 0 index
embedding_dim = 100
sequence_lengths = [len(tokens) for tokens in tokenizer.texts_to_sequences(texts)]
max_sequence_length = max(sequence_lengths)
num_classes = len(set(pre_df['labels']))


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [8]:
# Vectorization
vectorizer = TfidfVectorizer()
text_vectorized = vectorizer.fit_transform(pre_df['text'].values.astype('U'))
svd = TruncatedSVD(n_components=100)
text_vectorized_svd = svd.fit_transform(text_vectorized)
vectorized_df = pd.DataFrame(text_vectorized_svd)
pre_df_vectorized = pd.concat([pre_df, vectorized_df], axis=1)


In [9]:
# Split the data
X = text_vectorized_svd
y = pre_df['encoded_labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


In [10]:
# Reshape the input data
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])


In [11]:
# One-hot encode the target variable
encoder = OneHotEncoder()
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1)).toarray()
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).toarray()


In [12]:
# Define the model architecture
model = Sequential()
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2, input_shape=(1, X_train.shape[2])))
model.add(Dense(units=num_classes, activation='softmax'))


In [13]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [14]:
# Train the model
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(X_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test, y_test_encoded),
          callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f0aa1f03d0>

In [15]:
# Evaluate the model
#y_pred = model.predict_classes(X_test)
#y_test_labels = np.argmax(y_test_encoded, axis=1)
#print(classification_report(y_test_labels, y_pred))


In [16]:
y_pred = model.predict(X_test)  
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)

print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

           0       0.48      0.33      0.39      6270
           1       0.39      0.17      0.24      6041
           2       0.52      0.80      0.63     10037

    accuracy                           0.50     22348
   macro avg       0.47      0.44      0.42     22348
weighted avg       0.48      0.50      0.46     22348



#### Fine Tuning the model

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define the model architecture with improvements
model = Sequential()
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2), input_shape=(1, X_train.shape[2])))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model with adjusted learning rate and optimizer
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with early stopping
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(X_train, y_train_encoded, epochs=40, batch_size=32, validation_data=(X_test, y_test_encoded),
          callbacks=[early_stopping])

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)
print(classification_report(y_test_labels, y_pred_labels))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
              precision    recall  f1-score   support

           0       0.49      0.32      0.39      6270
           1       0.41      0.46      0.43      6041
           2       0.61      0.69      0.64     10037

    accuracy                           0.52     22348
   macro avg       0.50      0.49      0.49     22348
weighted avg       0.52      0.52      0.52     22348

