In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Importing necessary packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report


In [None]:
# Read the preprocessed dataset from disk
data = pd.read_csv('Data/preprocessed_data.csv')

# Build the working frame without modifying `data`:
#   1. missing values in 'text' become empty strings,
#   2. rows whose 'text' is the empty string are discarded,
#   3. the index is renumbered 0..n-1 so later positional joins line up.
pre_df = (
    data
    .assign(text=lambda frame: frame['text'].fillna(''))
    .loc[lambda frame: frame['text'] != '']
    .reset_index(drop=True)
)



In [None]:
# Sanity check after cleaning: print the columns, dtypes and non-null counts of pre_df
pre_df.info()

In [None]:
# Tokenization statistics for the text column.
# NOTE(review): the model cells visible in this notebook are trained on TF-IDF/SVD
# features, so of the values computed here only `num_classes` is consumed by them.
tokenizer = Tokenizer()
texts = pre_df['text']
tokenizer.fit_on_texts(texts)  # 'texts' represents the text data
num_words = len(tokenizer.word_index) + 1  # Add 1 for the reserved 0 index
embedding_dim = 100
sequence_lengths = [len(tokens) for tokens in tokenizer.texts_to_sequences(texts)]
# default=0 avoids a ValueError from max() when no rows survived cleaning
max_sequence_length = max(sequence_lengths, default=0)
# Count classes on the column that is actually used as the training target
# ('encoded_labels'), so the softmax width matches the one-hot encoded labels.
# nunique() ignores NaN, whereas len(set(...)) can count NaN entries repeatedly.
num_classes = pre_df['encoded_labels'].nunique()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Vectorization: TF-IDF weights followed by a 100-component truncated SVD.
# NOTE(review): both transformers are fitted on every row of pre_df, i.e. before
# the train/test split performed later, so test-set text influences the features.
vectorizer = TfidfVectorizer()
# astype(str) keeps an object-dtype column of Python strings; the previous
# .values.astype('U') built a fixed-width unicode array sized by the longest text,
# which can use far more memory than the texts themselves.
text_vectorized = vectorizer.fit_transform(pre_df['text'].astype(str))
# TruncatedSVD's default solver is randomized; pin the seed (same value as the
# train/test split) so the features are identical on every run.
svd = TruncatedSVD(n_components=100, random_state=42)
text_vectorized_svd = svd.fit_transform(text_vectorized)
vectorized_df = pd.DataFrame(text_vectorized_svd)
# Column-wise join; rows align because pre_df's index was reset to 0..n-1 on load
pre_df_vectorized = pd.concat([pre_df, vectorized_df], axis=1)


In [None]:
# Hold out 40% of the rows for testing; the seed makes the split repeatable.
# Features are the SVD-reduced TF-IDF vectors, targets the integer-encoded labels.
X, y = text_vectorized_svd, pre_df['encoded_labels']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)


In [None]:
# Reshape the input data to (samples, 1, features): one timestep per sample,
# matching the input_shape=(1, n_features) given to the LSTM layers.
# shape[-1] (instead of shape[1]) keeps this cell idempotent: re-running it on the
# already-reshaped arrays reproduces the same 3-D shape instead of raising.
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[-1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[-1])


In [None]:
# One-hot encode the target variable.
# The encoder is fitted on the full label column `y` rather than on y_train only,
# so both splits share one column layout and transforming the test labels cannot
# fail on a class that happens to be absent from the training split.
encoder = OneHotEncoder()
encoder.fit(y.values.reshape(-1, 1))
# .toarray() densifies the sparse encoder output for Keras
y_train_encoded = encoder.transform(y_train.values.reshape(-1, 1)).toarray()
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).toarray()


In [None]:
# Baseline network: a single 64-unit LSTM over the one-timestep input, followed by
# a softmax layer with one output per class.
model = Sequential([
    LSTM(
        units=64,
        dropout=0.2,
        recurrent_dropout=0.2,
        input_shape=(1, X_train.shape[2]),
    ),
    Dense(units=num_classes, activation='softmax'),
])


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot
# targets, and accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train the model.
# Early stopping monitors the validation loss and restores the best epoch's weights.
# The validation rows are taken from the training data (validation_split) instead of
# the test split, so the test set is not used to pick the stopping epoch and stays
# unseen until the evaluation cell.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
model.fit(
    X_train,
    y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
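# Evaluate the model -- disabled: Sequential.predict_classes was removed in TensorFlow 2.6; the next cell computes the same report via np.argmax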
#y_pred = model.predict_classes(X_test)
#y_test_labels = np.argmax(y_test_encoded, axis=1)
#print(classification_report(y_test_labels, y_pred))


In [None]:
# Predicted class = column with the highest softmax probability;
# true class = position of the 1 in the one-hot test labels.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)
y_test_labels = y_test_encoded.argmax(axis=1)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_test_labels, y_pred_labels))


#### Fine-tuning the model: bidirectional LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define the model architecture: a bidirectional LSTM (128 units per direction)
# over the one-timestep input, followed by a softmax layer with one output per class.
model = Sequential()
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2), input_shape=(1, X_train.shape[2])))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile with an explicit Adam optimizer. Note that 0.001 is also Adam's default
# learning rate, so this is the same setting as optimizer='adam'; change the value
# here to actually tune it.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with early stopping.
# Validation rows come from the training data (validation_split) rather than the
# test split, so the test set is not used to choose the restored weights.
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(
    X_train,
    y_train_encoded,
    epochs=40,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)
print(classification_report(y_test_labels, y_pred_labels))


#### Second fine-tuning attempt: class-weighted training

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Compute balanced class weights from the training labels.
# `classes` and `y` are keyword-only in scikit-learn's compute_class_weight.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)

# Keras expects {one-hot column index: weight}. Columns follow the encoder's
# category order, so look each weight up by label; a category without training
# rows (if any) keeps the neutral weight 1.0.
weight_by_label = dict(zip(train_classes, class_weights))
class_weights_dict = {
    column: float(weight_by_label.get(label, 1.0))
    for column, label in enumerate(encoder.categories_[0])
}

# Define the model architecture: bidirectional LSTM (128 units per direction)
# followed by a softmax layer with one output per class.
model = Sequential()
model.add(Bidirectional(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2), input_shape=(1, X_train.shape[2])))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model. Class weights are NOT a compile() option; they are passed
# to fit() below.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with early stopping and per-class loss weighting.
# Validation rows come from the training data (validation_split) rather than the
# test split, so the test set is not used to choose the restored weights.
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(
    X_train,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights_dict,
    callbacks=[early_stopping],
)

# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)
print(classification_report(y_test_labels, y_pred_labels))
