# **GOAL**
We define a text classifier aimed at predicting users’ ideology.

To do so, we use GloVe pre-trained word embeddings and build a Bidirectional LSTM, training it on the ground truth of sample posts labeled with respect to their opinion on the controversy.

We will then use the model on the topics data to infer whether a user's opinion on the topic of interest is closer to pro-Trump or anti-Trump ideas

### **SET UP THE ENVIRONMENT**

In [1]:
pip install scikeras keras_tuner

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras>=3.2.0 (from scikeras)
  Downloading keras-3.4.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting namex (from keras>=3.2.0->scikeras)
  Downloading namex-0.0.8-py3-none-any.whl (5.8 kB)
Collecting optree (from keras>=3.2.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from keras.losses import sparse_categorical_crossentropy
from keras_tuner import HyperParameters, Objective, HyperModel, Hyperband, RandomSearch
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **LOAD AND PREPARE DATA**

In [11]:
file_path = '/content/drive/My Drive/Copia di polarized_reddit_posts_and_comments.csv'
data = pd.read_csv(file_path)

In [12]:
nan_count = data['content'].isna().sum()
print(f"Number of NaN values in 'content': {nan_count}")

Number of NaN values in 'content': 156


In [13]:
data.dropna(subset=['content'], inplace=True)  # Remove rows where 'content' is NaN

In [14]:
data['label'].value_counts(normalize=True)

label
0    0.639622
1    0.360378
Name: proportion, dtype: float64

### **RESAMPLE**

In [16]:
# Divide data for classes
data_majority = data[data.label == 0]
data_minority = data[data.label == 1]

# Under-sampling of the majority class
data_majority_downsampled = resample(data_majority,
                                     replace=False,   # Sample without replacement
                                     n_samples=len(data_minority),  # Match minority class size
                                     random_state=123) # for reproducibility

# Combines downsampled majority class with minority class
data_balanced = pd.concat([data_majority_downsampled, data_minority])

# Mixes data to avoid patterns in the training set
data_balanced = data_balanced.sample(frac=1).reset_index(drop=True)

In [17]:
data_balanced['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

### **TOKENISATION AND VECTORIZATION**

In [18]:
content_lengths = data_balanced['content'].apply(len)  # Compute length of each post/comment

mean_length = content_lengths.mean()

print(f"Lunghezza media: {mean_length}")

Lunghezza media: 188.14948734841778


In [19]:
# Tokenizer Settings
max_words = 20000  # max num of words in the vocabulary
max_len = int(round(mean_length + 50))  # max length of sequences

tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(data_balanced['content'])
sequences = tokenizer.texts_to_sequences(data_balanced['content'])
X = pad_sequences(sequences, maxlen=max_len)
y = data_balanced['label'].values

In [20]:
# Create a dictionary to maintain the word -> vector mapping
embeddings_index = {}

file_path = '/content/drive/My Drive/glove.6B.200d.txt'

# Load embedding vectors
with open(file_path, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [21]:
embedding_dim = 200

# Prepare embedding matrix
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector   # words not found in the vocabulary will be all-zeros

### **SPLIT THE DATASET**

In [22]:
# Split in training e test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further divide training in training and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

### **BUILD THE MODEL**

In [23]:
num_classes = 2

In [24]:
class LSTMBuilder(HyperModel):
    def __init__(self, max_words, max_len, num_classes):
        self.max_words = max_words
        self.max_len = max_len
        self.num_classes = num_classes

    def build(self, hp):
        input_layer = Input(shape=(self.max_len,))
        x = Embedding(self.max_words, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)

        # Dynamically define LSTM layers
        num_lstm_layers = hp.Int('num_lstm_layers', 2, 4)
        for i in range(num_lstm_layers):
            lstm_units = hp.Int(f'lstm_units_{i}', 64, 192, step=64)
            x = LSTM(units=lstm_units, return_sequences=(i < num_lstm_layers - 1))(x)
            x = BatchNormalization()(x)
            x = Dropout(rate=hp.Float('dropout_rate', 0.2, 0.3, step=0.1))(x)

        # Define Dense layers
        num_dense_layers = hp.Int('num_dense_layers', 1, 2)
        for j in range(num_dense_layers):
            dense_units = hp.Int(f'dense_units_{j}', 64, 128, step=32)
            x = Dense(dense_units, activation='relu')(x)
            x = BatchNormalization()(x)
            x = Dropout(rate=hp.Float('dropout_rate', 0.2, 0.3, step=0.1))(x)

        output = Dense(1, activation='sigmoid')(x)
        model = Model(inputs=input_layer, outputs=output)
        model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

        return model


### **TUNING**

In [25]:
tuner = RandomSearch(
    LSTMBuilder(max_words=max_words, max_len=max_len, num_classes=2),
    objective=Objective('val_accuracy', direction='max'),
    max_trials=2,
    executions_per_trial=1,
    directory='my_dir',
    project_name='lstm_tuning'
)

tuner.search_space_summary()

Search space summary
Default search space size: 6
num_lstm_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 4, 'step': 1, 'sampling': 'linear'}
lstm_units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 192, 'step': 64, 'sampling': 'linear'}
dropout_rate (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.3, 'step': 0.1, 'sampling': 'linear'}
lstm_units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 192, 'step': 64, 'sampling': 'linear'}
num_dense_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 2, 'step': 1, 'sampling': 'linear'}
dense_units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 128, 'step': 32, 'sampling': 'linear'}


In [26]:
def run_tuning(X_train, y_train, X_val, y_val):

    tuner.search(X_train, y_train,
                 validation_data=(X_val, y_val),
                 epochs=7,
                 batch_size=128)

    best_hyperparams = tuner.get_best_hyperparameters(num_trials=1)[0]
    best_model = tuner.get_best_models(num_models=1)[0]

    print("Optimal Hyperparameters Found:")
    num_lstm_layers = best_hyperparams.get('num_lstm_layers')
    for i in range(num_lstm_layers):
        print(f"LSTM layer {i}: {best_hyperparams.get(f'lstm_units_{i}')} units")

    num_dense_layers = best_hyperparams.get('num_dense_layers')
    for j in range(num_dense_layers):
        print(f"Dense layer {j}: {best_hyperparams.get(f'dense_units_{j}')} units")

    print("Best Model Architecture:")
    print(best_model.summary())

    return best_model

In [27]:
best_model = run_tuning(X_train, y_train, X_val, y_val)

Trial 2 Complete [00h 21m 58s]
val_accuracy: 0.7822142839431763

Best val_accuracy So Far: 0.7866407632827759
Total elapsed time: 00h 37m 15s
Optimal Hyperparameters Found:
LSTM layer 0: 128 units
LSTM layer 1: 128 units
LSTM layer 2: 64 units
Dense layer 0: 96 units
Dense layer 1: 64 units
Best Model Architecture:


  saveable.load_own_variables(weights_store.get(inner_path))


None


### **RETRAIN THE BEST MODEL FOUND ON THE WHOLE TRAINING SET**

In [28]:
# Reunion of training and val sets
X_train_full = np.concatenate([X_train, X_val], axis=0)
y_train_full = np.concatenate([y_train, y_val], axis=0)

In [31]:
checkpoint_filepath = '/content/drive/My Drive/new/model_lstm.keras'

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1)

In [32]:
best_model.fit(X_train_full, y_train_full,
                epochs=7,
                validation_split=0.2,
                batch_size=128,
                callbacks=[model_checkpoint_callback]
               )

Epoch 1/7
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.8075 - loss: 0.4131
Epoch 1: val_accuracy improved from -inf to 0.78939, saving model to /content/drive/My Drive/new/model_lstm.keras
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 61ms/step - accuracy: 0.8075 - loss: 0.4131 - val_accuracy: 0.7894 - val_loss: 0.4442
Epoch 2/7
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.8154 - loss: 0.3994
Epoch 2: val_accuracy did not improve from 0.78939
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 63ms/step - accuracy: 0.8154 - loss: 0.3994 - val_accuracy: 0.7814 - val_loss: 0.4535
Epoch 3/7
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.8223 - loss: 0.3872
Epoch 3: val_accuracy did not improve from 0.78939
[1m1943/1943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 61ms/step - accuracy: 0.8223 - loss:

<keras.src.callbacks.history.History at 0x7b1a9a9d9ed0>

### **EVALUATE THE MODEL**

In [33]:
loss, accuracy = best_model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

[1m2429/2429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 12ms/step - accuracy: 0.7890 - loss: 0.4699
Test Loss: 0.4694609045982361
Test Accuracy: 0.789342999458313


### **SAVE TOKENIZER AND EMBEDDING MATRIX**

In [34]:
import pickle

# Save tokenizer
tokenizer_path = '/content/drive/My Drive/new/tokenizer.pickle'
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save embedding matrix
embedding_matrix_path = '/content/drive/My Drive/new/embedding_matrix.npy'
np.save(embedding_matrix_path, embedding_matrix)