In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
from sklearn.model_selection import train_test_split
from keras.layers import Bidirectional, Dropout
from keras.regularizers import l2
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, BatchNormalization
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [None]:
# Load the author-profiling dataset from the working directory.
df = pd.read_csv("author_profiling_v3.csv")

# Replace missing tweets with an empty string *before* casting to str.
# A bare astype(str) converts NaN into the literal text "nan", which the
# tokenizer downstream would treat as an ordinary word.
df['tweets_lemmatized'] = df['tweets_lemmatized'].fillna('').astype(str)

In [None]:
# Initialize the tokenizer
tokenizer = Tokenizer()

# The target values are in the columns 'ext', 'neu', 'agr', 'con', 'ope'
# (one column per trait, so y has five columns)
y = df[['ext', 'neu', 'agr', 'con', 'ope']].values

# Split the raw texts and targets into train and test sets.
# The raw texts get their own names so X_train / X_test only ever refer to
# the padded integer matrices that are fed to the model.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['tweets_lemmatized'], y, test_size=0.2, random_state=42
)

# Fit the tokenizer on the TRAINING tweets only, so no vocabulary
# information leaks in from the test set
tokenizer.fit_on_texts(texts_train)

# Transform the TRAINING and TEST tweets into sequences of integers
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

# max_sequence_length was used below without ever being defined in this
# notebook, so a fresh "Restart & Run All" failed with a NameError.
# Derive it from the training data: the 95th percentile of the training
# sequence lengths covers most documents while stopping a few very long
# ones from inflating every padded row. Tune the percentile if needed.
train_lengths = [len(seq) for seq in sequences_train]
max_sequence_length = max(1, int(np.percentile(train_lengths, 95)))

# Pad/truncate the TRAINING sequences so they all have the same length
X_train = pad_sequences(sequences_train, maxlen=max_sequence_length)

# Pad/truncate the TEST sequences to the same length as the training data
X_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

In [None]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs, and roll the model back to the weights of the best epoch.
# Without restore_best_weights=True the model keeps the weights of the final
# epoch (3 epochs past the best one), and those are what evaluate() and
# predict() would use afterwards.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define the BiLSTM model
model = Sequential()
# input_dim is vocabulary size + 1 because Tokenizer indices start at 1 and
# index 0 is reserved (it is the padding value).
# input_length is read from the padded training matrix so the model always
# matches the data it is trained on.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=X_train.shape[1]))
# First BiLSTM returns the full sequence so it can feed the second BiLSTM
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# Second BiLSTM collapses the sequence into a single vector per sample
model.add(Bidirectional(LSTM(128)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# Five linear outputs, one per target column, with L2 weight regularization
model.add(Dense(5, kernel_regularizer=l2(0.03)))

# Compile the model (regression: mean squared error loss)
model.compile(loss='mean_squared_error', optimizer=Adam())

# Train, holding out 20% of the training data for validation so that
# early stopping can monitor val_loss
model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model (loss = MSE) on the held-out test set
loss = model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [None]:
# Training-set diagnostics: per-trait MAE, MSE, RMSE and R^2, used to judge
# whether the model is overfitting or underfitting
predictions = model.predict(X_train)

trait_names = ['ext', 'neu', 'agr', 'con', 'ope']

# Walk through the traits by name; idx selects the matching output column
for idx, trait in enumerate(trait_names):
    actual = y_train[:, idx]
    fitted = predictions[:, idx]

    # RMSE is derived from MSE, so compute MSE once and reuse it
    mse = mean_squared_error(actual, fitted)
    report = [
        f"{trait}:",
        f"  MAE:  {mean_absolute_error(actual, fitted):.4f}",
        f"  MSE:  {mse:.4f}",
        f"  RMSE: {np.sqrt(mse):.4f}",
        f"  R^2:  {r2_score(actual, fitted):.4f}",
    ]

    # One print per trait; the emitted lines are the same five as before
    print("\n".join(report))


ext:
  MAE:  0.1011
  MSE:  0.0180
  RMSE: 0.1341
  R^2:  0.3160
neu:
  MAE:  0.1153
  MSE:  0.0251
  RMSE: 0.1585
  R^2:  0.5231
agr:
  MAE:  0.0862
  MSE:  0.0138
  RMSE: 0.1176
  R^2:  0.4161
con:
  MAE:  0.0864
  MSE:  0.0132
  RMSE: 0.1150
  R^2:  0.4185
ope:
  MAE:  0.0834
  MSE:  0.0123
  RMSE: 0.1108
  R^2:  0.4823


In [None]:
# Test-set evaluation: predict on the held-out data and report per-trait
# MAE, MSE, RMSE and R^2
predictions = model.predict(X_test)

trait_names = ['ext', 'neu', 'agr', 'con', 'ope']

# Walk through the traits by name; idx selects the matching output column
for idx, trait in enumerate(trait_names):
    actual = y_test[:, idx]
    predicted = predictions[:, idx]

    # RMSE is derived from MSE, so compute MSE once and reuse it
    mse = mean_squared_error(actual, predicted)
    report = [
        f"{trait}:",
        f"  MAE:  {mean_absolute_error(actual, predicted):.4f}",
        f"  MSE:  {mse:.4f}",
        f"  RMSE: {np.sqrt(mse):.4f}",
        f"  R^2:  {r2_score(actual, predicted):.4f}",
    ]

    # One print per trait; the emitted lines are the same five as before
    print("\n".join(report))

ext:
  MAE:  0.1357
  MSE:  0.0308
  RMSE: 0.1756
  R^2:  -0.1700
neu:
  MAE:  0.1818
  MSE:  0.0534
  RMSE: 0.2311
  R^2:  -0.0406
agr:
  MAE:  0.1226
  MSE:  0.0260
  RMSE: 0.1614
  R^2:  -0.0821
con:
  MAE:  0.1189
  MSE:  0.0224
  RMSE: 0.1498
  R^2:  -0.0023
ope:
  MAE:  0.1232
  MSE:  0.0236
  RMSE: 0.1537
  R^2:  0.0047


In [None]:
# Wrap the prediction array in a DataFrame with one named column per trait
predictions_df = pd.DataFrame(data=predictions, columns=trait_names)

# Pairwise Pearson correlation between the predicted traits
# (Pearson is also what .corr() uses when no method is given)
correlation_matrix = predictions_df.corr(method='pearson')

# Show the correlation matrix of the predictions
print(correlation_matrix)

          ext       neu       agr       con       ope
ext  1.000000  0.395092  0.234762  0.188441 -0.139757
neu  0.395092  1.000000  0.309274 -0.179619 -0.141880
agr  0.234762  0.309274  1.000000  0.100772 -0.078458
con  0.188441 -0.179619  0.100772  1.000000  0.204890
ope -0.139757 -0.141880 -0.078458  0.204890  1.000000


In [None]:
# Pairwise correlation between the five trait columns of the full dataset,
# for comparison with the correlation structure of the model's predictions
trait_columns = ['ext', 'neu', 'agr', 'con', 'ope']
correlation_matrix = df[trait_columns].corr()
print(correlation_matrix)

          ext       neu       agr       con       ope
ext  1.000000  0.294476  0.145334  0.192219  0.020805
neu  0.294476  1.000000  0.325530  0.021377 -0.029465
agr  0.145334  0.325530  1.000000  0.070499 -0.004108
con  0.192219  0.021377  0.070499  1.000000  0.071473
ope  0.020805 -0.029465 -0.004108  0.071473  1.000000
