In [None]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, BatchNormalization
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
from sklearn.model_selection import train_test_split
from keras.layers import Bidirectional, Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from sklearn.model_selection import KFold

In [None]:
# Attach Google Drive to the Colab runtime at a fixed mount point so files
# stored on Drive become reachable through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame.
# The file is decoded as latin-1.
DATA_PATH = "/content/drive/MyDrive/Thesis Personality computation-Yucheng Chen/pandora_v4.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH, encoding='latin-1')


In [None]:
# Prepare the model inputs: split texts and trait scores into train/test
# partitions, then encode every text as a fixed-length sequence of token ids.

# Replace missing bodies with an empty string BEFORE casting to str.
# Casting NaN directly with astype(str) produces the literal text "nan",
# which the tokenizer would then learn as if it were a real word.
df['lemmatized_body'] = df['lemmatized_body'].fillna('').astype(str)

# Every encoded text is padded or truncated to exactly this many token ids.
max_sequence_length = 128

# Regression targets: one column per trait, in this fixed order.
y = df[['ext', 'neu', 'agr', 'con', 'ope']].values

# Split first, so the tokenizer vocabulary is built from training texts only
# and nothing about the test texts leaks into preprocessing.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['lemmatized_body'], y, test_size=0.3, random_state=42
)

# Fit the vocabulary on the TRAINING texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)

# Map each text to its sequence of integer word indices. No oov_token is
# configured, so words not seen while fitting are left out of the sequence.
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

# Pad (with zeros) or truncate so every row has max_sequence_length entries.
X_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
X_test = pad_sequences(sequences_test, maxlen=max_sequence_length)


In [None]:
# Build, train and evaluate a stacked bidirectional-LSTM regressor that
# predicts all trait scores at once from the padded token sequences.

model = Sequential()
# +1 on the vocabulary size: Tokenizer word indices start at 1, and index 0
# is the value pad_sequences uses for padding.
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_sequence_length))
# First recurrent layer returns the full sequence so a second one can be stacked.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# Second recurrent layer collapses the sequence to a single vector.
model.add(Bidirectional(LSTM(128)))
model.add(BatchNormalization())
model.add(Dropout(0.2))
# Linear output layer with one unit per target column (L2-regularised).
model.add(Dense(y_train.shape[1], kernel_regularizer=l2(0.05)))

# Mean squared error regression loss, Adam with its default settings.
model.compile(loss='mean_squared_error', optimizer=Adam())

# Single early-stopping definition (previously the callback was defined twice
# and only the second, patience=2, ever took effect). Training stops once
# val_loss has not improved for 2 consecutive epochs. restore_best_weights
# rolls the model back to the best epoch, so the evaluation and predictions
# below do not use the weights from the last, already-worse epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# 20% of the training data is held out as the validation set that
# early stopping monitors.
model.fit(X_train, y_train, epochs=10, validation_split=0.2, callbacks=[early_stopping])

# Loss (MSE) on the held-out test set.
loss = model.evaluate(X_test, y_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
# Training-set diagnostics: per-trait MAE, MSE, RMSE and R^2 on the data the
# model was fitted on, used to judge whether it is over- or underfitting.
predictions = model.predict(X_train)

trait_names = ['ext', 'neu', 'agr', 'con', 'ope']

# Column i of the target and prediction arrays corresponds to trait_names[i];
# transposing lets the loop walk both arrays one trait column at a time.
for i, (y_true, y_pred) in enumerate(zip(y_train.T, predictions.T)):
    # Error metrics for this trait
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of MSE
    r2 = r2_score(y_true, y_pred)

    # Emit the report for this trait, one item per line
    print(
        f"{trait_names[i]}:",
        f"  MAE:  {mae:.4f}",
        f"  MSE:  {mse:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  R^2:  {r2:.4f}",
        sep="\n",
    )

ext:
  MAE:  0.1364
  MSE:  0.0370
  RMSE: 0.1924
  R^2:  0.7129
neu:
  MAE:  0.1178
  MSE:  0.0307
  RMSE: 0.1751
  R^2:  0.6817
agr:
  MAE:  0.1097
  MSE:  0.0287
  RMSE: 0.1694
  R^2:  0.6985
con:
  MAE:  0.0944
  MSE:  0.0214
  RMSE: 0.1461
  R^2:  0.6943
ope:
  MAE:  0.1245
  MSE:  0.0250
  RMSE: 0.1583
  R^2:  0.5398


In [None]:
# Held-out diagnostics: per-trait MAE, MSE, RMSE and R^2 on the test set.
predictions = model.predict(X_test)

trait_names = ['ext', 'neu', 'agr', 'con', 'ope']

# Column i of the target and prediction arrays corresponds to trait_names[i];
# transposing lets the loop walk both arrays one trait column at a time.
for i, (y_true, y_pred) in enumerate(zip(y_test.T, predictions.T)):
    # Error metrics for this trait
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of MSE
    r2 = r2_score(y_true, y_pred)

    # Emit the report for this trait, one item per line
    print(
        f"{trait_names[i]}:",
        f"  MAE:  {mae:.4f}",
        f"  MSE:  {mse:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  R^2:  {r2:.4f}",
        sep="\n",
    )

ext:
  MAE:  0.2413
  MSE:  0.1033
  RMSE: 0.3214
  R^2:  0.1915
neu:
  MAE:  0.2295
  MSE:  0.0937
  RMSE: 0.3061
  R^2:  0.0090
agr:
  MAE:  0.2067
  MSE:  0.0811
  RMSE: 0.2849
  R^2:  0.1194
con:
  MAE:  0.1663
  MSE:  0.0573
  RMSE: 0.2395
  R^2:  0.1730
ope:
  MAE:  0.1686
  MSE:  0.0484
  RMSE: 0.2200
  R^2:  0.0960


In [None]:
# Pairwise correlations between the predicted trait scores.
# `predictions` holds the output of the most recent model.predict call and
# `trait_names` labels its columns; both come from an earlier cell.
predictions_df = pd.DataFrame(data=predictions, columns=trait_names)

# Pearson correlation (the pandas default) between every pair of columns.
correlation_matrix = predictions_df.corr(method='pearson')

# Show the matrix as plain text.
print(correlation_matrix)

          ext       neu       agr       con       ope
ext  1.000000 -0.586961 -0.475499  0.117677  0.326704
neu -0.586961  1.000000 -0.051451 -0.350518  0.003354
agr -0.475499 -0.051451  1.000000  0.416910 -0.177667
con  0.117677 -0.350518  0.416910  1.000000 -0.602274
ope  0.326704  0.003354 -0.177667 -0.602274  1.000000
