In [1]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [2]:
import pandas as pd
df = pd.read_csv('MBTI_500.csv')
df['clean_posts'] = df['posts'].apply(clean_text)

In [3]:
# TF-IDF vectorized text
from sklearn.feature_extraction.text import TfidfVectorizer
X_tfidf = TfidfVectorizer().fit_transform(df['clean_posts'])

In [4]:
# Label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['type'])

In [5]:
#  Divide dataset
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [6]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression
start_train = time.time()
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
end_train = time.time()
train_time_lr = end_train - start_train

start_pred = time.time()
y_pred_lr = lr.predict(X_val)
end_pred = time.time()
pred_time_lr = end_pred - start_pred

# Print results
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr, target_names=le.classes_))
print(f"LR Training time: {train_time_lr:.4f} seconds")
print(f"LR Prediction time: {pred_time_lr:.4f} seconds")

Logistic Regression Accuracy: 0.8297822192891486
              precision    recall  f1-score   support

        ENFJ       0.68      0.78      0.73       319
        ENFP       0.76      0.83      0.80      1249
        ENTJ       0.75      0.88      0.81       577
        ENTP       0.84      0.83      0.84      2324
        ESFJ       0.59      0.70      0.64        33
        ESFP       0.63      0.69      0.66        75
        ESTJ       0.83      0.89      0.86       105
        ESTP       0.83      0.93      0.88       398
        INFJ       0.85      0.81      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.87      0.83      0.85      4531
        INTP       0.89      0.83      0.86      5033
        ISFJ       0.55      0.81      0.66       132
        ISFP       0.53      0.78      0.63       161
        ISTJ       0.57      0.84      0.68       253
        ISTP       0.79      0.87      0.83       679

    accuracy                   

In [10]:
import time

# Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Record training time
start_train = time.time()
rf.fit(X_train, y_train)
end_train = time.time()
train_time_rf = end_train - start_train

# Record Predict time
start_pred = time.time()
y_pred_rf = rf.predict(X_val)
end_pred = time.time()
pred_time_rf = end_pred - start_pred

# Result and running time
print("Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf, target_names=le.classes_))
print(f"RF Training time: {train_time_rf:.4f} seconds")
print(f"RF Prediction time: {pred_time_rf:.4f} seconds")

Random Forest Accuracy: 0.5231450928632035
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       319
        ENFP       0.85      0.10      0.18      1249
        ENTJ       1.00      0.16      0.27       577
        ENTP       0.78      0.23      0.35      2324
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        75
        ESTJ       0.92      0.67      0.77       105
        ESTP       0.94      0.68      0.79       398
        INFJ       0.57      0.55      0.56      2954
        INFP       0.74      0.33      0.46      2391
        INTJ       0.52      0.70      0.60      4531
        INTP       0.44      0.87      0.59      5033
        ISFJ       0.00      0.00      0.00       132
        ISFP       0.00      0.00      0.00       161
        ISTJ       0.00      0.00      0.00       253
        ISTP       1.00      0.02      0.03       679

    accuracy                         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
