In [None]:
!pip install lightgbm



In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [3]:
df = pd.read_csv('cleaned_mbti_parallel.csv')
df['clean_posts'] = df['POST'].apply(clean_text)

In [4]:
# TF-IDF vectorized text
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['clean_posts'])

In [6]:
# Label encoder
le = LabelEncoder()
y = le.fit_transform(df['MBTI'])

In [7]:
# Divide the dataset
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [8]:
# Logistic Regression as baseline model
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr, target_names=le.classes_))

🔹 Logistic Regression Accuracy: 0.10941427351110182
              precision    recall  f1-score   support

        ENFJ       0.03      0.15      0.04      1694
        ENFP       0.15      0.14      0.14      9193
        ENTJ       0.05      0.14      0.07      3732
        ENTP       0.21      0.08      0.12     16266
        ESFJ       0.00      0.16      0.00       185
        ESFP       0.01      0.22      0.02       496
        ESTJ       0.01      0.23      0.01       406
        ESTP       0.02      0.20      0.03       858
        INFJ       0.31      0.12      0.18     21384
        INFP       0.25      0.14      0.18     18077
        INTJ       0.39      0.10      0.16     37718
        INTP       0.44      0.10      0.16     44851
        ISFJ       0.01      0.12      0.02       715
        ISFP       0.01      0.14      0.03       899
        ISTJ       0.03      0.20      0.04      1714
        ISTP       0.05      0.09      0.06      4533

    accuracy                

In [9]:
import time

# LightGBM replaces Random Forest
lgbm = LGBMClassifier(
    objective='multiclass',
    num_class=16,
    learning_rate=0.1,
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Record training time
start_train = time.time()
lgbm.fit(X_train, y_train)
end_train = time.time()
train_time = end_train - start_train

# Record Predict time
start_pred = time.time()
y_pred_lgbm = lgbm.predict(X_val)
end_pred = time.time()
pred_time = end_pred - start_pred

# Result and training time
print("🔹 LightGBM Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print(classification_report(y_val, y_pred_lgbm, target_names=le.classes_))
print(f" Training time: {train_time:.4f} seconds")
print(f" Prediction time: {pred_time:.4f} seconds")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 14.177881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 650881, number of used features: 1000
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.



🔹 LightGBM Accuracy: 0.17038366283393047
              precision    recall  f1-score   support

        ENFJ       0.02      0.09      0.03      1694
        ENFP       0.13      0.16      0.14      9193
        ENTJ       0.04      0.13      0.07      3732
        ENTP       0.20      0.14      0.16     16266
        ESFJ       0.00      0.08      0.01       185
        ESFP       0.01      0.16      0.02       496
        ESTJ       0.01      0.13      0.02       406
        ESTP       0.01      0.12      0.03       858
        INFJ       0.28      0.18      0.22     21384
        INFP       0.23      0.21      0.22     18077
        INTJ       0.36      0.17      0.23     37718
        INTP       0.41      0.18      0.25     44851
        ISFJ       0.01      0.05      0.01       715
        ISFP       0.02      0.11      0.03       899
        ISTJ       0.03      0.20      0.05      1714
        ISTP       0.05      0.09      0.06      4533

    accuracy                           