In [1]:
!pip install lightgbm



In [6]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [8]:
df = pd.read_csv('MBTI_500.csv')
df['clean_posts'] = df['posts'].apply(clean_text)

In [9]:
# TF-IDF vectorized text
vectorizer = TfidfVectorizer(max_features=1000)  # 限制维度以加快训练
X_tfidf = vectorizer.fit_transform(df['clean_posts'])

In [10]:
# Label encoder
le = LabelEncoder()
y = le.fit_transform(df['type'])

In [11]:
# Divide the dataset
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [12]:
# Logistic Regression as baseline model
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr, target_names=le.classes_))

🔹 Logistic Regression Accuracy: 0.7655321957198077
              precision    recall  f1-score   support

        ENFJ       0.45      0.72      0.55       319
        ENFP       0.73      0.80      0.76      1249
        ENTJ       0.55      0.79      0.65       577
        ENTP       0.80      0.76      0.78      2324
        ESFJ       0.24      0.64      0.35        33
        ESFP       0.31      0.63      0.42        75
        ESTJ       0.37      0.74      0.49       105
        ESTP       0.65      0.87      0.74       398
        INFJ       0.82      0.74      0.78      2954
        INFP       0.79      0.76      0.77      2391
        INTJ       0.86      0.76      0.81      4531
        INTP       0.88      0.77      0.82      5033
        ISFJ       0.38      0.80      0.51       132
        ISFP       0.37      0.75      0.49       161
        ISTJ       0.39      0.79      0.52       253
        ISTP       0.64      0.79      0.71       679

    accuracy                 

In [14]:
import time

# LightGBM replaces Random Forest
lgbm = LGBMClassifier(
    objective='multiclass',
    num_class=16,
    learning_rate=0.1,
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Record training time
start_train = time.time()
lgbm.fit(X_train, y_train)
end_train = time.time()
train_time = end_train - start_train

# Record Predict time
start_pred = time.time()
y_pred_lgbm = lgbm.predict(X_val)
end_pred = time.time()
pred_time = end_pred - start_pred

# Result and training time
print("🔹 LightGBM Accuracy:", accuracy_score(y_val, y_pred_lgbm))
print(classification_report(y_val, y_pred_lgbm, target_names=le.classes_))
print(f"🕒 Training time: {train_time:.4f} seconds")
print(f"🕒 Prediction time: {pred_time:.4f} seconds")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.968187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 84853, number of used features: 1000
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[LightGBM] [Info] Start training from score -2.772589
[Ligh



🔹 LightGBM Accuracy: 0.7710945601960969
              precision    recall  f1-score   support

        ENFJ       0.56      0.61      0.58       319
        ENFP       0.69      0.80      0.74      1249
        ENTJ       0.57      0.75      0.64       577
        ENTP       0.74      0.76      0.75      2324
        ESFJ       0.88      0.45      0.60        33
        ESFP       0.70      0.43      0.53        75
        ESTJ       0.80      0.53      0.64       105
        ESTP       0.73      0.78      0.75       398
        INFJ       0.79      0.76      0.78      2954
        INFP       0.75      0.77      0.76      2391
        INTJ       0.83      0.79      0.81      4531
        INTP       0.86      0.79      0.82      5033
        ISFJ       0.59      0.58      0.59       132
        ISFP       0.58      0.67      0.62       161
        ISTJ       0.65      0.70      0.68       253
        ISTP       0.64      0.75      0.69       679

    accuracy                           0