In [3]:
import re

def clean_text(text):
    text = text.lower()  # 全部小写
    text = re.sub(r"http\S+|www\S+", "", text)  # 移除链接
    text = re.sub(r"<.*?>", "", text)           # 移除 HTML 标签
    text = re.sub(r"[^a-z\s]", "", text)        # 只保留英文字母和空格
    text = re.sub(r"\s+", " ", text).strip()    # 多余空格合并
    return text


In [5]:
import pandas as pd
df = pd.read_csv('MBTI_500.csv')
df['clean_posts'] = df['posts'].apply(clean_text)

In [7]:
# TF-IDF 向量化文本
from sklearn.feature_extraction.text import TfidfVectorizer
X_tfidf = TfidfVectorizer().fit_transform(df['clean_posts'])

In [8]:
# 标签编码
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['type'])  # 将 16 个 MBTI 类型转为 0-15 的整数标签

In [9]:
#  划分数据集
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [10]:
# 训练模型 + 评估（LR 和 RF）
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr, target_names=le.classes_))

# Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

print("🔹 Random Forest Accuracy:", accuracy_score(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf, target_names=le.classes_))


🔹 Logistic Regression Accuracy: 0.8302536061091732
              precision    recall  f1-score   support

        ENFJ       0.68      0.78      0.73       319
        ENFP       0.77      0.84      0.80      1249
        ENTJ       0.75      0.88      0.81       577
        ENTP       0.84      0.83      0.84      2324
        ESFJ       0.59      0.70      0.64        33
        ESFP       0.64      0.69      0.67        75
        ESTJ       0.82      0.90      0.86       105
        ESTP       0.84      0.93      0.88       398
        INFJ       0.85      0.80      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.87      0.83      0.85      4531
        INTP       0.89      0.83      0.86      5033
        ISFJ       0.55      0.81      0.65       132
        ISFP       0.53      0.78      0.63       161
        ISTJ       0.56      0.85      0.68       253
        ISTP       0.79      0.87      0.83       679

    accuracy                 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
