In [1]:
import re
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
class LogisticRegressionOpt:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.le = LabelEncoder()
        self.model = LogisticRegression(
            solver='lbfgs',
            max_iter=300,
            tol=1e-4,
            class_weight='balanced',
            n_jobs=-1
        )

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"<.*?>", "", text)
        text = re.sub(r"[^a-z\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def load_and_prepare_data(self):
        self.df = pd.read_csv(self.csv_path)
        self.df['clean_posts'] = self.df['posts'].apply(self.clean_text)

        X_tfidf = TfidfVectorizer().fit_transform(self.df['clean_posts'])
        y = self.le.fit_transform(self.df['type'])

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_tfidf, y, test_size=0.2, random_state=42
        )

    def train_and_evaluate(self):
        start_train = time.time()
        self.model.fit(self.X_train, self.y_train)
        train_time = time.time() - start_train

        start_pred = time.time()
        y_pred = self.model.predict(self.X_val)
        pred_time = time.time() - start_pred

        acc = accuracy_score(self.y_val, y_pred)
        report = classification_report(self.y_val, y_pred, target_names=self.le.classes_)

        print("Logistic Regression Accuracy:", acc)
        print(report)
        print(f"Training time: {train_time:.4f} seconds")
        print(f"Prediction time: {pred_time:.4f} seconds")

In [3]:
model = LogisticRegressionOpt('MBTI_500.csv')
model.load_and_prepare_data()
model.train_and_evaluate()

Logistic Regression Accuracy: 0.8297822192891486
              precision    recall  f1-score   support

        ENFJ       0.68      0.78      0.73       319
        ENFP       0.76      0.83      0.80      1249
        ENTJ       0.75      0.88      0.81       577
        ENTP       0.84      0.83      0.84      2324
        ESFJ       0.59      0.70      0.64        33
        ESFP       0.63      0.69      0.66        75
        ESTJ       0.83      0.89      0.86       105
        ESTP       0.83      0.93      0.88       398
        INFJ       0.85      0.81      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.87      0.83      0.85      4531
        INTP       0.89      0.83      0.86      5033
        ISFJ       0.55      0.81      0.66       132
        ISFP       0.53      0.78      0.63       161
        ISTJ       0.57      0.84      0.68       253
        ISTP       0.79      0.87      0.83       679

    accuracy                   

In [4]:
import re
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, classification_report

In [5]:
class RandomForestOpt:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.le = LabelEncoder()
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=15,
            min_samples_leaf=2,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42
        )
        self.svd = TruncatedSVD(n_components=300, random_state=42)

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"<.*?>", "", text)
        text = re.sub(r"[^a-z\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def load_and_prepare_data(self):
        self.df = pd.read_csv(self.csv_path)
        self.df['clean_posts'] = self.df['posts'].apply(self.clean_text)

        X_tfidf = TfidfVectorizer().fit_transform(self.df['clean_posts'])
        y = self.le.fit_transform(self.df['type'])

        X_train, X_val, y_train, y_val = train_test_split(
            X_tfidf, y, test_size=0.2, random_state=42
        )

        self.X_train = self.svd.fit_transform(X_train)
        self.X_val = self.svd.transform(X_val)
        self.y_train = y_train
        self.y_val = y_val

    def train_and_evaluate(self):
        start_train = time.time()
        self.model.fit(self.X_train, self.y_train)
        train_time = time.time() - start_train

        start_pred = time.time()
        y_pred = self.model.predict(self.X_val)
        pred_time = time.time() - start_pred

        acc = accuracy_score(self.y_val, y_pred)
        report = classification_report(self.y_val, y_pred, target_names=self.le.classes_)

        print("Random Forest Accuracy:", acc)
        print(report)
        print(f"Training time: {train_time:.4f} seconds")
        print(f"Prediction time: {pred_time:.4f} seconds")

In [6]:
model = RandomForestOpt('MBTI_500.csv')
model.load_and_prepare_data()
model.train_and_evaluate()

Random Forest Accuracy: 0.7026491939285378
              precision    recall  f1-score   support

        ENFJ       0.71      0.38      0.50       319
        ENFP       0.59      0.76      0.67      1249
        ENTJ       0.68      0.72      0.70       577
        ENTP       0.71      0.65      0.67      2324
        ESFJ       0.50      0.12      0.20        33
        ESFP       0.80      0.21      0.34        75
        ESTJ       0.91      0.71      0.80       105
        ESTP       0.79      0.87      0.83       398
        INFJ       0.70      0.73      0.72      2954
        INFP       0.63      0.73      0.68      2391
        INTJ       0.73      0.73      0.73      4531
        INTP       0.75      0.71      0.73      5033
        ISFJ       0.81      0.33      0.46       132
        ISFP       0.51      0.40      0.45       161
        ISTJ       0.70      0.39      0.50       253
        ISTP       0.70      0.65      0.67       679

    accuracy                         