In [11]:
import re
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
class LogisticRegressionModel:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.model = LogisticRegression(max_iter=1000, class_weight='balanced')
        self.le = LabelEncoder()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"<.*?>", "", text)
        text = re.sub(r"[^a-z\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def load_and_prepare_data(self):
        self.df = pd.read_csv(self.csv_path)
        self.df['clean_posts'] = self.df['posts'].apply(self.clean_text)

        X_tfidf = TfidfVectorizer().fit_transform(self.df['clean_posts'])
        y = self.le.fit_transform(self.df['type'])

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_tfidf, y, test_size=0.2, random_state=42)

    def train_and_evaluate(self):
        start_train = time.time()
        self.model.fit(self.X_train, self.y_train)
        train_time = time.time() - start_train

        start_pred = time.time()
        y_pred = self.model.predict(self.X_val)
        pred_time = time.time() - start_pred

        acc = accuracy_score(self.y_val, y_pred)
        report = classification_report(self.y_val, y_pred, target_names=self.le.classes_)

        print("Logistic Regression Accuracy:", acc)
        print(report)
        print(f"Training time: {train_time:.4f} seconds")
        print(f"Prediction time: {pred_time:.4f} seconds")

In [None]:
model = LogisticRegressionModel('MBTI_500.csv')
model.load_and_prepare_data()
model.train_and_evaluate()

Logistic Regression Accuracy: 0.8297822192891486
              precision    recall  f1-score   support

        ENFJ       0.68      0.78      0.73       319
        ENFP       0.76      0.83      0.80      1249
        ENTJ       0.75      0.88      0.81       577
        ENTP       0.84      0.83      0.84      2324
        ESFJ       0.59      0.70      0.64        33
        ESFP       0.63      0.69      0.66        75
        ESTJ       0.83      0.89      0.86       105
        ESTP       0.83      0.93      0.88       398
        INFJ       0.85      0.81      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.87      0.83      0.85      4531
        INTP       0.89      0.83      0.86      5033
        ISFJ       0.55      0.81      0.66       132
        ISFP       0.53      0.78      0.63       161
        ISTJ       0.57      0.84      0.68       253
        ISTP       0.79      0.87      0.83       679

    accuracy                   

In [12]:
class RandomForestModel:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
        self.le = LabelEncoder()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r"http\S+|www\S+", "", text)
        text = re.sub(r"<.*?>", "", text)
        text = re.sub(r"[^a-z\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def load_and_prepare_data(self):
        self.df = pd.read_csv(self.csv_path)
        self.df['clean_posts'] = self.df['posts'].apply(self.clean_text)

        X_tfidf = TfidfVectorizer().fit_transform(self.df['clean_posts'])
        y = self.le.fit_transform(self.df['type'])

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X_tfidf, y, test_size=0.2, random_state=42)

    def train_and_evaluate(self):
        start_train = time.time()
        self.model.fit(self.X_train, self.y_train)
        train_time = time.time() - start_train

        start_pred = time.time()
        y_pred = self.model.predict(self.X_val)
        pred_time = time.time() - start_pred

        acc = accuracy_score(self.y_val, y_pred)
        report = classification_report(self.y_val, y_pred, target_names=self.le.classes_)

        print("Random Forest Accuracy:", acc)
        print(report)
        print(f"Training time: {train_time:.4f} seconds")
        print(f"Prediction time: {pred_time:.4f} seconds")

In [13]:
model = RandomForestModel('MBTI_500.csv')
model.load_and_prepare_data()
model.train_and_evaluate()

Random Forest Accuracy: 0.5231450928632035
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00       319
        ENFP       0.85      0.10      0.18      1249
        ENTJ       1.00      0.16      0.27       577
        ENTP       0.78      0.23      0.35      2324
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.00      0.00      0.00        75
        ESTJ       0.92      0.67      0.77       105
        ESTP       0.94      0.68      0.79       398
        INFJ       0.57      0.55      0.56      2954
        INFP       0.74      0.33      0.46      2391
        INTJ       0.52      0.70      0.60      4531
        INTP       0.44      0.87      0.59      5033
        ISFJ       0.00      0.00      0.00       132
        ISFP       0.00      0.00      0.00       161
        ISTJ       0.00      0.00      0.00       253
        ISTP       1.00      0.02      0.03       679

    accuracy                         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
