# MBTI Classifier

**0: Setup**

In [None]:

# Install the notebook's third-party dependencies. The `%pip` magic (rather than
# `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so results may drift between runs --
# consider pinning (e.g. `pkg==x.y.z`) for reproducibility.
# NOTE(review): wordcloud and seaborn are not imported in the import cell below;
# presumably they are used further down the notebook -- confirm or drop them.
%pip install -q pandas scikit-learn numpy matplotlib wordcloud joblib seaborn
# Optional (for the Streamlit app scaffold later):
%pip install -q streamlit

**1: Imports**

In [None]:
import os, re, json, math, textwrap, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from joblib import dump, load

# Single seed shared by every stochastic step, so that a fresh
# "Restart Kernel -> Run All" reproduces the same results.
RANDOM_STATE = 42
# Seed the stdlib RNG as well: the `random` module is imported by this notebook,
# and seeding NumPy alone leaves any `random.*` call non-reproducible.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)  # NumPy's legacy global RNG

**2: Data Loading**

Data is imported from Kaggle and read from a local copy, `mbti_1.csv`, which has two columns:

- type: MBTI string (e.g., `INTJ`)
- posts: long text with posts concatenated

In [None]:
# Load the MBTI dataset from a local CSV (used for training) and normalize it
# into a two-column frame:
#   type -- MBTI label, upper-cased and stripped of surrounding whitespace
#   text -- the text to classify (taken from 'text', or from 'posts' if absent)
# Rows with a missing label or missing text are dropped.

data_path = "mbti_1.csv"  # use local path

# Raise explicitly instead of using `assert`: assertions are removed when Python
# runs with -O, and FileNotFoundError is the conventional error for a missing
# input file. `isfile` also rejects a directory that happens to have this name.
if not os.path.isfile(data_path):
    raise FileNotFoundError(f"CSV not found at {data_path}. Put your dataset there or update the path.")

df = pd.read_csv(data_path)
# normalize column names
df.columns = [c.strip().lower() for c in df.columns]

# handle two common schemas: the text lives in either 'text' or 'posts'.
# Only rename 'posts' when 'text' is absent -- renaming unconditionally would
# create two 'text' columns if the file has both, and `df['text']` would then
# return a DataFrame instead of a Series.
if 'text' not in df.columns:
    if 'posts' not in df.columns:
        raise ValueError("Dataset must have a 'text' column (or 'posts' which will be renamed to 'text').")
    df = df.rename(columns={'posts': 'text'})

if 'type' not in df.columns:
    raise ValueError("Dataset must include MBTI label in a 'type' column for this skeleton.")

# Keep only the two columns used downstream; .copy() detaches the result from
# the raw frame so later column assignments do not trigger chained-assignment
# warnings.
df = df[['type', 'text']].dropna().copy()
# astype(str) keeps the .str accessor usable even if the label column was not
# parsed as strings.
df['type'] = df['type'].astype(str).str.upper().str.strip()

In [None]:
# Quick sanity check of the loaded frame: a two-row preview, the first rows of
# the label frequency table, and the total row count.

for _summary in (df.head(2), df['type'].value_counts().head()):
    print(_summary)
print("Rows:", df.shape[0])

**3: Data Analysis**

A brief look at the MBTI type distribution and the distribution of text lengths.

In [None]:

# Class balance: number of rows per MBTI type, most frequent first.
type_counts = df['type'].value_counts().sort_values(ascending=False)
ax = type_counts.plot.bar(figsize=(10, 3))
ax.set(title="MBTI type distribution", ylabel="Count")
plt.show()

# Spread of per-row text length, measured in characters. The length is stored
# on the frame as the 'len' column.
df['len'] = df['text'].astype(str).map(len)
ax = df['len'].plot.hist(bins=40, figsize=(10, 3))
ax.set(title="Text length distribution", xlabel="Characters")
plt.show()
