In [19]:
import pandas as pd
import re
import time
import matplotlib.pyplot as plt

In [20]:
# --- Start total timer ---
start_total = time.time()

# --- Load data ---
start = time.time()
df = pd.read_csv('mbti_full_pull.csv')  # ⬅️ Replace with actual path
original_count = len(df)
print(f"✅ Loaded dataset with {original_count} rows in {time.time() - start:.2f}s")

# --- Define MBTI types and pattern ---
mbti_types = [
    'INTJ', 'INTP', 'ENTJ', 'ENTP',
    'INFJ', 'INFP', 'ENFJ', 'ENFP',
    'ISTJ', 'ISFJ', 'ESTJ', 'ESFJ',
    'ISTP', 'ISFP', 'ESTP', 'ESFP'
]
mbti_pattern = r'\b(' + '|'.join(mbti_types) + r')\b'

# --- Step 1: Extract MBTI from flair ---
start = time.time()
def extract_mbti(text):
    match = re.findall(mbti_pattern, str(text).upper())
    return match[0] if match else None

df['MBTI'] = df['author_flair_text'].apply(extract_mbti)
before_mbti_filter = len(df)
df = df.dropna(subset=['MBTI'])
removed_no_mbti = before_mbti_filter - len(df)
print(f"❌ Removed invalid MBTI: {removed_no_mbti} rows in {time.time() - start:.2f}s")

# --- Step 2: Remove posts with MBTI mentions ---
start = time.time()
df['body'] = df['body'].astype(str)
before_mbti_in_post = len(df)
df = df[~df['body'].str.upper().str.contains(mbti_pattern)]
removed_mbti_mentions = before_mbti_in_post - len(df)
print(f"❌ Removed posts mentioning MBTI: {removed_mbti_mentions} rows in {time.time() - start:.2f}s")

# --- Step 3: Remove empty posts ---
start = time.time()
before_empty = len(df)
df = df[df['body'].str.strip() != '']
removed_empty = before_empty - len(df)
print(f"❌ Removed empty posts: {removed_empty} rows in {time.time() - start:.2f}s")

# --- Step 4: Remove short posts ---
start = time.time()
df['word_count'] = df['body'].apply(lambda x: len(x.strip().split()))
min_words = 10
before_short = len(df)
df = df[df['word_count'] >= min_words]
removed_short = before_short - len(df)
print(f"❌ Removed short posts (< {min_words} words): {removed_short} rows in {time.time() - start:.2f}s")

# --- Step 5: Remove duplicates ---
start = time.time()
before_duplicates = len(df)
df = df.drop_duplicates(subset='body')
removed_duplicates = before_duplicates - len(df)
print(f"❌ Removed duplicates: {removed_duplicates} rows in {time.time() - start:.2f}s")

# --- Step 6: Normalize post text ---
start = time.time()
df['POST'] = df['body'].str.lower().str.strip()
print(f"✅ Normalized post text in {time.time() - start:.2f}s")

# --- Final cleanup ---
df_cleaned = df[['MBTI', 'POST']].reset_index(drop=True)

# --- Total summary ---
end_total = time.time()
print("\n📊 Final Summary:")
print(f"✅ Final cleaned dataset has {len(df_cleaned)} rows")
print(f"⏱️ Total preprocessing time: {end_total - start_total:.2f}s")

✅ Loaded dataset with 1794016 rows in 6.87s
❌ Removed invalid MBTI: 137091 rows in 1.98s


  df = df[~df['body'].str.upper().str.contains(mbti_pattern)]


❌ Removed posts mentioning MBTI: 276907 rows in 15.09s
❌ Removed empty posts: 16 rows in 0.75s
❌ Removed short posts (< 10 words): 330251 rows in 3.81s
❌ Removed duplicates: 2262 rows in 0.39s
✅ Normalized post text in 1.06s

📊 Final Summary:
✅ Final cleaned dataset has 1047489 rows
⏱️ Total preprocessing time: 30.18s
