In [None]:
!pip install --upgrade seaborn
!pip uninstall -y seaborn matplotlib
!pip install seaborn==0.13.2 matplotlib==3.8.4

### Clean Data & Create Buckets & Run Model to Predict Bucket of Score Success of Post

In [148]:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix, accuracy_score


df = pd.read_csv("./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv", low_memory=False)
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df.dropna(subset=["score"]).reset_index(drop=True)

# df_careeradvice['score'].hist(bins=8)
# plt.title('Histogram of score Column') 
# plt.xlabel('score') 
# plt.ylabel('Frequency') 
# plt.show() 

# df_careeradvice['log_score'] = np.log1p(df_careeradvice['score']) 
# df_careeradvice['log_score'].hist(bins=8)
# plt.title('Histogram of log_score Column') 
# plt.xlabel('log_score') 
# plt.ylabel('Frequency') 
# plt.show()

# bins = [0, 20, 150, np.inf] 
# labels = ["Low", "Medium", "High"]
# df["score_bucket"] = pd.cut(df["score"], bins=bins, labels=labels, right=True)

df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df.dropna(subset=["score"]).reset_index(drop=True)

# Try to create 3 balanced buckets
try:
    df["score_bucket"] = pd.qcut(
        df["score"],
        q=3,
        labels=["Low", "Medium", "High"],
        duplicates="drop"
    )

    if df["score_bucket"].nunique() < 3:
        raise ValueError("Fewer than 3 buckets created (duplicate quantile edges)")

except ValueError:
    print("⚠️ Detected duplicate quantile edges — applying small jitter for stability...")
    rng = np.random.default_rng(42)
    df["score_jitter"] = df["score"] + rng.uniform(0, 1e-6, len(df))

    df["score_bucket"] = pd.qcut(
        df["score_jitter"],
        q=3,
        labels=["Low", "Medium", "High"],
        duplicates="drop"
    )

# Check distribution
print("\nBucket distribution:")
print(df["score_bucket"].value_counts(normalize=True).round(3))

# Optional: print the quantile cutoffs
cutoffs = df["score"].quantile([1/3, 2/3])
print("\nScore cutoffs for each bucket:")
print(cutoffs)



# ===== 3) Minimal text cleaning + combine =====
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www\S+|https\S+", " ", t)   # remove URLs
    t = re.sub(r"[^a-z0-9\s]", " ", t)               # keep alnum + space
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["title"] = df["title"].apply(clean_text)
df["text"]  = df["text"].apply(clean_text)
df = df[(df["title"] != "") & (df["text"] != "")].reset_index(drop=True)

df["combined"] = (df["title"] + " " + df["text"]).str.strip()

# ===== 4) Stratified split: 60 / 20 / 20 =====
y = df["score_bucket"]
X = df["combined"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print(f"\nSplit sizes -> Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# ===== 5) Text vectorization (title+text) =====
tfidf = TfidfVectorizer(
    max_features=20000,
    stop_words="english",
    ngram_range=(1,2)
)
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec   = tfidf.transform(X_val)
X_test_vec  = tfidf.transform(X_test)

# ===== 6) Train classifier =====
# Imbalance is OK; if you want a small boost to minority classes, set class_weight="balanced"
clf = LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42)  # class_weight=None (imbalanced)
clf.fit(X_train_vec, y_train)

# ===== 7) Evaluate =====
def evaluate(name, Xv, yv):
    pred = clf.predict(Xv)
    acc = accuracy_score(yv, pred)
    bal = balanced_accuracy_score(yv, pred)
    print(f"\n{name} Results:")
    print(f"Accuracy: {acc:.3f} | Balanced Accuracy: {bal:.3f}")
    print(classification_report(yv, pred))
    cm = confusion_matrix(yv, pred, labels=["Low","Medium","High"])
    cm_df = pd.DataFrame(cm, index=["true_Low","true_Med","true_High"],
                            columns=["pred_Low","pred_Med","pred_High"])
    print("Confusion matrix:\n", cm_df)

evaluate("Validation", X_val_vec, y_val)
evaluate("Test",        X_test_vec, y_test)


⚠️ Detected duplicate quantile edges — applying small jitter for stability...

Bucket distribution:
score_bucket
Low       0.333
Medium    0.333
High      0.333
Name: proportion, dtype: float64

Score cutoffs for each bucket:
0.333333    1.0
0.666667    1.0
Name: score, dtype: float64

Split sizes -> Train: 41106, Val: 13702, Test: 13703


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (



Validation Results:
Accuracy: 0.395 | Balanced Accuracy: 0.395
              precision    recall  f1-score   support

        High       0.43      0.47      0.45      4574
         Low       0.36      0.32      0.34      4563
      Medium       0.40      0.39      0.39      4565

    accuracy                           0.39     13702
   macro avg       0.39      0.39      0.39     13702
weighted avg       0.39      0.39      0.39     13702

Confusion matrix:
            pred_Low  pred_Med  pred_High
true_Low       1481      1527       1555
true_Med       1445      1770       1350
true_High      1239      1175       2160

Test Results:
Accuracy: 0.399 | Balanced Accuracy: 0.399
              precision    recall  f1-score   support

        High       0.44      0.48      0.46      4574
         Low       0.37      0.34      0.35      4563
      Medium       0.39      0.38      0.38      4566

    accuracy                           0.40     13703
   macro avg       0.40      0.40      0.4

In [149]:
import pandas as pd, numpy as np, re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix, accuracy_score
from scipy.sparse import hstack

# ===== 0) Start from your df_careeradvice =====
try:
    df = df_careeradvice.copy()
except NameError:
    df = pd.read_csv("./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv", low_memory=False)

# ===== 1) Clean + labels (3 tertiles on log_score with jitter) =====
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www\S+|https\S+", " ", t)
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df = df.dropna(subset=["title","text","score"]).copy()
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df = df.dropna(subset=["score"]).reset_index(drop=True)

df["title"] = df["title"].apply(clean_text)
df["text"]  = df["text"].apply(clean_text)
df = df[(df["title"]!="") & (df["text"]!="")].reset_index(drop=True)

# tertiles on log1p(score) with tiny jitter to avoid duplicate edges
log_s = np.log1p(df["score"].astype(float))
log_s_jit = log_s + np.random.uniform(0, 1e-6, size=len(df))
df["score_bucket"] = pd.qcut(log_s_jit, q=3, labels=["Low","Medium","High"])
y = df["score_bucket"]

# ===== 2) Split =====
X_title, X_text = df["title"], df["text"]
Xtr_t, Xtmp_t, y_tr, y_tmp, Xtr_x, Xtmp_x = train_test_split(
    X_title, y, X_text, test_size=0.40, random_state=42, stratify=y
)
Xva_t, Xte_t, y_va, y_te, Xva_x, Xte_x = train_test_split(
    Xtmp_t, y_tmp, Xtmp_x, test_size=0.50, random_state=42, stratify=y_tmp
)

# ===== 3) Vectorizers =====
# Title: word 1–2 grams (smaller vocab) — will be upweighted
tfidf_title = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=15000)
# Body: word 1–2 grams (larger vocab)
tfidf_body  = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=30000)
# Char n-grams on combined title+text
tfidf_char  = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=5, max_features=20000)

# Fit on train only
Xtr_title = tfidf_title.fit_transform(Xtr_t)
Xtr_body  = tfidf_body.fit_transform(Xtr_x)
Xtr_char  = tfidf_char.fit_transform((Xtr_t + " " + Xtr_x))

# Transform val/test
Xva_title = tfidf_title.transform(Xva_t)
Xva_body  = tfidf_body.transform(Xva_x)
Xva_char  = tfidf_char.transform((Xva_t + " " + Xva_x))
Xte_title = tfidf_title.transform(Xte_t)
Xte_body  = tfidf_body.transform(Xte_x)
Xte_char  = tfidf_char.transform((Xte_t + " " + Xte_x))

# ===== 4) Upweight title features =====
TITLE_WEIGHT = 2.0
Xtr_title = Xtr_title.multiply(TITLE_WEIGHT)
Xva_title = Xva_title.multiply(TITLE_WEIGHT)
Xte_title = Xte_title.multiply(TITLE_WEIGHT)

# Stack: [title_words | body_words | char_ngrams]
from scipy.sparse import csr_matrix
Xtr = hstack([Xtr_title, Xtr_body, Xtr_char], format="csr")
Xva = hstack([Xva_title, Xva_body, Xva_char], format="csr")
Xte = hstack([Xte_title, Xte_body, Xte_char], format="csr")

# Optional: l2 normalize rows (helps SVM a bit)
Xtr = normalize(Xtr, copy=False)
Xva = normalize(Xva, copy=False)
Xte = normalize(Xte, copy=False)

# ===== 5) Model: Linear SVM with class weighting =====
clf = LinearSVC(class_weight="balanced", random_state=42)
clf.fit(Xtr, y_tr)

def evaluate(name, X, y_true):
    pred = clf.predict(X)
    acc  = accuracy_score(y_true, pred)
    bacc = balanced_accuracy_score(y_true, pred)
    print(f"\n{name} — Acc: {acc:.3f} | BalAcc: {bacc:.3f}")
    print(classification_report(y_true, pred, digits=3))
    cm = confusion_matrix(y_true, pred, labels=["Low","Medium","High"])
    print("Confusion matrix:\n", pd.DataFrame(cm,
          index=["true_Low","true_Med","true_High"],
          columns=["pred_Low","pred_Med","pred_High"]))

evaluate("Validation", Xva, y_va)
evaluate("Test",        Xte, y_te)



Validation — Acc: 0.394 | BalAcc: 0.394
              precision    recall  f1-score   support

        High      0.432     0.456     0.444      4568
         Low      0.359     0.328     0.343      4567
      Medium      0.386     0.399     0.392      4567

    accuracy                          0.394     13702
   macro avg      0.392     0.394     0.393     13702
weighted avg      0.392     0.394     0.393     13702

Confusion matrix:
            pred_Low  pred_Med  pred_High
true_Low       1497      1616       1454
true_Med       1469      1821       1277
true_High      1208      1279       2081

Test — Acc: 0.388 | BalAcc: 0.388
              precision    recall  f1-score   support

        High      0.429     0.459     0.443      4567
         Low      0.356     0.332     0.344      4568
      Medium      0.375     0.373     0.374      4568

    accuracy                          0.388     13703
   macro avg      0.387     0.388     0.387     13703
weighted avg      0.387     0.388 

In [152]:
# ================================================================
# r/careeradvice — Predict post "success bucket" (Low/Med/High)
# with sentiment, readability, metadata, and TF-IDF text features
# ================================================================

import pandas as pd, numpy as np, re, warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix, accuracy_score
from scipy.sparse import hstack, csr_matrix
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

warnings.filterwarnings("ignore")
nltk.download("vader_lexicon", quiet=True)
np.random.seed(42)

# ---------- 1) Load data ----------
df = pd.read_csv("./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv", low_memory=False)
df = df[df.get("subreddit", "").str.lower().eq("careeradvice") | ~df.get("subreddit", "").notna()]  # optional
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df["num_comments"] = pd.to_numeric(df.get("num_comments", np.nan), errors="coerce")
df = df.dropna(subset=["score", "title", "text"]).reset_index(drop=True)

# ---------- 2) Clean text ----------
def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|www\S+|https\S+", " ", t)
    t = re.sub(r"[^a-z0-9\s?!.,:;']", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["title"] = df["title"].apply(clean_text)
df["text"]  = df["text"].apply(clean_text)
df = df[(df["title"] != "") & (df["text"] != "")].reset_index(drop=True)

# ---------- 3) Success buckets (balanced tertiles on log(score)) ----------
df["log_score"] = np.log1p(df["score"])
df["log_score_jit"] = df["log_score"] + np.random.uniform(0, 1e-6, len(df))
df["bucket"] = pd.qcut(df["log_score_jit"], q=3, labels=["Low","Medium","High"], duplicates="drop")
print("Bucket distribution:", df["bucket"].value_counts(normalize=True).round(3).to_dict())

# ---------- 4) Sentiment + simple metadata ----------
sia = SentimentIntensityAnalyzer()
df["sent_title"] = df["title"].apply(lambda t: sia.polarity_scores(t)["compound"])
df["sent_text"]  = df["text"].apply(lambda t: sia.polarity_scores(t)["compound"])
df["title_len"]   = df["title"].str.split().str.len()
df["text_len"]    = df["text"].str.split().str.len()
df["has_question"] = df["title"].str.contains(r"\?").astype(int)
df["exclaims"]     = df["title"].str.count("!")
df["caps_ratio"]   = df["title"].apply(lambda s: sum(ch.isupper() for ch in s) / max(len(s),1))
df["hour"] = 0
df["dow"]  = 0

# ---------- 5) Text for vectorization ----------
df["combined"] = (df["title"] + " " + df["text"]).str.strip()

# ---------- 6) Train/Val/Test split ----------
y = df["bucket"]
df_train, df_temp, y_train, y_temp = train_test_split(df, y, test_size=0.4, random_state=42, stratify=y)
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
print(f"Split sizes -> Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

# ---------- 7) TF-IDF vectorization ----------
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(df_train["combined"])
X_val_tfidf   = tfidf.transform(df_val["combined"])
X_test_tfidf  = tfidf.transform(df_test["combined"])

# ---------- 8) Numeric features ----------
num_cols = ["sent_title","sent_text","title_len","text_len","has_question","exclaims","caps_ratio","hour","dow"]

# Make sure all numeric values are finite
for c in num_cols:
    df_train[c] = pd.to_numeric(df_train[c], errors="coerce")
    df_val[c]   = pd.to_numeric(df_val[c], errors="coerce")
    df_test[c]  = pd.to_numeric(df_test[c], errors="coerce")

df_train[num_cols] = df_train[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
df_val[num_cols]   = df_val[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
df_test[num_cols]  = df_test[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

scaler = StandardScaler().fit(df_train[num_cols])
X_train_num = scaler.transform(df_train[num_cols])
X_val_num   = scaler.transform(df_val[num_cols])
X_test_num  = scaler.transform(df_test[num_cols])

# Ensure no NaNs left
X_train_num = np.nan_to_num(X_train_num, nan=0.0, posinf=0.0, neginf=0.0)
X_val_num   = np.nan_to_num(X_val_num, nan=0.0, posinf=0.0, neginf=0.0)
X_test_num  = np.nan_to_num(X_test_num, nan=0.0, posinf=0.0, neginf=0.0)

# Combine TF-IDF + numeric
X_train = hstack([X_train_tfidf, csr_matrix(X_train_num)], format="csr")
X_val   = hstack([X_val_tfidf,   csr_matrix(X_val_num)],   format="csr")
X_test  = hstack([X_test_tfidf,  csr_matrix(X_test_num)],  format="csr")

# ---------- 9) Train model ----------
model = LinearSVC(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# ---------- 10) Evaluate ----------
def evaluate(name, X, y_true):
    preds = model.predict(X)
    acc  = accuracy_score(y_true, preds)
    bacc = balanced_accuracy_score(y_true, preds)
    print(f"\n{name} Results:")
    print(f"Accuracy: {acc:.3f} | Balanced Accuracy: {bacc:.3f}")
    print(classification_report(y_true, preds, digits=3))
    cm = confusion_matrix(y_true, preds, labels=["Low","Medium","High"])
    print("Confusion matrix:\n", pd.DataFrame(
        cm, index=["true_Low","true_Med","true_High"], columns=["pred_Low","pred_Med","pred_High"]
    ))

evaluate("Validation", X_val, y_val)
evaluate("Test", X_test, y_test)


Bucket distribution: {'Low': 0.333, 'Medium': 0.333, 'High': 0.333}
Split sizes -> Train: 41125, Val: 13708, Test: 13709

Validation Results:
Accuracy: 0.383 | Balanced Accuracy: 0.383
              precision    recall  f1-score   support

        High      0.418     0.439     0.428      4570
         Low      0.353     0.343     0.348      4569
      Medium      0.377     0.369     0.373      4569

    accuracy                          0.383     13708
   macro avg      0.382     0.383     0.383     13708
weighted avg      0.382     0.383     0.383     13708

Confusion matrix:
            pred_Low  pred_Med  pred_High
true_Low       1565      1552       1452
true_Med       1539      1684       1346
true_High      1328      1236       2006

Test Results:
Accuracy: 0.375 | Balanced Accuracy: 0.375
              precision    recall  f1-score   support

        High      0.405     0.421     0.413      4569
         Low      0.357     0.343     0.350      4570
      Medium      0.362     0.

In [154]:
import pandas as pd
import numpy as np
import re
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix, accuracy_score
from scipy.sparse import hstack, csr_matrix
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

warnings.filterwarnings("ignore")
nltk.download("vader_lexicon", quiet=True)
np.random.seed(42)

df = pd.read_csv("./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv", low_memory=False)
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df["num_comments"] = pd.to_numeric(df.get("num_comments", np.nan), errors="coerce")
df = df.dropna(subset=["score", "title", "text"]).reset_index(drop=True)

# Simple text cleaning
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"[^a-z0-9\s?!.,:;']", " ", text)
    return re.sub(r"\s+", " ", text).strip()

df["title"] = df["title"].apply(clean_text)
df["text"] = df["text"].apply(clean_text)
df = df[(df["title"] != "") & (df["text"] != "")].reset_index(drop=True)


df["log_score"] = np.log1p(df["score"])
df["log_score_jitter"] = df["log_score"] + np.random.uniform(0, 1e-6, len(df))
df["bucket"] = pd.qcut(df["log_score_jitter"], q=3, labels=["Low", "Medium", "High"], duplicates="drop")

print("Bucket distribution:")
print(df["bucket"].value_counts(normalize=True).round(3))

# ------------------------------------------------
# 3. Add sentiment + basic metadata
# ------------------------------------------------
sia = SentimentIntensityAnalyzer()
df["sent_title"] = df["title"].apply(lambda t: sia.polarity_scores(t)["compound"])
df["sent_text"] = df["text"].apply(lambda t: sia.polarity_scores(t)["compound"])
df["title_len"] = df["title"].str.split().str.len()
df["text_len"] = df["text"].str.split().str.len()
df["has_question"] = df["title"].str.contains(r"\?").astype(int)
df["exclaims"] = df["title"].str.count("!")
df["caps_ratio"] = df["title"].apply(lambda s: sum(ch.isupper() for ch in s) / max(len(s), 1))
df["combined"] = (df["title"] + " " + df["text"]).str.strip()
y = df["bucket"]
df_train, df_temp, y_train, y_temp = train_test_split(df, y, test_size=0.4, random_state=42, stratify=y)
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
print(f"\nSplit sizes: train={len(df_train)}, val={len(df_val)}, test={len(df_test)}")
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(df_train["combined"])
X_val_tfidf = tfidf.transform(df_val["combined"])
X_test_tfidf = tfidf.transform(df_test["combined"])
num_cols = ["sent_title", "sent_text", "title_len", "text_len", "has_question", "exclaims", "caps_ratio"]

for c in num_cols:
    df_train[c] = pd.to_numeric(df_train[c], errors="coerce")
    df_val[c] = pd.to_numeric(df_val[c], errors="coerce")
    df_test[c] = pd.to_numeric(df_test[c], errors="coerce")

for d in [df_train, df_val, df_test]:
    d[num_cols] = d[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

scaler = StandardScaler().fit(df_train[num_cols])
X_train_num = scaler.transform(df_train[num_cols])
X_val_num = scaler.transform(df_val[num_cols])
X_test_num = scaler.transform(df_test[num_cols])

X_train = hstack([X_train_tfidf, csr_matrix(X_train_num)], format="csr")
X_val = hstack([X_val_tfidf, csr_matrix(X_val_num)], format="csr")
X_test = hstack([X_test_tfidf, csr_matrix(X_test_num)], format="csr")

model = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42)
model.fit(X_train, y_train)


def evaluate(name, X, y_true):
    preds = model.predict(X)
    acc = accuracy_score(y_true, preds)
    bacc = balanced_accuracy_score(y_true, preds)
    print(f"\n{name} Results")
    print(f"Accuracy: {acc:.3f} | Balanced Accuracy: {bacc:.3f}")
    print(classification_report(y_true, preds, digits=3))
    cm = confusion_matrix(y_true, preds, labels=["Low", "Medium", "High"])
    print(pd.DataFrame(cm,
        index=["true_Low", "true_Med", "true_High"],
        columns=["pred_Low", "pred_Med", "pred_High"]
    ))

evaluate("Validation", X_val, y_val)
evaluate("Test", X_test, y_test)


Bucket distribution:
bucket
Low       0.333
High      0.333
Medium    0.333
Name: proportion, dtype: float64

Split sizes: train=41127, val=13709, test=13710

Validation Results
Accuracy: 0.402 | Balanced Accuracy: 0.402
              precision    recall  f1-score   support

        High      0.445     0.495     0.468      4570
         Low      0.361     0.321     0.340      4570
      Medium      0.391     0.390     0.390      4569

    accuracy                          0.402     13709
   macro avg      0.399     0.402     0.399     13709
weighted avg      0.399     0.402     0.399     13709

           pred_Low  pred_Med  pred_High
true_Low       1467      1629       1474
true_Med       1438      1780       1351
true_High      1160      1149       2261

Test Results
Accuracy: 0.404 | Balanced Accuracy: 0.404
              precision    recall  f1-score   support

        High      0.438     0.486     0.461      4570
         Low      0.369     0.334     0.350      4570
      Medium  

In [153]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Use pretrained model (fast & small)
bert = SentenceTransformer("all-MiniLM-L6-v2")

# Encode text
X_train_emb = bert.encode(df_train["combined"].tolist(), show_progress_bar=True)
X_val_emb   = bert.encode(df_val["combined"].tolist(), show_progress_bar=True)
X_test_emb  = bert.encode(df_test["combined"].tolist(), show_progress_bar=True)

# Train logistic regression or SVM on these
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_emb, y_train)

preds = clf.predict(X_test_emb)
print(classification_report(y_test, preds))

Batches: 100%|██████████| 1286/1286 [32:25<00:00,  1.51s/it]   
Batches: 100%|██████████| 429/429 [11:44<00:00,  1.64s/it]
Batches: 100%|██████████| 429/429 [13:03<00:00,  1.83s/it]


              precision    recall  f1-score   support

        High       0.45      0.56      0.50      4569
         Low       0.37      0.29      0.33      4570
      Medium       0.40      0.39      0.40      4570

    accuracy                           0.41     13709
   macro avg       0.41      0.41      0.41     13709
weighted avg       0.41      0.41      0.41     13709

