In [1]:
# ==============================================
# 1. IMPORT LIBRARIES
# ==============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

print("\n--- Loading and Preprocessing Data ---")

# ==============================================
# 2. DATA LOADING AND PREPROCESSING
# ==============================================
# Load the raw dataset. Only the read itself sits inside the `try`, so an
# unrelated failure cannot be misreported as a missing file.
try:
    df = pd.read_csv("Mental-Health-Twitter.csv")
except FileNotFoundError:
    print("Error: 'Mental-Health-Twitter.csv' not found.")
    # Abort with a non-zero status. The bare `exit()` helper is injected by
    # the `site` module (so it is not guaranteed to exist) and, called with
    # no argument from a plain script, ends the process with status 0 even
    # though loading failed.
    raise SystemExit(1)
else:
    print(f"Original dataset shape: {df.shape}")

# Drop rows where 'post_text' or 'label' is missing. This is done in place so
# that the column assignment below operates on the original frame.
df.dropna(subset=['post_text', 'label'], inplace=True)
print(f"Dataset shape after dropping NaNs: {df.shape}")

# Ensure the 'label' column holds integers
df['label'] = df['label'].astype(int)

# Human-readable names for the integer labels (used when displaying results)
id2label = {0: "No Depression", 1: "Depression"}

# Define X (features: raw post text) and y (target: integer label)
X = df['post_text']
y = df['label']

# ==============================================
# 3. TRAIN / TEST SPLIT
# ==============================================
# Hold out 20% of the rows for evaluation and train on the remaining 80%
# (the ratio the original note attributes to the companion RoBERTa code).
# The split is stratified on the label and uses a fixed seed so that it is
# reproducible from run to run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_eval, y_train, y_eval = train_test_split(X, y, **split_options)

print(f"Training dataset size: {len(X_train)}")
print(f"Evaluation dataset size: {len(X_eval)}")

# ==============================================
# 4. PIPELINE SETUP (TF-IDF + LOGISTIC REGRESSION)
# ==============================================
# Feature extraction and classification are bundled into a single Pipeline so
# that raw text can be passed straight to fit / predict.

# Step 1: turn each post into a TF-IDF weighted vector.
text_vectorizer = TfidfVectorizer(
    stop_words='english',  # drop common English words such as 'the', 'is', 'and'
    max_features=5000,     # cap the vocabulary at 5000 terms to reduce noise
    ngram_range=(1, 2),    # use both single words and two-word pairs as terms
)

# Step 2: the classifier that consumes those vectors.
classifier = LogisticRegression(
    class_weight='balanced',  # compensate if one class is larger than the other
    solver='liblinear',       # solver chosen for small / high-dimensional data
    random_state=42,
)

model_pipeline = Pipeline(steps=[
    ('tfidf', text_vectorizer),
    ('clf', classifier),
])

# ==============================================
# 5. MODEL TRAINING
# ==============================================
# Fit the vectorizer and the classifier together, using the training split
# only. (A plain string is used for the banner: it has no placeholders, so
# an f-string prefix is unnecessary.)
print("\n--- Starting Training (TF-IDF + Logistic Regression) ---")
model_pipeline.fit(X_train, y_train)
print("Training complete.")

# ==============================================
# 6. EVALUATION
# ==============================================
print("\n--- Final Evaluation Results ---")

# Generate predictions on the held-out evaluation set
y_pred = model_pipeline.predict(X_eval)

# Overall metrics. 'weighted' averages the per-class F1 scores by class
# support, so it also works if there are ever more than two classes
# ('binary' would only apply to a strictly two-class problem).
accuracy = accuracy_score(y_eval, y_pred)
f1 = f1_score(y_eval, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")

# Take both the label ids and their display names from `id2label` and pass
# them together, so each report row is guaranteed to carry the name of the
# label it describes instead of relying on a second hard-coded name list.
report_labels = sorted(id2label)
print(classification_report(
    y_eval,
    y_pred,
    labels=report_labels,
    target_names=[id2label[label] for label in report_labels],
))

# ==============================================
# 7. INFERENCE (TESTING ON NEW DATA)
# ==============================================
new_data = [
    "The estate agent is officially pissing me off now...",
    "Ahh. Foundations was fun.",
    "Some people find themselves turning to colleagues to vent their #frustrations.",
    "True love is basically me holding Andy for the first time. I can still remember opening the box like it was just last October."
]

print("\n--- Running Inference on New Data ---")

# The pipeline vectorizes raw text itself, so new strings can be passed as-is
predictions = model_pipeline.predict(new_data)
probs = model_pipeline.predict_proba(new_data)  # per-class confidence scores

# The columns of `predict_proba` follow the order of `classes_`, not the label
# values themselves. Map each label to its column explicitly rather than
# assuming that label value == column position.
class_column = {
    label: column for column, label in enumerate(model_pipeline.classes_)
}

for text, pred, prob in zip(new_data, predictions, probs):
    label_name = id2label[pred]
    confidence = prob[class_column[pred]]  # score of the predicted class
    print(f"\nText: {text}")
    print(f"Prediction: {label_name} (Score: {confidence:.4f})")


--- Loading and Preprocessing Data ---
Original dataset shape: (20000, 11)
Dataset shape after dropping NaNs: (20000, 11)
Training dataset size: 16000
Evaluation dataset size: 4000

--- Starting Training (TF-IDF + Logistic Regression) ---
Training complete.

--- Final Evaluation Results ---
Accuracy: 0.8598
F1 Score: 0.8597

Classification Report:
               precision    recall  f1-score   support

No Depression       0.86      0.86      0.86      2000
   Depression       0.86      0.86      0.86      2000

     accuracy                           0.86      4000
    macro avg       0.86      0.86      0.86      4000
 weighted avg       0.86      0.86      0.86      4000


--- Running Inference on New Data ---

Text: The estate agent is officially pissing me off now...
Prediction: Depression (Score: 0.5764)

Text: Ahh. Foundations was fun.
Prediction: Depression (Score: 0.5346)

Text: Some people find themselves turning to colleagues to vent their #frustrations.
Prediction: No Depre