In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings

# Silence every Python warning for the rest of the run.
# NOTE(review): this blanket filter also hides any warnings raised while the
# model is being fitted -- consider narrowing it once the script is stable.
warnings.filterwarnings('ignore')

# --- 1. Load the Data ---
# Read the labelled dataset into `df`; abort the run if the file is missing.
filename = 'Combined Data.csv.csv' # <-- Corrected filename
print(f"Loading '{filename}'...")
try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    df = pd.read_csv(filename)
except FileNotFoundError:
    print(f"--- ERROR: File not found: '{filename}' ---")
    print("Please check the file name.")
    # `exit()` is an interactive-shell helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit with a non-zero status so
    # callers and shells can detect the failure.
    raise SystemExit(1)
else:
    print(f"Successfully loaded '{filename}'")

# --- 2. Data Pre-processing ---
# Names of the columns holding the input text and its label.
text_column = 'statement'
label_column = 'status'
if text_column not in df.columns or label_column not in df.columns:
    print(f"--- ERROR: Expected columns '{text_column}' and '{label_column}' not found. ---")
    # `exit()` is an interactive-shell helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit with a non-zero status so
    # the failure is visible to whatever launched the script.
    raise SystemExit(1)

# Discard rows where either the text or the label is missing.
df = df.dropna(subset=[text_column, label_column])

# --- 3. THE HIGH-ACCURACY FIX: Simplify the Problem ---
# Turn the task into a two-class problem by keeping only the rows whose
# label is one of the classes listed below.
keep_classes = ['Depression', 'Normal']
is_kept = df[label_column].isin(keep_classes)
df_simple = df.loc[is_kept]

print(f"\nOriginal data size: {len(df)}")
print(f"New simplified data size: {len(df_simple)}")
print(f"New labels: {df_simple[label_column].unique()}")

# --- 4. Define Features (X) and Target (y) ---
# X is the text column, y is the (now two-valued) label column.
X, y = df_simple[text_column], df_simple[label_column]

# --- 5. Split Data into Training and Testing Sets ---
# 80/20 split with a fixed seed; stratify on y so both subsets keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- 6. Create a Model Pipeline ---
# Two stages: TF-IDF vectorisation of the text, then a logistic-regression
# classifier. The step names are looked up again when the model is saved,
# so they must stay 'tfidf' and 'classifier'.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=42)),
]
text_model_pipeline = Pipeline(steps=pipeline_steps)

# --- 7. Train the Model ---
# Fit the vectorizer and the classifier together on the training split.
print("\nTraining the new SIMPLIFIED (Depression vs Normal) model...")
text_model_pipeline.fit(X_train, y_train)
print("Model trained successfully!")

# --- 8. Evaluate the Model ---
# Predict on the held-out split, then report overall accuracy followed by
# the per-class precision/recall/F1 table.
print("\n--- Model Evaluation ---")
y_pred = text_model_pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")  # expected to land above 90%

print("\nClassification Report:")
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# --- 9. Test with a custom sentence ---
# Smoke-test the trained pipeline on two hand-written sentences.
print("\n--- Model Test ---")
custom_text_1 = "I feel so alone and I can't stop worrying about everything"
custom_text_2 = "I am feeling great today, the weather is nice"
samples = (custom_text_1, custom_text_2)

# One predict() call per sentence, each on a single-element batch; both
# predictions are computed before anything is printed.
prediction_1, prediction_2 = (text_model_pipeline.predict([s]) for s in samples)

for sample, prediction in zip(samples, (prediction_1, prediction_2)):
    print(f"'{sample}' -> Predicted as: {prediction[0]}")



# --- 10. Save the Model ---
# Persist the fitted artefacts with joblib:
#   * the TF-IDF vectorizer and the bare classifier, always, as a matched pair;
#   * the complete pipeline, only when test accuracy clears the 90% bar.
print("\n--- Saving Model and Vectorizer ---")

# Extract the vectorizer and the classifier from the pipeline
tfidf_vectorizer = text_model_pipeline.named_steps['tfidf']
model = text_model_pipeline.named_steps['classifier']

# Define filenames
vectorizer_filename = 'tfidf_vectorizer.joblib'
model_filename = 'chatbot_model.joblib'
# The full pipeline gets its own file. Writing it to `model_filename` would
# overwrite the bare classifier saved just below, so the contents of that file
# would depend on the accuracy reached and would no longer pair with the
# separately saved vectorizer.
pipeline_filename = 'chatbot_pipeline.joblib'

# Save the vectorizer and the model
joblib.dump(tfidf_vectorizer, vectorizer_filename)
joblib.dump(model, model_filename)

print(f"Vectorizer saved to {vectorizer_filename}")
print(f"Model saved to {model_filename}")

if accuracy > 0.90: # Set a high bar of 90%
    joblib.dump(text_model_pipeline, pipeline_filename)
    print(f"\nModel accuracy is high! Full pipeline saved to {pipeline_filename}")
else:
    # The vectorizer and classifier above were already written; only the
    # combined pipeline is skipped here.
    print(f"\nModel accuracy ({accuracy*100:.2f}%) is below the 90% bar. Full pipeline not saved.")

Loading 'Combined Data.csv.csv'...
Successfully loaded 'Combined Data.csv.csv'

Original data size: 52681
New simplified data size: 31747
New labels: ['Normal' 'Depression']

Training the new SIMPLIFIED (Depression vs Normal) model...
Model trained successfully!

--- Model Evaluation ---
Model Accuracy: 95.31%

Classification Report:
              precision    recall  f1-score   support

  Depression       0.96      0.94      0.95      3081
      Normal       0.94      0.97      0.95      3269

    accuracy                           0.95      6350
   macro avg       0.95      0.95      0.95      6350
weighted avg       0.95      0.95      0.95      6350


--- Model Test ---
'I feel so alone and I can't stop worrying about everything' -> Predicted as: Depression
'I am feeling great today, the weather is nice' -> Predicted as: Normal

--- Saving Model and Vectorizer ---
Vectorizer saved to tfidf_vectorizer.joblib
Model saved to chatbot_model.joblib

Model accuracy is high! Saved to chatb