# In [None]:
import pandas as pd
import re
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Shared spaCy English pipeline used for tokenization and lemmatization.
# The named-entity recognizer and dependency parser are switched off at load
# time because only tokens and lemmas are read from the resulting documents.
_DISABLED_COMPONENTS = ['ner', 'parser']
nlp = spacy.load('en_core_web_sm', disable=_DISABLED_COMPONENTS)

def custom_tokenizer(text):
    """Clean *text* and return its lemmatized, stopword-free tokens.

    Every character other than an ASCII letter, digit or whitespace is
    removed, the remainder is lower-cased and passed through the module-level
    spaCy pipeline ``nlp``. Stop words and tokens that are not purely
    alphabetic are discarded.

    Args:
        text: Raw message string.

    Returns:
        A list with the lemma of each remaining token, in document order.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub() is ``count``, so passing ``re.I | re.A`` positionally would cap
    # the number of substitutions at 258 instead of setting any flag.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, flags=re.I | re.A)
    text = text.lower()
    doc = nlp(text)
    # Keep only alphabetic, non-stopword tokens and emit their lemmas.
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

# Load the labelled conversations.
data = pd.read_csv('mental_health_conversations.csv')  # Replace with your dataset path
# Restrict the task to the five target classes; rows with any other (or a
# missing) label are discarded.
data = data[data["labels"].isin(["depression", "anxiety", "stress", "anger", "addiction"])]
# Empty CSV cells are read as NaN, and the TF-IDF vectorizer rejects NaN
# documents, so drop rows without message text before splitting.
data = data.dropna(subset=['message'])

X = data['message']
y = data['labels']
# Hold out 20% for evaluation. Stratifying on the label keeps the class
# proportions the same in both splits, so the report is not skewed by an
# unlucky draw on the rarer classes. Note that stratification requires at
# least two samples per remaining class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Two-step model: TF-IDF features (vocabulary capped at 5000 terms) computed
# from custom_tokenizer's output, followed by a random-forest classifier.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(
        tokenizer=custom_tokenizer,
        # token_pattern is ignored whenever a tokenizer is supplied; setting
        # it to None states that explicitly and avoids scikit-learn's
        # "parameter will not be used" UserWarning on fit.
        token_pattern=None,
        max_features=5000
    )),
    # Fixed random_state keeps the fitted forest reproducible between runs.
    ('classifier', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42))
])

# Fit the vectorizer and the classifier on the training split.
pipeline.fit(X_train, y_train)

# Predict the held-out split and print the per-class metrics.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

# Persist the fitted pipeline to disk with joblib.
# NOTE(review): the vectorizer holds a reference to custom_tokenizer, so the
# code that loads this file presumably needs that function (and the spaCy
# model it uses) available under the same name -- confirm in the loader.
joblib.dump(pipeline, filename='advanced_mental_health_classifier.joblib')
print("Model saved successfully.")
