In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # print the full path of every file found below the input directory
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string
# data processing, modeling, evaluation
# Logistic Regression model (linear classifier for binary/multiclass tasks)
from sklearn.linear_model import LogisticRegression  
# Random Forest classifier (ensemble of decision trees using bagging + randomness)
from sklearn.ensemble import RandomForestClassifier  
# Tools for splitting data, doing cross-validation, and tuning hyperparameters
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV  
# Multinomial Naive Bayes classifier (good for text data, word counts, frequencies)
from sklearn.naive_bayes import MultinomialNB  
# Metrics for evaluating model predictions (confusion matrix, reports, accuracy)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score  
# Pipeline for chaining preprocessing steps with model training
from sklearn.pipeline import Pipeline  
# XGBoost classifier (boosting algorithm)
from xgboost import XGBClassifier  
# LightGBM classifier (gradient boosting framework optimized for speed and large datasets)
from lightgbm import LGBMClassifier  
# CatBoost classifier (gradient boosting library optimized for categorical features)
from catboost import CatBoostClassifier  

In [None]:
import re

In [None]:
from nltk.corpus import stopwords
# text processing, sentiment analysis, and NLP tasks
from textblob import Word, TextBlob
# vader for scoring text positivity/negativity
from nltk.sentiment import SentimentIntensityAnalyzer
# text vectorizers (bag-of-words counts and TF-IDF weights)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# word base roots
from nltk.stem import WordNetLemmatizer
import spacy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sea
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
# fully connected nn layer
from tensorflow.keras.layers import Dense

In [None]:
import warnings
# Silence every warning from here on (no category or module filter is given)
warnings.filterwarnings("ignore")

In [None]:
# Display settings applied to every DataFrame printed in this notebook:
#   max_columns = None -> show all columns
#   width = None       -> no fixed line width
#   max_rows = 55      -> cap on the number of rows shown
#   float_format       -> floats rendered with 3 decimal places
display_options = {
    'display.max_columns': None,
    'display.width': None,
    'display.max_rows': 55,
    'display.float_format': lambda x: '%.3f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
import kagglehub

# Download latest version
# (the value returned by dataset_download is kept in `path` and printed below)
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")

print("Path to dataset files:", path)

In [None]:
# Prefer the dataset mounted by Kaggle; when that file is absent (e.g. the
# notebook runs outside Kaggle) fall back to the copy kagglehub fetched into
# `path`. Assumes the CSV sits at the top level of that folder — TODO confirm.
csv_name = 'Combined Data.csv'
csv_path = os.path.join('/kaggle/input/sentiment-analysis-for-mental-health', csv_name)
if not os.path.exists(csv_path):
    csv_path = os.path.join(path, csv_name)
df = pd.read_csv(csv_path)
df.head()

In [None]:
# summary of the columns, their dtypes and non-null counts
df.info()

In [None]:
# replace any missing value in 'statement' with an empty string
df['statement'] = df['statement'].fillna('')

In [None]:
# preview the first rows after filling missing statements
df.head()

In [None]:
# drop the 'Unnamed: 0' column. errors='ignore' keeps this cell re-runnable:
# the in-place drop raised KeyError when executed a second time because the
# column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# distinct values present in the 'status' column
df['status'].unique()

In [None]:
# Show the second Anxiety statement. .iloc is positional, so this does not
# depend on a row labelled 1 surviving the status filter, and the single .loc
# call avoids chained indexing.
df.loc[df['status'] == 'Anxiety', 'statement'].iloc[1]

In [None]:
# Encode each text label as an integer class id
status_codes = {
    'Anxiety': 0,
    'Normal': 1,
    'Depression': 2,
    'Suicidal': 3,
    'Stress': 4,
    'Bipolar': 5,
    "Personality disorder": 6,
}
df['status'] = df['status'].map(status_codes)

In [None]:
def clean_text(text):
    """Normalise one raw statement for vectorisation.

    Steps, in order: lower-case, turn newlines into spaces, remove URLs,
    remove HTML tags, remove ``[...]`` spans, remove ASCII punctuation,
    remove digits, strip surrounding whitespace.

    The markup-aware removals must run *before* punctuation stripping:
    once ``:/.<>[]`` are gone the URL / tag / bracket patterns can no longer
    match, which left fused tokens such as ``httpsexamplecom`` in the text.

    Parameters
    ----------
    text : str
        Raw statement.

    Returns
    -------
    str
        Cleaned statement. Runs of spaces left by removed spans are not
        collapsed.
    """
    text = text.lower()  # case folding
    text = text.replace('\n', " ")  # newlines -> spaces
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    # remove HTML tags; a letter is required after '<' (or '</') so that
    # emoticons such as '<3' are not mistaken for the start of a tag
    text = re.sub(r'</?[a-z][^<>]*>', '', text)
    text = re.sub(r'\[.*?\]', '', text)  # remove text in square brackets
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # remove digits; letters around them are kept (a following "words with
    # numbers" pass could never match once the digits were gone, so it is dropped)
    text = re.sub(r'\d+', '', text)
    return text.strip()

In [None]:
# run clean_text on every statement
df['statement'] = df['statement'].apply(clean_text)

In [None]:
def remove_stopwords(text):
    """Drop English stop words from every entry of a Series of statements.

    Parameters
    ----------
    text : pandas.Series
        One statement per entry; each entry is converted with ``str`` and
        split on whitespace.

    Returns
    -------
    pandas.Series
        Same index, each statement rebuilt from its remaining tokens joined
        by single spaces.
    """
    # a set gives constant-time membership tests; the list returned by
    # stopwords.words() was scanned linearly for every token
    stop_words = set(stopwords.words('english'))
    return text.apply(
        lambda row: " ".join(word for word in str(row).split() if word not in stop_words)
    )

In [None]:
# strip stop words from the whole 'statement' column
df['statement'] = remove_stopwords(df['statement'])

In [None]:
# join all statements into one string, split it on whitespace, and preview the first ten tokens
d = (' '.join(df['statement']).split())
d[:10]

In [None]:
# frequency table of every token across all statements, most frequent first
token_counts = pd.Series(' '.join(df['statement']).split()).value_counts()
# the last 1000 rows of that table are the least common tokens
delete = token_counts.tail(1000)
# membership is tested against the tokens themselves, i.e. the index of `delete`
rare_tokens = delete.index
df['statement'] = df['statement'].apply(
    lambda sentence: " ".join(word for word in sentence.split() if word not in rare_tokens)
)

In [None]:
# Only token.lemma_ is read below, so the dependency parser and the named-entity
# recogniser are switched off; running them over every statement is wasted work.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatize_sentence(sentence):
    """Return `sentence` with each spaCy token replaced by its lemma, joined by single spaces."""
    doc = nlp(sentence)  # tokenise and annotate the text
    return " ".join(token.lemma_ for token in doc)

In [None]:
# lemmatize every statement; the result is held in `f`
f = df['statement'].apply(lemmatize_sentence)

In [None]:
# overwrite the statements with their lemmatized form and preview the result
df['statement'] = f
df.head()

In [None]:
# count missing values per column
df.isnull().sum()
# if it is 0 for both statement and status, then there are no missing values

In [None]:
y = df['status'] # target: the 'status' column
x = df.drop('status', axis=1) # features: every column except 'status' (axis=1 drops columns, not rows)

In [None]:
# split into train and test sets: 20% of the rows are held out, random_state=42 fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# display the training features
x_train

In [None]:
# vectorize using tfidf, train a classifier, and report its test performance
def tfidf_vec_boost(model, text_col, max_df, max_features, ngram_range, n_estimators, max_depth=3, learning_rate=0.03, verbose = True):
    """Fit `model` on TF-IDF features and print/plot its test-set results.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train`` and
    ``y_test``. Prints a classification report and shows a confusion-matrix
    heatmap; returns nothing.

    Parameters
    ----------
    model : class
        Classifier class; it is called with ``n_estimators``, ``max_depth``,
        ``learning_rate`` and ``verbose`` keyword arguments.
    text_col : str
        Name of the text column in ``x_train`` / ``x_test``.
    max_df : float or int
        Document-frequency ceiling given to ``TfidfVectorizer`` (very common
        terms are ignored).
    max_features : int
        Vocabulary size cap given to ``TfidfVectorizer``.
    ngram_range : tuple
        ``(min_n, max_n)`` n-gram sizes given to ``TfidfVectorizer``.
    n_estimators : int
        Number of trees / boosting rounds.
    max_depth : int, default 3
        Maximum depth of each tree.
    learning_rate : float, default 0.03
        Learning rate forwarded to the classifier.
    verbose : default True
        Forwarded unchanged to the classifier constructor.
    """
    # vectorize data: fit on the training text only, reuse the fitted vocabulary for the test text
    vectorizer = TfidfVectorizer(max_df=max_df, max_features=max_features, ngram_range=ngram_range)
    x_train_vector = vectorizer.fit_transform(x_train[text_col])
    x_test_vector = vectorizer.transform(x_test[text_col])

    # initialize and train classifier
    classifier = model(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, verbose=verbose)
    classifier.fit(x_train_vector, y_train)

    # predict test labels and print the classification report
    y_prediction = classifier.predict(x_test_vector)
    print(classification_report(y_test, y_prediction))

    # Rows/columns of the confusion matrix follow the sorted class ids.
    # y_test.unique() returns ids in order of first appearance, which put the
    # wrong tick label on most rows/columns; build the sorted list once and
    # hand the same list to both confusion_matrix and the heatmap.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_prediction)])).tolist()
    cm = confusion_matrix(y_test, y_prediction, labels=labels)

    plt.figure(figsize=(8, 6))
    # annot=True writes the count in each cell; fmt='d' formats it as an integer
    sea.heatmap(cm, annot=True, cmap='viridis', fmt='d', xticklabels=labels, yticklabels=labels)
    plt.xlabel("Prediction")
    plt.ylabel("True")
    plt.title('Confusion matrix')
    plt.show()

In [None]:
# scikit pipeline + randomized hyper-parameter search
def pipe_boosting(vectorizer, classifier, x_grid, y_grid):
    """Randomized hyper-parameter search over a vectorizer + classifier pipeline.

    Parameters
    ----------
    vectorizer : estimator
        Text vectorizer used as the ``'vectorizer'`` pipeline step; must accept
        ``ngram_range``, ``max_df``, ``min_df`` and ``max_features``.
    classifier : estimator
        Classifier used as the ``'classifier'`` pipeline step; must accept
        ``n_estimators`` and ``learning_rate``.
    x_grid : iterable of str
        Training documents.
    y_grid : array-like
        Training labels aligned with ``x_grid``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object. The best parameters are also printed.
    """
    # The previous version never closed the Pipeline(...) call (the ')' sat
    # inside a comment), searched an undefined `pipeline` name, and ignored all
    # four arguments in favour of hard-coded estimators and notebook globals.
    pipe = Pipeline(
        steps=[('vectorizer', vectorizer), ('classifier', classifier)],
        memory='cache_dir',  # fitted transformers are cached in this folder
    )
    hyper_params = {
        'vectorizer__ngram_range': [(1, 2)],          # keep (1,3) for later
        'vectorizer__max_df': [0.9, 1.0],
        'vectorizer__min_df': [3, 5],                 # drop very rare terms
        'vectorizer__max_features': [1000, 2000],     # cap vocabulary
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.05, 0.07],
    }
    # n_iter=12 parameter settings are sampled from the grid above
    # cv=3: each candidate is trained on 2 folds and validated on the third, 3 times
    # n_jobs=-1: use all cores
    search = RandomizedSearchCV(pipe, hyper_params, n_iter=12, cv=3, n_jobs=-1, verbose=1, random_state=42)
    search.fit(x_grid, y_grid)
    print("Optimal hyperparamters:", search.best_params_)
    return search

In [None]:
def predict_new_text(text, c_vectorizer, model):
    """Classify one piece of text with an already fitted vectorizer and model.

    Returns a one-item dict mapping the predicted class name to the raw
    class id produced by ``model.predict``.
    """
    # position in this tuple == class id
    class_names = (
        'Anxiety',
        'Normal',
        'Depression',
        'Suicidal',
        'Stress',
        'Bipolar',
        'Personality disorder',
    )
    labels = dict(enumerate(class_names))
    # vectorize the single document, then take the first (only) prediction
    features = c_vectorizer.transform([text])
    predicted_id = model.predict(features)[0]
    return {labels[predicted_id]: predicted_id}

In [None]:
# XGBoost on TF-IDF features, with every setting spelled out by name
tfidf_vec_boost(
    model=XGBClassifier,
    text_col='statement',
    max_df=1.0,
    max_features=2000,
    ngram_range=(1, 3),
    n_estimators=200,
    max_depth=5,
    learning_rate=0.07,
    verbose=True,
)

In [None]:
# run pipe_boosting on the training statements and labels with a default TF-IDF vectorizer and LightGBM classifier
pipe_boosting(vectorizer = TfidfVectorizer(), classifier = LGBMClassifier(force_col_wise=True, verbose=1),
             x_grid = x_train['statement'], y_grid=y_train)

In [None]:
# same call as above, but with explicit LightGBM settings
pipe_boosting(
    vectorizer=TfidfVectorizer(), 
    classifier=LGBMClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.07,
        force_col_wise=True,
        verbose=1   # LightGBM verbosity level passed to the classifier
    ),
    x_grid=x_train['statement'],
    y_grid=y_train
)

In [None]:
import time

# Pick one reasonable config of your pipeline
sample_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=2000, ngram_range=(1,2))),
    ('clf', LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.07, force_col_wise=True, verbose=-1))
])

# Time a single fit
t0 = time.time()
sample_pipeline.fit(x_train['statement'][:5000], y_train[:5000])  # subset to make it faster
elapsed = time.time() - t0

print(f"Single fit on subset took ~{elapsed:.2f} seconds")

# Rough estimate for full gridsearch
total_fits = 42  # from "Fitting 3 folds for each of 14 candidates"
cores = 12       # you said you have 12 cores
# use total_fits instead of repeating the literal 42, so changing the number of
# fits above actually changes the estimate.
# NOTE: `elapsed` was measured on a 5000-row subset, so this is a lower bound
# for a search run on the full training set.
approx_time = elapsed * total_fits / cores

print(f"Estimated total GridSearch time: ~{approx_time:.2f} seconds (~{approx_time/60:.1f} minutes)")


In [None]:
# settings reused by the cells below; the keys mix classifier options (learning_rate, n_estimators)
# with TF-IDF options (max_df, max_features, ngram_range)
best_params = {'learning_rate': .03, 'n_estimators': 200, 'max_df': 1.0, 'max_features': 2000, 'ngram_range': (1,3)}

In [None]:
# best_params is unpacked into tfidf_vec_boost's keyword arguments; verbose=-1 is forwarded to the LGBMClassifier constructor
tfidf_vec_boost(LGBMClassifier, 'statement', **best_params, verbose=-1)

In [None]:
# vectorize once
# NOTE(review): best_params lists ngram_range (1, 3) while this vectorizer uses (1, 2) — confirm which is intended
vectorizer = TfidfVectorizer(max_df=1.0, max_features=2000, ngram_range=(1,2))
x_train_vector = vectorizer.fit_transform(x_train['statement'])
x_test_vector  = vectorizer.transform(x_test['statement'])

# train model
# best_params also holds TF-IDF settings (max_df, max_features, ngram_range);
# those are not XGBClassifier options, so only the booster settings are passed on.
# `verbose` is dropped as well: XGBClassifier's constructor option is `verbosity`.
vectorizer_only_keys = {'max_df', 'max_features', 'ngram_range'}
model_params = {key: value for key, value in best_params.items() if key not in vectorizer_only_keys}
model = XGBClassifier(**model_params)
model.fit(x_train_vector, y_train)

In [None]:
# draw two random statements; their index labels are kept in text_index
text = df['statement'].sample(2)
text_index = text.index
print(text.index)
# keep only the first sampled statement as the string to classify
text = text.values[0]
text

In [None]:
# classify the sampled statement with the fitted TF-IDF vectorizer and model
predict_new_text(text, vectorizer, model)

In [None]:
# encoded status of the sampled rows, selected by row labels and column in one .loc call
df.loc[text_index, 'status']