In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [13]:
df = pd.read_csv("../mental-health-chatbot-and-sentiment-analysis/Dataset/dataframe_cleaned.csv")

In [17]:
processed_df = df["cleaned_statement"]
sentiment = df["status"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(processed_df, sentiment, test_size = 0.2, random_state = 0)

In [19]:
print(f"X_train size: {len(X_train)}")
print(f"X_test size: {len(X_test)}")
print(f"y_train size: {len(y_train)}")
print(f"y_test size: {len(y_test)}")

X_train size: 40732
X_test size: 10183
y_train size: 40732
y_test size: 10183


In [20]:
vectoriser = TfidfVectorizer(ngram_range=(1,3), max_features=500000)
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))

Vectoriser fitted.
No. of feature_words:  500000


In [21]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [11]:
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
}

bnb = BernoulliNB()

grid_search = GridSearchCV(estimator=bnb, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

best_bnb = grid_search.best_estimator_
best_bnb.fit(X_train, y_train)

y_pred_best_bnb = best_bnb.predict(X_test)

print("Tuned Bernoulli Naive Bayes")
print(classification_report(y_test, y_pred_best_bnb))

Best Parameters: {'alpha': 0.1}
Best Score: 0.5982273912294922
Tuned Bernoulli Naive Bayes
                      precision    recall  f1-score   support

             Anxiety       0.75      0.55      0.64       727
             Bipolar       0.81      0.42      0.56       506
          Depression       0.61      0.51      0.55      2997
              Normal       0.61      0.96      0.74      3128
Personality disorder       0.72      0.09      0.17       193
              Stress       0.54      0.20      0.29       458
            Suicidal       0.58      0.47      0.52      2174

            accuracy                           0.62     10183
           macro avg       0.66      0.46      0.49     10183
        weighted avg       0.62      0.62      0.59     10183



In [12]:
scaler = StandardScaler(with_mean = False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)

y_test_encoded = label_encoder.transform(y_test)


In [14]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

kn = KNeighborsClassifier()

grid_search = GridSearchCV(kn, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_encoded)

print("Best parameters found: ", grid_search.best_params_)

best_kn = grid_search.best_estimator_
y_pred_encoded = best_kn.predict(X_test_scaled)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

print("Classification report for KNeighborsClassifier")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found:  {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Classification report for KNeighborsClassifier
                      precision    recall  f1-score   support

             Anxiety       0.68      0.05      0.10       727
             Bipolar       0.00      0.00      0.00       506
          Depression       0.67      0.02      0.04      2997
              Normal       0.31      0.99      0.47      3128
Personality disorder       1.00      0.01      0.01       193
              Stress       0.78      0.02      0.03       458
            Suicidal       0.63      0.03      0.06      2174

            accuracy                           0.32     10183
           macro avg       0.58      0.16      0.10     10183
        weighted avg       0.53      0.32      0.18     10183



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
scores = cross_val_score(best_kn, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy scores: {scores}")
print(f"Mean cross-validation accuracy: {scores.mean()}")

Cross-validation accuracy scores: [0.3230637  0.32527311 0.32568132 0.32568132 0.32408544]
Mean cross-validation accuracy: 0.32475697675396276


In [22]:
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(X_train,y_train)

y_pred=rf.predict(X_test)

print(classification_report(y_test,y_pred))

                      precision    recall  f1-score   support

             Anxiety       0.92      0.42      0.58       727
             Bipolar       0.94      0.22      0.36       506
          Depression       0.55      0.73      0.63      2997
              Normal       0.70      0.97      0.81      3128
Personality disorder       1.00      0.01      0.02       193
              Stress       0.89      0.02      0.03       458
            Suicidal       0.69      0.44      0.54      2174

            accuracy                           0.65     10183
           macro avg       0.81      0.40      0.42     10183
        weighted avg       0.69      0.65      0.61     10183



In [23]:
joblib.dump(rf, "random_forest_model.pkl")
joblib.dump(vectoriser, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']