# Imports

In [2]:
import string

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB # complement naive bias is better for unblanced data

# data

In [6]:
data_df = pd.read_csv('../data/Combined Data.csv', index_col=0)
data_df = data_df.dropna(axis=0).reset_index(drop=True)
data_df = data_df.drop_duplicates().reset_index(drop=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51093 entries, 0 to 51092
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  51093 non-null  object
 1   status     51093 non-null  object
dtypes: object(2)
memory usage: 798.5+ KB


# Data preprocessing

In [7]:
def preprocessing(sentence):
    # Removing whitespaces
    sentence = sentence.strip()

    # Lowercasing
    sentence = sentence.lower()

    # Removing numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Removing punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')

    # Tokenizing
    tokenized = word_tokenize(sentence)

    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized]
    cleaned_sentence = " ".join(lemmatized)
    return cleaned_sentence
data_df['clean_statement'] = data_df['statement'].apply(preprocessing)
data_df.head()

Unnamed: 0,statement,status,clean_statement
0,oh my gosh,Anxiety,oh my gosh
1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,all wrong back off dear forward doubt stay in ...
3,I've shifted my focus to something else but I'...,Anxiety,ive shifted my focus to something else but im ...
4,"I'm restless and restless, it's been a month n...",Anxiety,im restless and restless it been a month now b...


# test train

In [8]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    data_df["clean_statement"], data_df["status"], test_size=test_size, stratify=data_df["status"], random_state=42
)

# preprocessing pipeline

In [9]:
pipe_unscaledX = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
])
pipe_scaledX = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('scaler',StandardScaler(with_mean=False))
])

# modeling

## possible candidates

In [10]:
models = {'log_clf':LogisticRegression(random_state=42, max_iter=1000),
                           'RF_clf': RandomForestClassifier(random_state=42, n_estimators=100, max_depth=20),
                           #'svm_clf':SVC(kernel='sigmoid',class_weight='balanced',max_iter=200),
                           'svm_clf': SVC(kernel='linear',class_weight='balanced',max_iter=1000),
                           'C_NB_clf':ComplementNB(alpha=1)

}
param_grids = {
        'log_clf': {
            'classifier__C': [0.1, 1.0, 10.0],
            'classifier__max_iter': [1000, 2000],
            'preprocessing__vectorizer__max_features': [5000, 8000, 10000]
        },
        'RF_clf': {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
            'preprocessing__vectorizer__max_features': [5000, 8000, 10000]
        },
        'C_NB_clf': {
            'classifier__alpha': [0.1, 1, 5.0, 10.0],
            'preprocessing__vectorizer__max_features': [5000, 8000, 10000]
        },
        'svm_clf': {
            'classifier__C': [0.1, 1.0, 5,0, 10.0],
            'classifier__max_iter': [1000, 2000]
        }
    }


### grid searching over all model candidates

#### saving best model

In [11]:
from datetime import datetime
import pickle
def save_model(grid_search_result,model_name, X_test, y_test):
    """Save the best model in pickle file, also save the classification report"""

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # save model
    best_model = grid_search_result.best_estimator_
    best_hyperparameter = grid_search_result.best_params_
    model_file_name = timestamp + '_' + model_name +'_model' + '.pkl'
    parameter_file_name = timestamp + '_' + model_name+'_hyperparameters' + '.pkl'
    result_file_name = timestamp + '_' + model_name+'Sresult' + '.pkl'

    with open(model_file_name,'wb') as f:
        pickle.dump(best_model,f)
    with open(parameter_file_name,'wb') as f:
        pickle.dump(best_hyperparameter,f)

    # Save full result
    y_pred = best_model.predict(X_test)
    result = classification_report(y_test, y_pred)
    with open(result_file_name, 'wb') as f:
        pickle.dump(result, f)

In [12]:
fold  = StratifiedKFold(3)
best_models_hyperparameters=dict()
best_models_f1=dict()

for (model_name,clf) in models.items():
    print(model_name)
    #make pipeline
    if model_name == 'svm_clf':
        pipe = Pipeline([('preprocessing',pipe_scaledX),
                     ('classifier',models[model_name])])
    else:
        pipe = Pipeline([('preprocessing',pipe_unscaledX),
                     ('classifier',models[model_name])])

    #grid search
    grid_search = GridSearchCV(
        pipe,
        param_grid=param_grids[model_name],
        cv=fold,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=1)
    #fit
    grid_search.fit(X_train,y_train)

    #evaluate model
    best_model =  grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(classification_report(y_true=y_test,y_pred=y_pred))

    #calculate f1 score
    f1_weighted = f1_score(y_test, y_pred, average='weighted')

    # save result in current session
    best_models_hyperparameters[model_name] = grid_search.best_params_
    best_models_f1[model_name] = f1_weighted

    #save pickle files
    save_model(grid_search,model_name,X_test,y_test)


log_clf
Fitting 3 folds for each of 18 candidates, totalling 54 fits
                      precision    recall  f1-score   support

             Anxiety       0.78      0.71      0.74       725
             Bipolar       0.86      0.63      0.73       500
          Depression       0.68      0.73      0.70      3019
              Normal       0.86      0.97      0.91      3208
Personality disorder       0.83      0.27      0.41       179
              Stress       0.67      0.41      0.51       459
            Suicidal       0.70      0.65      0.67      2129

            accuracy                           0.76     10219
           macro avg       0.77      0.62      0.67     10219
        weighted avg       0.76      0.76      0.75     10219

RF_clf
Fitting 3 folds for each of 18 candidates, totalling 54 fits




                      precision    recall  f1-score   support

             Anxiety       0.81      0.48      0.60       725
             Bipolar       0.92      0.32      0.47       500
          Depression       0.56      0.80      0.66      3019
              Normal       0.81      0.96      0.88      3208
Personality disorder       1.00      0.06      0.11       179
              Stress       0.84      0.07      0.13       459
            Suicidal       0.72      0.50      0.59      2129

            accuracy                           0.69     10219
           macro avg       0.81      0.45      0.49     10219
        weighted avg       0.73      0.69      0.67     10219

svm_clf
Fitting 3 folds for each of 10 candidates, totalling 30 fits


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/home/sanju/.pyenv/versions/3.10.6/envs/venv_stress_sense1/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/sanju/.pyenv/versions/3.10.6/envs/venv_stress_sense1/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/sanju/.pyenv/versions/3.10.6/envs/venv_stress_sense1/lib/python3.10/site-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_par

                      precision    recall  f1-score   support

             Anxiety       0.62      0.69      0.65       725
             Bipolar       0.64      0.64      0.64       500
          Depression       0.56      0.61      0.59      3019
              Normal       0.87      0.82      0.85      3208
Personality disorder       0.55      0.35      0.43       179
              Stress       0.43      0.39      0.41       459
            Suicidal       0.53      0.51      0.52      2129

            accuracy                           0.65     10219
           macro avg       0.60      0.57      0.58     10219
        weighted avg       0.65      0.65      0.65     10219

C_NB_clf
Fitting 3 folds for each of 12 candidates, totalling 36 fits
                      precision    recall  f1-score   support

             Anxiety       0.48      0.81      0.60       725
             Bipolar       0.64      0.67      0.65       500
          Depression       0.70      0.55      0.62      3