In [1]:
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import datetime as dt
from sklearn.svm import SVC


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the original dataset, rename the target column to `label`, and keep
# only the two columns used downstream.
df = (
    pd.read_csv('missom_coded_v1_santosh.csv')
    .rename(columns={'label_minority_stress': 'label'})
    [['text', 'label']]
)

# Missing values per column before cleaning.
empty_cells = df.isnull().sum()
print(empty_cells)

# Re-bind rather than dropna(inplace=True): `df` is derived from a column
# selection, and in-place mutation of such a frame is a SettingWithCopy hazard.
df = df.dropna()

# Missing values per column after cleaning (expected to be all zeros).
empty_cells = df.isnull().sum()
print(empty_cells)



text     0
label    0
dtype: int64
text     0
label    0
dtype: int64


In [3]:
# Model input is the `text` column; the prediction target is the `label` column.
X, y = df["text"], df["label"]

In [4]:

# Target share of the full dataset for each partition.
train_ratio = 0.65
valid_ratio = 0.15
test_ratio = 0.20

# Stratified train/validation/test split, done once in two stages.
# Stage 1: hold out the test set from the full data.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=test_ratio, stratify=y, random_state=42
)

# Stage 2: carve the validation set out of the remaining pool. The fraction is
# rescaled because that pool is only (train_ratio + valid_ratio) of the data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=valid_ratio / (train_ratio + valid_ratio),
    stratify=y_train_val,
    random_state=42,
)

In [5]:
# Shapes of every partition, in the order: X train/test/val, then y train/test/val.
tuple(part.shape for part in (X_train, X_test, X_val, y_train, y_test, y_val))

((3762,), (1158,), (869,), (3762,), (1158,), (869,))

In [6]:
# Build TF-IDF feature vectors. The vectorizer is fitted on the training texts
# only; the test and validation texts are transformed with that fitted state.
vectorizer = TfidfVectorizer()
X_train_tf_idf = vectorizer.fit_transform(X_train)
X_test_tf_idf, X_val_tf_idf = map(vectorizer.transform, (X_test, X_val))

In [7]:
# Read the shape straight from the sparse matrices; converting to dense arrays
# first would allocate the full (rows x vocabulary) matrix just to inspect it.
X_test_tf_idf.shape, X_train_tf_idf.shape

((1158, 16277), (3762, 16277))

In [8]:
# Dense, labelled view of the training TF-IDF matrix: one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(data=X_train_tf_idf.toarray(), columns=feature_names)

Unnamed: 0,00,000,00124,00am,00pm,00s,01,0104,017,03,...,𝚜𝚘,𝚝𝚑𝚊𝚝,𝚝𝚑𝚎𝚜𝚎,𝚝𝚑𝚒𝚗𝚔,𝚝𝚑𝚒𝚜,𝚟𝚎𝚛𝚢,𝚠𝚎𝚛𝚎,𝚠𝚑𝚢,𝚠𝚒𝚕𝚕,𝚢𝚘𝚞
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Model Reports
from sklearn.metrics import classification_report,accuracy_score
def report(y_true, y_pred, labels):
    """Build a printable classification report for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Class labels to include in the report. Callers in this
            notebook pass the fitted estimator's ``classes_``. ``None`` lets
            scikit-learn infer the labels from the data.

    Returns:
        str: The text of ``classification_report`` prefixed with a
        ``Classification Report:`` header line.
    """
    # Forward `labels` so the argument is honoured; it was previously accepted
    # but never used.
    rep = classification_report(y_true=y_true, y_pred=y_pred, labels=labels)
    return f'Classification Report:\n{rep}'

In [10]:
# 1. Support Vector Machine (linear kernel)
start = dt.datetime.now()
svc_model = SVC(kernel='linear').fit(X_train_tf_idf, y_train)
elapsed = dt.datetime.now() - start
print('Elapsed time: ', str(elapsed))

# Accuracy on the validation split
y_pred_val = svc_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test split
test_predictions = svc_model.predict(X_test_tf_idf)
print(report(y_true=y_test, y_pred=test_predictions, labels=svc_model.classes_))

Elapsed time:  0:00:13.910785
Validation Accuracy: 0.8331415420023015
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.95      0.89       910
           1       0.63      0.31      0.42       248

    accuracy                           0.81      1158
   macro avg       0.73      0.63      0.65      1158
weighted avg       0.79      0.81      0.79      1158



In [11]:
# 2. Logistic Regression
start = dt.datetime.now()
lr_model = LogisticRegression().fit(X_train_tf_idf, y_train)
elapsed = dt.datetime.now() - start
print('Elapsed time: ', str(elapsed))

# Accuracy on the validation split
y_pred_val = lr_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test split
test_predictions = lr_model.predict(X_test_tf_idf)
print(report(y_true=y_test, y_pred=test_predictions, labels=lr_model.classes_))


Elapsed time:  0:00:00.502030
Validation Accuracy: 0.8193325661680092
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.96      0.89       910
           1       0.65      0.25      0.36       248

    accuracy                           0.81      1158
   macro avg       0.74      0.61      0.62      1158
weighted avg       0.79      0.81      0.78      1158



In [12]:
# 3. Random Forest
start = dt.datetime.now()
# Seed the forest (same seed as the data split) so bootstrap sampling and
# feature sub-sampling, and therefore the metrics below, repeat across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tf_idf, y_train)
print('Elapsed time: ', str(dt.datetime.now() - start))

# Evaluate the accuracy on the validation data
y_pred_val = rf_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test data
test_predictions = rf_model.predict(X_test_tf_idf)
print(report(y_test, test_predictions, rf_model.classes_))


Elapsed time:  0:00:03.060959
Validation Accuracy: 0.7894131185270425
Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       910
           1       0.75      0.02      0.05       248

    accuracy                           0.79      1158
   macro avg       0.77      0.51      0.46      1158
weighted avg       0.78      0.79      0.70      1158



In [13]:
# 4. Multinomial Naive Bayes
start = dt.datetime.now()
mnb_model = MultinomialNB().fit(X_train_tf_idf, y_train)
elapsed = dt.datetime.now() - start
print('Elapsed time: ', str(elapsed))

# Accuracy on the validation split
y_pred_val = mnb_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test split
y_pred = mnb_model.predict(X_test_tf_idf)
print(report(y_true=y_test, y_pred=y_pred, labels=mnb_model.classes_))



Elapsed time:  0:00:00.014334
Validation Accuracy: 0.7848101265822784
Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       910
           1       0.00      0.00      0.00       248

    accuracy                           0.79      1158
   macro avg       0.39      0.50      0.44      1158
weighted avg       0.62      0.79      0.69      1158



In [14]:
# 5. AdaBoost
start = dt.datetime.now()
# Seed the ensemble (same seed as the data split) so repeated runs of this
# cell produce the same fitted model and metrics.
ab_model = AdaBoostClassifier(random_state=42)
ab_model.fit(X_train_tf_idf, y_train)
print('Elapsed time: ', str(dt.datetime.now() - start))

# Evaluate the accuracy on the validation data
y_pred_val = ab_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test data
test_predictions = ab_model.predict(X_test_tf_idf)
print(report(y_test, test_predictions, ab_model.classes_))


Elapsed time:  0:00:03.499422
Validation Accuracy: 0.8147295742232451
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       910
           1       0.52      0.39      0.45       248

    accuracy                           0.79      1158
   macro avg       0.68      0.65      0.66      1158
weighted avg       0.78      0.79      0.78      1158



In [15]:
# 6. Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
start = dt.datetime.now()
# Seed the network (same seed as the data split): weight initialisation and
# batch shuffling are random, so an unseeded fit gives different metrics each run.
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_train_tf_idf, y_train)
print('Elapsed time: ', str(dt.datetime.now() - start))

# Evaluate the accuracy on the validation data
y_pred_val = mlp_model.predict(X_val_tf_idf)
accuracy_val = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", accuracy_val)

# Per-class precision/recall/F1 on the test data
test_predictions = mlp_model.predict(X_test_tf_idf)
print(report(y_test, test_predictions, mlp_model.classes_))

Elapsed time:  0:01:25.270509
Validation Accuracy: 0.8170310701956272
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       910
           1       0.53      0.38      0.44       248

    accuracy                           0.80      1158
   macro avg       0.69      0.65      0.66      1158
weighted avg       0.78      0.80      0.78      1158

