# **Prepare Dataset**

In [None]:
!pip install -q kaggle

In [1]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d "suchintikasarkar/sentiment-analysis-for-mental-health"

Dataset URL: https://www.kaggle.com/datasets/suchintikasarkar/sentiment-analysis-for-mental-health
License(s): DbCL-1.0
Downloading sentiment-analysis-for-mental-health.zip to /content
 81% 9.00M/11.1M [00:00<00:00, 28.5MB/s]
100% 11.1M/11.1M [00:00<00:00, 33.0MB/s]


In [2]:
import zipfile

dataset_zip = zipfile.ZipFile("/content/sentiment-analysis-for-mental-health.zip", "r")
dataset_zip.extractall()
dataset_zip.close()

# **Read and Define Data to Variable**

In [3]:
import pandas as pd
raw_data = pd.read_csv("/content/Combined Data.csv")

# Local computing
# raw_data = pd.read_csv("../data/sentiments.csv")

In [4]:
df = raw_data.dropna()
print("Total data (rows):", len(df), "\n")

df.sample(frac = 1).head()

Total data (rows): 52681 



Unnamed: 0.1,Unnamed: 0,statement,status
24856,24856,she found it on accident and could not help bu...,Depression
2224,2224,"for once, this girl of yours said spicy words,...",Normal
34826,34826,3 month old corn dog I posted this in r/foodpo...,Anxiety
41491,41491,educational depression,Depression
38952,38952,a friend of mine gave me feedback about someth...,Depression


# **Label Encoding**

In [5]:
classes = df['status'].unique()
print("Total classes/ labels:", len(classes), "\n")
classes

Total classes/ labels: 7 



array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [6]:
import numpy as np
import pandas as pd

def to_class_num(class_name):
    class_list_num = np.where(classes == class_name)[0][0]
    return class_list_num

def to_class_name(class_num):
    return classes[class_num]

classes_pd = pd.DataFrame({
    "Class Name" : classes,
    "Value" : [to_class_num(class_name) for class_name in classes]
})
display(classes_pd)

Unnamed: 0,Class Name,Value
0,Anxiety,0
1,Normal,1
2,Depression,2
3,Suicidal,3
4,Stress,4
5,Bipolar,5
6,Personality disorder,6


# **Train-Test Split & Preprocessing Dataset**

In [7]:
from sklearn.model_selection import train_test_split

x_data = df['statement']
y_data = df['status']

x_train, x_test, y_train, y_test = train_test_split(x_data.values, y_data.values)
print(len(x_train), len(y_train), len(x_test), len(y_test))

39510 39510 13171 13171


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token='UNK', lower = True)
tokenizer.fit_on_texts(x_data.values)

x_train_tokenized = tokenizer.texts_to_sequences(x_train)
x_test_tokenized = tokenizer.texts_to_sequences(x_test)

In [9]:
max_len = max([len(x) for x in x_train_tokenized])
print(max_len)

6300


In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_tokenized_padded = pad_sequences(x_train_tokenized, maxlen = max_len)
x_test_tokenized_padded = pad_sequences(x_test_tokenized, maxlen = max_len)

x_train_tokenized_padded[0]

array([  0,   0,   0, ...,   2,  47, 205], dtype=int32)

In [11]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

63341

# **Defining Device and Transforming Dataset**

In [12]:
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Device:", device, "\n")

Device: cuda:0 



In [13]:
from torch.utils.data import Dataset

class CustomizedDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        x = torch.from_numpy(self.x[idx])
        y = torch.tensor(to_class_num(self.y[idx]))
        return x, y

sample = CustomizedDataset(x_train_tokenized_padded, y_train)
sample[0]

(tensor([  0,   0,   0,  ...,   2,  47, 205], dtype=torch.int32), tensor(2))

# **Batch Data Loading**

In [14]:
from torch.utils.data import DataLoader

batch_size = 32
train_dataloader = DataLoader(CustomizedDataset(x_train_tokenized_padded, y_train), shuffle = True, batch_size = batch_size)
test_dataloader = DataLoader(CustomizedDataset(x_test_tokenized_padded, y_test), shuffle = True, batch_size = batch_size)

train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f21473dfac0>

# **Define & Build Model**

In [None]:
import torch.nn as nn

num_lstm = 1
num_hidden = 64
embedding_size = 64

softmax = nn.Softmax(dim=1)

class SentimentAnalysisModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embd = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.lstm = nn.LSTM(embedding_size, num_hidden, bidirectional = True, num_layers=num_lstm)
        self.linear = nn.Linear(2 * num_hidden * max_len, len(classes))

    def forward(self, x):
        logits = self.embd(x)
        logits , (h_n, c_n) = self.lstm(logits)
        logits = logits.flatten(start_dim = 1, end_dim=-1)
        logits = self.linear(logits)
        probability = softmax(logits)
        return logits, probability

In [None]:
model = SentimentAnalysisModel()
model.eval()

SentimentAnalysisModel(
  (embd): Embedding(63341, 64, padding_idx=0)
  (lstm): LSTM(64, 64, bidirectional=True)
  (linear): Linear(in_features=806400, out_features=7, bias=True)
)

In [16]:
test = x_train_tokenized_padded[0]
test = torch.from_numpy(test[None, :])
print(test.shape)
pred, prob = model(test)
print(pred.shape)

torch.Size([1, 6300])
torch.Size([1, 7])


In [17]:
from tqdm import tqdm
import torch
import torch.nn as nn

cross_entropy = nn.CrossEntropyLoss()   # criterion
model = model.to(device)
optimizer = torch.optim.Adagrad(model.parameters(), lr=1e-2, weight_decay=1e-4)
epochs = 32

avg_loss = []
avg_acc = []

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    acc = 0
    loss_acc = 0
    n = 0

    for x, y in tqdm(train_dataloader):
        x, y = x.to(device), y.to(device)

        pred, prob = model(x)

        loss = cross_entropy(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        output = pred.argmax(axis=1)

        acc += (output == y).sum().item()
        n += y.shape[0]
        loss_acc += loss.item()

    avg_loss.append(loss_acc / len(train_dataloader))
    avg_acc.append(acc / n)
    print('avg loss:', loss_acc / len(train_dataloader))
    print('avg acc :', acc / n)

Epoch 1/32


100%|██████████| 1235/1235 [01:00<00:00, 20.31it/s]


avg loss: 2.4962511766777347
avg acc : 0.5255125284738041
Epoch 2/32


100%|██████████| 1235/1235 [01:01<00:00, 19.99it/s]


avg loss: 0.8402693949971605
avg acc : 0.6854720323968616
Epoch 3/32


100%|██████████| 1235/1235 [01:03<00:00, 19.57it/s]


avg loss: 0.7074310944871864
avg acc : 0.7455074664641863
Epoch 4/32


100%|██████████| 1235/1235 [01:03<00:00, 19.30it/s]


avg loss: 0.6336848480740057
avg acc : 0.7773728170083524
Epoch 5/32


100%|██████████| 1235/1235 [01:04<00:00, 19.11it/s]


avg loss: 0.5758404076582024
avg acc : 0.8026828650974437
Epoch 6/32


100%|██████████| 1235/1235 [01:05<00:00, 18.83it/s]


avg loss: 0.5291206498619033
avg acc : 0.8197418375094913
Epoch 7/32


100%|██████████| 1235/1235 [01:04<00:00, 19.00it/s]


avg loss: 0.4951182025043588
avg acc : 0.8325234117944824
Epoch 8/32


100%|██████████| 1235/1235 [01:04<00:00, 19.06it/s]


avg loss: 0.4621038211381387
avg acc : 0.8450771956466717
Epoch 9/32


100%|██████████| 1235/1235 [01:04<00:00, 19.08it/s]


avg loss: 0.43835303608463844
avg acc : 0.8543659832953683
Epoch 10/32


100%|██████████| 1235/1235 [01:05<00:00, 18.98it/s]


avg loss: 0.41448259582886327
avg acc : 0.864262212098203
Epoch 11/32


100%|██████████| 1235/1235 [01:04<00:00, 19.06it/s]


avg loss: 0.3953434998327904
avg acc : 0.8701088332067831
Epoch 12/32


100%|██████████| 1235/1235 [01:04<00:00, 19.07it/s]


avg loss: 0.37967033253507576
avg acc : 0.8766135155656796
Epoch 13/32


100%|██████████| 1235/1235 [01:04<00:00, 19.07it/s]


avg loss: 0.36410564709047555
avg acc : 0.8813718046064287
Epoch 14/32


100%|██████████| 1235/1235 [01:04<00:00, 19.11it/s]


avg loss: 0.3468519591790462
avg acc : 0.8887117185522653
Epoch 15/32


100%|██████████| 1235/1235 [01:04<00:00, 19.10it/s]


avg loss: 0.3361218595492695
avg acc : 0.8933181473044799
Epoch 16/32


100%|██████████| 1235/1235 [01:04<00:00, 19.09it/s]


avg loss: 0.3231180563146769
avg acc : 0.8983295368261199
Epoch 17/32


100%|██████████| 1235/1235 [01:04<00:00, 19.10it/s]


avg loss: 0.3109030331375628
avg acc : 0.9028600354340673
Epoch 18/32


100%|██████████| 1235/1235 [01:04<00:00, 19.09it/s]


avg loss: 0.301549285107296
avg acc : 0.9067324727916983
Epoch 19/32


100%|██████████| 1235/1235 [01:04<00:00, 19.06it/s]


avg loss: 0.2912261272912566
avg acc : 0.9100733991394584
Epoch 20/32


100%|██████████| 1235/1235 [01:04<00:00, 19.09it/s]


avg loss: 0.28143014861987187
avg acc : 0.9141989369779803
Epoch 21/32


100%|██████████| 1235/1235 [01:04<00:00, 19.12it/s]


avg loss: 0.2744057134517774
avg acc : 0.917160212604404
Epoch 22/32


100%|██████████| 1235/1235 [01:04<00:00, 19.07it/s]


avg loss: 0.2665395705806099
avg acc : 0.9198936977980258
Epoch 23/32


100%|██████████| 1235/1235 [01:04<00:00, 19.05it/s]


avg loss: 0.2589831673905917
avg acc : 0.9231333839534295
Epoch 24/32


100%|██████████| 1235/1235 [01:04<00:00, 19.04it/s]


avg loss: 0.25073659392262276
avg acc : 0.9276638825613769
Epoch 25/32


100%|██████████| 1235/1235 [01:04<00:00, 19.13it/s]


avg loss: 0.24380939290349782
avg acc : 0.9273854720323969
Epoch 26/32


100%|██████████| 1235/1235 [01:04<00:00, 19.11it/s]


avg loss: 0.23880300288680595
avg acc : 0.9305745380916224
Epoch 27/32


100%|██████████| 1235/1235 [01:04<00:00, 19.06it/s]


avg loss: 0.23359678971320993
avg acc : 0.9326499620349279
Epoch 28/32


100%|██████████| 1235/1235 [01:04<00:00, 19.06it/s]


avg loss: 0.2273765985088551
avg acc : 0.9356618577575297
Epoch 29/32


100%|██████████| 1235/1235 [01:04<00:00, 19.09it/s]


avg loss: 0.22124134683415958
avg acc : 0.9372057706909643
Epoch 30/32


100%|██████████| 1235/1235 [01:04<00:00, 19.12it/s]


avg loss: 0.2156912415254454
avg acc : 0.9398127056441408
Epoch 31/32


100%|██████████| 1235/1235 [01:04<00:00, 19.14it/s]


avg loss: 0.21141327249436725
avg acc : 0.9407491774234371
Epoch 32/32


100%|██████████| 1235/1235 [01:04<00:00, 19.07it/s]

avg loss: 0.20577886733509268
avg acc : 0.9421918501645153





In [18]:
n = 0
acc = 0
for x, y in tqdm(test_dataloader):
    pred, prob = model(x.to(device))
    output = pred.argmax(axis = 1)
    n += y.shape[0]
    acc += (output == y.to(device)).sum().item()
print('avg test acc:', acc / n)

100%|██████████| 412/412 [00:08<00:00, 48.07it/s]

avg test acc: 0.7240148811783463





In [23]:
# Save entire model

PATH = '/content/sentiment_analysis_model.h5'
torch.save(model, PATH)

# **Predicting**

In [19]:
def predict_sentiment(text):
    input_tensor = torch.from_numpy(pad_sequences(tokenizer.texts_to_sequences([text]), maxlen = max_len))
    pred, prob = model(input_tensor.to(device))
    pred = pred.cpu().detach().numpy().argmax(axis=1).flatten()[0]
    return to_class_name(pred), prob.max().item()

In [24]:
predict_sentiment(input("What's your matter?\n"))

What's your matter?
my life is going to be perfect


('Normal', 0.6652073264122009)

In [None]:
# im hopeless, need someone here before i kill myself
# I feel very stressed from a few days everyone hates me I have not been happy from over a month
# this is the happiest day of my life, my life is going to be perfect  // 1 by 1: normal, both: suicide??