# **Prepare Dataset**

In [1]:
!pip install -q kaggle

In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d "suchintikasarkar/sentiment-analysis-for-mental-health"

Dataset URL: https://www.kaggle.com/datasets/suchintikasarkar/sentiment-analysis-for-mental-health
License(s): DbCL-1.0
Downloading sentiment-analysis-for-mental-health.zip to /content
 90% 10.0M/11.1M [00:01<00:00, 13.7MB/s]
100% 11.1M/11.1M [00:01<00:00, 8.87MB/s]


In [3]:
import zipfile

dataset_zip = zipfile.ZipFile("/content/sentiment-analysis-for-mental-health.zip", "r")
dataset_zip.extractall()
dataset_zip.close()

# **Read and Define Data to Variable**

In [1]:
import pandas as pd
# raw_data = pd.read_csv("/content/Combined Data.csv")
raw_data = pd.read_csv("../data/sentiments.csv")

In [2]:
df = raw_data.dropna()
print("Total data (rows):", len(df), "\n")

df.sample(frac = 1).head()

Total data (rows): 52681 



Unnamed: 0.1,Unnamed: 0,statement,status
8649,8649,It pains me that this has happened to me. Gran...,Depression
18380,18380,every night i son and cry and promise myself t...,Depression
48195,48195,Sometimes I find comfort in my depression It h...,Depression
7701,7701,I do not have any motivation to be alive anymo...,Depression
14741,14741,Who knows who was the first person who thought...,Depression


# **Label Encoding**

In [3]:
classes = df['status'].unique()
print("Total classes/ labels:", len(classes), "\n")
classes

Total classes/ labels: 7 



array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [4]:
import numpy as np
import pandas as pd

def to_class_num(class_name):
    class_list_num = np.where(classes == class_name)[0][0]
    return class_list_num

def to_class_name(class_num):
    return classes[class_num]

classes_pd = pd.DataFrame({
    "Class Name" : classes,
    "Value" : [to_class_num(class_name) for class_name in classes]
})
display(classes_pd)

Unnamed: 0,Class Name,Value
0,Anxiety,0
1,Normal,1
2,Depression,2
3,Suicidal,3
4,Stress,4
5,Bipolar,5
6,Personality disorder,6


# **Train-Test Split & Preprocessing Dataset**

In [5]:
from sklearn.model_selection import train_test_split

x_data = df['statement']
y_data = df['status']

x_train, x_test, y_train, y_test = train_test_split(x_data.values, y_data.values)
print(len(x_train), len(y_train), len(x_test), len(y_test))



39510 39510 13171 13171


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token='UNK', lower = True)
tokenizer.fit_on_texts(x_data.values)

x_train_tokenized = tokenizer.texts_to_sequences(x_train)
x_test_tokenized = tokenizer.texts_to_sequences(x_test)

In [7]:
max_len = max([len(x) for x in x_train_tokenized])
print(max_len)

6300


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_tokenized_padded = pad_sequences(x_train_tokenized, maxlen = max_len)
x_test_tokenized_padded = pad_sequences(x_test_tokenized, maxlen = max_len)

x_train_tokenized_padded[0]

array([  0,   0,   0, ...,  43,   5, 120])

In [9]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

63341

# **Defining Device and Transforming Dataset**

In [10]:
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Device:", device, "\n")

Device: cpu 



In [11]:
from torch.utils.data import Dataset

class CustomizedDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        x = torch.from_numpy(self.x[idx])
        y = torch.tensor(to_class_num(self.y[idx]))
        return x, y

sample = CustomizedDataset(x_train_tokenized_padded, y_train)
sample[0]

(tensor([  0,   0,   0,  ...,  43,   5, 120], dtype=torch.int32), tensor(1))

# **Batch Data Loading**

In [12]:
from torch.utils.data import DataLoader

batch_size = 24
train_dataloader = DataLoader(CustomizedDataset(x_train_tokenized_padded, y_train), shuffle = True, batch_size = batch_size)
test_dataloader = DataLoader(CustomizedDataset(x_test_tokenized_padded, y_test), shuffle = True, batch_size = batch_size)

train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x22df6ea3a60>

# **Build Model**

In [13]:
import torch.nn as nn

num_lstm = 1
num_hidden = 64
embedding_size = 64

softmax = nn.Softmax(dim=1)

class SentimentAnalysisModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embd = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.lstm = nn.LSTM(embedding_size, num_hidden, bidirectional = True, num_layers=num_lstm)
        self.linear = nn.Linear(2 * num_hidden * max_len, len(classes))

    def forward(self, x):
        logits = self.embd(x)
        logits , (h_n, c_n) = self.lstm(logits)
        logits = logits.flatten(start_dim = 1, end_dim=-1)
        logits = self.linear(logits)
        probability = softmax(logits)
        return logits, probability

model = SentimentAnalysisModel()
model

SentimentAnalysisModel(
  (embd): Embedding(63341, 64, padding_idx=0)
  (lstm): LSTM(64, 64, bidirectional=True)
  (linear): Linear(in_features=806400, out_features=7, bias=True)
)

In [14]:
test = x_train_tokenized_padded[0]
test = torch.from_numpy(test[None, :])
print(test.shape)
pred, prob = model(test)
print(pred.shape)

torch.Size([1, 6300])
torch.Size([1, 7])


In [15]:
from tqdm import tqdm
import torch
import torch.nn as nn

cross_entropy = nn.CrossEntropyLoss()
model = model.to(device)
optimizer = torch.optim.Adagrad(model.parameters(), lr=1e-2, weight_decay=1e-4)
epochs = 3   # 32

avg_loss = []
avg_acc = []

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    acc = 0
    loss_acc = 0
    n = 0

    for x, y in tqdm(train_dataloader):
        x, y = x.to(device), y.to(device)

        pred, prob = model(x)

        loss = cross_entropy(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        output = pred.argmax(axis=1)

        acc += (output == y).sum().item()
        n += y.shape[0]
        loss_acc += loss.item()

    avg_loss.append(loss_acc / len(train_dataloader))
    avg_acc.append(acc / n)
    print('avg loss:', loss_acc / len(train_dataloader))
    print('avg acc :', acc / n)

Epoch 1/3


 16%|█▌        | 262/1647 [05:17<27:59,  1.21s/it]  


KeyboardInterrupt: 

In [156]:
model.eval()
n = 0
acc = 0
for x, y in tqdm(test_dataloader):
    pred, prob = model(x.to(device))
    output = pred.argmax(axis = 1)
    n += y.shape[0]
    acc += (output == y.to(device)).sum().item()
print('avg test acc:', acc / n)

100%|██████████| 549/549 [00:09<00:00, 58.17it/s]

avg test acc: 0.701009794244932





In [133]:
# Save entire model

PATH = '/content/my_model_4.h5'
torch.save(model, PATH)

# **Predicting**

In [164]:
def predict_sentiment(text):
    input_tensor = torch.from_numpy(pad_sequences(tokenizer.texts_to_sequences([text]), maxlen = max_len))
    pred, prob = model(input_tensor.to(device))
    pred = pred.cpu().detach().numpy().argmax(axis=1).flatten()[0]
    return to_class_name(pred), prob.max().item()

predict_sentiment(input("What's your matter?\n"))

What's your matter?
im hopeless, need someone here before i kill myself


('Suicidal', 0.8255137205123901)

In [155]:
# im hopeless, need someone here before i kill myself
# I feel very stressed from a few days everyone hates me I have not been happy from over a month
# this is the happiest day of my life, my life is going to be perfect  // 1 by 1: normal, both: suicide??