In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [3]:
# ----------------------------
# Load raw text dataset
# ----------------------------
df = pd.read_csv("data/Depression_Severity_Levels_Dataset.csv")  # contains 'text' and 'label' columns
df.head()

Unnamed: 0,text,label
0,"He said he had not felt that way before, sugge...",mild
1,"Hey there r/assistance, Not sure if this is th...",minimum
2,My mom then hit me with the newspaper and it s...,minimum
3,"until i met my new boyfriend, he is amazing, h...",mild
4,October is Domestic Violence Awareness Month a...,moderate


In [4]:
df = df.dropna(subset=['text'])

In [5]:
df.head()

Unnamed: 0,text,label
0,"He said he had not felt that way before, sugge...",mild
1,"Hey there r/assistance, Not sure if this is th...",minimum
2,My mom then hit me with the newspaper and it s...,minimum
3,"until i met my new boyfriend, he is amazing, h...",mild
4,October is Domestic Violence Awareness Month a...,moderate


In [6]:
# Clean label column
df['label'] = df['label'].str.lower().str.strip().str.replace(")", "", regex=False)

In [7]:
# Convert to binary classification
yes_labels = ['mild', 'moderate', 'severe', 'extreme']
no_labels = ['minimum', 'none']
df['binary_label'] = df['label'].apply(lambda x: 'YES' if x in yes_labels else ('NO' if x in no_labels else 'UNKNOWN'))

# Drop unclassified labels
df = df[df['binary_label'] != 'UNKNOWN']

In [8]:
# Encode binary labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['binary_label'])  # YES=1, NO=0

In [9]:
# ----------------------------
# Tokenization and padding
# ----------------------------
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


In [10]:
# ----------------------------
# Train/test split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label_encoded'], test_size=0.2, random_state=42)
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [11]:
# ----------------------------
# Build LSTM model
# ----------------------------
model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=64, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(2, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
# ----------------------------
# Train LSTM
# ----------------------------
model.fit(X_train, y_train_cat, validation_data=(X_test, y_test_cat), epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x228e058de80>

In [13]:
# ----------------------------
# Evaluate
# ----------------------------
loss, acc = model.evaluate(X_test, y_test_cat, verbose=0)
print(f"\n✅ LSTM Test Accuracy: {acc:.4f}")


✅ LSTM Test Accuracy: 0.8530


In [14]:
new_text = ["I feel hopeless and tired all the time."]

In [15]:
# ----------------------------
# Preprocess using the same tokenizer and padding
# ----------------------------
sequence = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [16]:
# ----------------------------
# Predict
# ----------------------------
pred = model.predict(padded)
predicted_class = tf.argmax(pred, axis=1).numpy()[0]



In [17]:
# Decode the label
label = le.inverse_transform([predicted_class])[0]

In [18]:
print(f"Input: {new_text[0]}")
print(f"Predicted Class: {label} (confidence = {pred[0][predicted_class]:.4f})")

Input: I feel hopeless and tired all the time.
Predicted Class: YES (confidence = 0.9948)
