In [3]:
pip install pandas numpy nltk tensorflow joblib imbalanced-learn scikit-learn



In [11]:
import pandas as pd
import re
import numpy as np
import nltk
import tensorflow as tf
import joblib
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, LayerNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [12]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Load dataset
file_path = "/content/Combined Data.csv"
df = pd.read_csv(file_path)

In [9]:
# Remove rows where 'statement' is null or empty
df = df[df['statement'].notna() & df['statement'].str.strip().ne('')]

# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

In [10]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
statement,0
status,0


In [14]:
# Drop unnecessary column and missing statements
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
df.dropna(subset=['statement'], inplace=True)

In [16]:
# Text Preprocessing
def preprocess_text(text):
    text = text.lower().strip()  # Convert to lowercase and remove extra spaces
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Normalize multiple spaces
    words = text.split()
    words = [w for w in words if w not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]  # Lemmatization
    return ' '.join(words)

In [15]:
df.shape

(52681, 2)

In [17]:
# Apply preprocessing
df['cleaned_statement'] = df['statement'].apply(preprocess_text)

In [18]:
# Remove very short sentences
df = df[df['cleaned_statement'].str.split().str.len() > 3]

In [19]:
# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['status'])
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [20]:
# Handle class imbalance using oversampling
X = df['cleaned_statement']
y = df['label']
oversample = RandomOverSampler()
X_resampled, y_resampled = oversample.fit_resample(X.values.reshape(-1,1), y)
X_resampled = X_resampled.flatten()


In [21]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [22]:
# Tokenization settings
MAX_WORDS = 30000
MAX_SEQ_LEN = 200

In [23]:
# Tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [24]:
# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [25]:
# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

In [26]:
# Save tokenizer
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [29]:
embedding_index = {}
with open("/content/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs


In [30]:
# Prepare Embedding Matrix
embedding_matrix = np.zeros((MAX_WORDS, 100))
for word, i in tokenizer.word_index.items():
    if i < MAX_WORDS:
        vector = embedding_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

In [31]:
# Build Optimized LSTM Model
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=MAX_SEQ_LEN, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    LayerNormalization(),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])



In [32]:
# Compile Model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
# Train Model
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=20, batch_size=32)

Epoch 1/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 31ms/step - accuracy: 0.3133 - loss: 1.6387 - val_accuracy: 0.6982 - val_loss: 0.8756
Epoch 2/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 30ms/step - accuracy: 0.7125 - loss: 0.8308 - val_accuracy: 0.7909 - val_loss: 0.6114
Epoch 3/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 30ms/step - accuracy: 0.7908 - loss: 0.6082 - val_accuracy: 0.8264 - val_loss: 0.5034
Epoch 4/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 30ms/step - accuracy: 0.8330 - loss: 0.4728 - val_accuracy: 0.8511 - val_loss: 0.4200
Epoch 5/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 28ms/step - accuracy: 0.8576 - loss: 0.3982 - val_accuracy: 0.8623 - val_loss: 0.3888
Epoch 6/20
[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 28ms/step - accuracy: 0.8750 - loss: 0.3474 - val_accuracy: 0.8798 - val_loss: 0.3418
Epoc

<keras.src.callbacks.history.History at 0x7f747f940f90>

In [34]:
# Evaluate Model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Model Accuracy: {accuracy:.4f}")

[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9099 - loss: 0.2948
Model Accuracy: 0.9098


In [35]:
# Save Model as .pkl
joblib.dump(model, "mental_health_lstm.pkl")

['mental_health_lstm.pkl']

In [36]:
# Function for Making Predictions
def predict(text):
    text = preprocess_text(text)  # Ensure user input follows the same preprocessing
    seq = tokenizer.texts_to_sequences([text])
    pad_seq = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    pred = model.predict(pad_seq)
    predicted_class = np.argmax(pred, axis=1)[0]
    return label_encoder.inverse_transform([predicted_class])[0]

In [42]:
# Example Predictions
example_texts = [
    "  I feel so tired and hopeless all the time.   ",
    "Life has been going well, and I feel mentally strong.!",
    "I feel so anxious whenever I have to meet new people..",
    "  Sometimes I feel okay, but other times I feel lost."
]

for text in example_texts:
    print(f"Statement: {text.strip()}\nPredicted Status: {predict(text)}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Statement: I feel so tired and hopeless all the time.
Predicted Status: Depression

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Statement: Life has been going well, and I feel mentally strong.!
Predicted Status: Normal

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Statement: I feel so anxious whenever I have to meet new people..
Predicted Status: Anxiety

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Statement: Sometimes I feel okay, but other times I feel lost.
Predicted Status: Depression

