In [113]:
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import class_weight
import os
print(os.path.exists(r'../../data/data.csv'))

True


In [114]:
df = pd.read_csv(r'../../data/data.csv')

df['status'].unique()


array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [115]:
df.info()
df.head()
df.isna().sum()

label_map = {
    "Normal": 0,
    "Depression": 1,
    "Suicidal": 2,
    "Anxiety": 3,
    "Stress": 4,
    "Bipolar": 5,
    "Personality disorder": 6
}

df['status'] = df['status'].map(label_map)
df['status'] = df['status'].astype(int)
#Converts the statement column to a string
df['statement'] = df['statement'].astype(str)
#Makes the text all lowercase 
df['statement'] = df['statement'].astype(str).str.lower()
#Drops the rows where the text is missing
df = df.dropna(subset=['status'])

#Removes punctuation
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['statement'] = df['statement'].apply(clean_text)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [116]:

train_df = df.sample(frac=0.75, random_state=42)
test_df  = df.drop(train_df.index)

train_ds = tf.data.Dataset.from_tensor_slices((
    train_df['statement'].values,
    train_df['status'].values
))

test_ds = tf.data.Dataset.from_tensor_slices((
    test_df['statement'].values,
    test_df['status'].values
))

train_ds = train_ds.shuffle(10000).batch(32)
test_ds = test_ds.batch(32)

In [117]:
# Create encoder (vectorizer)
encoder = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    output_sequence_length=200
)

# Adapt using *unbatched* text data
encoder.adapt(
    train_ds.unbatch().map(lambda text, label: text)
)

# Get vocabulary
vocabulary = np.array(encoder.get_vocabulary())

# Take a single example from dataset
example_text, example_label = next(iter(train_ds.unbatch()))

# Encode
encoded = encoder(example_text)
encoded_numpy = encoded.numpy()

# Decode back to words
decoded = ' '.join(vocabulary[encoded_numpy])

print('Original:', example_text.numpy().decode())
print('Encoded:', encoded_numpy)
print('Decoded:', decoded)


Original: survey on situational stress and music 18 and up i am a research student doing research on situational stress please help me and complete my survey for this project thank you
httpsformsglejdguzqmlxrncufxd7httpsformsglejdguzqmlxrncufxd7
Encoded: [1673   27 4899  287    4  667  657    4   45    2   15    7 1180  916
  157 1180   27 4899  287  265   83   11    4  873    6 1673   19   21
 1682  438   29    1    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0

In [118]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(1,), dtype=tf.string),
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(7)  # 7 classes
])


In [119]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)


In [120]:
labels = train_df['status'].values
#Automatically creates class weights based on how common the class is in the dataset
class_weights = class_weight.compute_class_weight(
    'balanced', classes=np.unique(labels), y=labels
)
class_weights_dict = dict(enumerate(class_weights))

history = model.fit(
    train_ds,
    epochs=10,
    validation_data=test_ds,
    class_weight=class_weights_dict
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [121]:
loss, accuracy = model.evaluate(test_ds)
print(f"Test accuracy: {accuracy:.4f}")


Test accuracy: 0.7555


In [122]:
new_texts = [
    "I feel very stressed and anxious these days.",
    "I am happy and I feel normal"
]

predictions = model.predict(new_texts)

predicted_classes = tf.argmax(predictions, axis=1).numpy()

print(predicted_classes)

inv_label_map = {v: k for k, v in label_map.items()}

for i, text in enumerate(new_texts):
    print(f"Text: {text}")
    print(f"Predicted label: {inv_label_map[predicted_classes[i]]}")


[4 1]
Text: I feel very stressed and anxious these days.
Predicted label: Stress
Text: I am happy and I feel normal
Predicted label: Depression


In [124]:
preds = model.predict(["Everything is on fire around me and I don't know what to do"])
probs = tf.nn.softmax(preds, axis=1).numpy()
for i, prob in enumerate(probs[0]):
    print(f"{inv_label_map[i]}: {prob:.2f}")


Normal: 0.65
Depression: 0.22
Suicidal: 0.12
Anxiety: 0.01
Stress: 0.00
Bipolar: 0.00
Personality disorder: 0.00
