In [16]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [17]:
# Read the two raw CSV corpora (paths are relative to the working directory).
sd_csv_path = 'Suicide_Detection.csv'
ideation_csv_path = 'Suicide_Ideation_Dataset(Twitter-based).csv'

suicide_detection_df = pd.read_csv(sd_csv_path)
ideation_df = pd.read_csv(ideation_csv_path)

In [18]:
# Discard Ideation rows whose 'Text' value is missing.
# Only the 'Text' column is checked; missing values elsewhere are kept.
ideation_df = ideation_df.dropna(
    axis=0,
    subset=['Text'],
)

In [19]:
# Replace each dataset's categorical label column with integer codes.
# Each dataset gets its own encoder, fitted only on that dataset's labels.
# NOTE(review): the encoders are independent, so a given integer only means
# the same thing in both datasets if their sorted class names line up —
# verify with label_encoder_sd.classes_ and label_encoder_ideation.classes_.
label_encoder_sd = LabelEncoder()
label_encoder_ideation = LabelEncoder()

suicide_detection_df['class'] = label_encoder_sd.fit_transform(
    suicide_detection_df['class']
)
ideation_df['Semantic'] = label_encoder_ideation.fit_transform(
    ideation_df['Semantic']
)

In [20]:
# Stack the Suicide_Detection rows first, then the Ideation rows, so texts and
# labels stay aligned position-by-position. Original row indexes are retained.
text_columns = [suicide_detection_df['text'], ideation_df['Text']]
label_columns = [suicide_detection_df['class'], ideation_df['Semantic']]

combined_texts = pd.concat(text_columns, axis=0)
combined_labels = pd.concat(label_columns, axis=0)

In [21]:
# Build a word index over every text, then turn each text into a fixed-length
# sequence of integer ids.
# NOTE(review): the tokenizer is fitted on all texts before the train/test
# split, so test-set vocabulary influences the word index.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token='<OOV>',
)
tokenizer.fit_on_texts(combined_texts)

sequences = tokenizer.texts_to_sequences(combined_texts)
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,         # every row ends up exactly 100 ids long
    padding='post',     # shorter texts are padded at the end
    truncating='post',  # longer texts are cut at the end
)

In [22]:
# Recover each dataset's rows from the padded matrix. The Suicide_Detection
# texts were concatenated first, so they occupy the leading rows.
n_sd_rows = len(suicide_detection_df)

suicide_detection_sequences = padded_sequences[:n_sd_rows]
ideation_sequences = padded_sequences[n_sd_rows:]

In [23]:
# Pull out the label arrays with the same leading/trailing cut used for the
# sequences: Suicide_Detection labels come first, Ideation labels follow.
sd_part = slice(None, len(suicide_detection_df))
ideation_part = slice(len(suicide_detection_df), None)

suicide_detection_labels = combined_labels[sd_part].values
ideation_labels = combined_labels[ideation_part].values

In [24]:
# Re-join the per-dataset pieces (Suicide_Detection first) into single arrays.
# Optional step: it puts back together what the previous cells split apart.
all_sequences = np.concatenate(
    (suicide_detection_sequences, ideation_sequences),
    axis=0,
)
all_labels = np.concatenate(
    (suicide_detection_labels, ideation_labels),
    axis=0,
)

In [25]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_sequences,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Build the model: embedding -> two stacked LSTMs with dropout -> dense head.
# The final single sigmoid unit emits one probability per input sequence.
model = Sequential([
    # input_dim matches the tokenizer's num_words cap (10000).
    # `input_length` is deliberately not passed: Keras 3 deprecates it on
    # Embedding (it is ignored with a warning), and the sequence length is
    # taken from the data on the first call.
    Embedding(input_dim=10000, output_dim=128),
    # return_sequences=True passes every timestep on to the second LSTM.
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])



In [42]:
# Configure training: binary cross-entropy to go with the single sigmoid
# output, Adam with a 0.001 learning rate, and accuracy reported per epoch.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [43]:
# Train for 5 epochs in mini-batches of 64; the per-epoch metrics are kept in
# `history`.
# NOTE(review): the test split doubles as validation data here, so the accuracy
# reported afterwards is measured on data that was monitored during training.
# NOTE(review): the recorded log below already shows ~97% training accuracy in
# epoch 1 and the execution counter jumps from 26 to 42, which suggests the
# model had been fitted before this run — rebuild the model before re-running
# to get a clean 5-epoch result.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m2924/2924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 15ms/step - accuracy: 0.9722 - loss: 0.0763 - val_accuracy: 0.9379 - val_loss: 0.1890
Epoch 2/5
[1m2924/2924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 15ms/step - accuracy: 0.9784 - loss: 0.0611 - val_accuracy: 0.9377 - val_loss: 0.1977
Epoch 3/5
[1m2924/2924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 15ms/step - accuracy: 0.9833 - loss: 0.0482 - val_accuracy: 0.9379 - val_loss: 0.2313
Epoch 4/5
[1m2924/2924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 15ms/step - accuracy: 0.9870 - loss: 0.0376 - val_accuracy: 0.9363 - val_loss: 0.2696
Epoch 5/5
[1m2924/2924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 15ms/step - accuracy: 0.9900 - loss: 0.0293 - val_accuracy: 0.9343 - val_loss: 0.2448


In [44]:
# Score the held-out rows: predict() yields the sigmoid outputs, which are
# thresholded at 0.5 to obtain hard 0/1 labels.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype(int)

[1m1462/1462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step


In [45]:
# Fraction of test rows whose thresholded prediction equals the true label,
# printed as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 93.43%


In [46]:
import joblib

# Persist the fitted tokenizer next to the model: new text has to be converted
# with the exact same word index before the model can score it, so the model
# file alone is not enough for inference outside this notebook.
joblib.dump(tokenizer, 'suicide_detection_tokenizer.joblib')

# Save the model (kept last so the cell still displays the model file path).
# NOTE(review): joblib pickles the model, so the file should only be loaded
# from trusted sources; Keras' native model.save(...) is the documented
# format — consider switching if downstream loaders allow it.
joblib.dump(model, 'suicide_detection_model.joblib')

['suicide_detection_model.joblib']

In [48]:
# Run one hand-written example through the same preprocessing as the training
# data and report the model's verdict.
sample_text = ["I feel like it's over"]

# Same tokenizer and padding settings as used for the training sequences.
sample_sequence = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(
    sample_sequence,
    maxlen=100,
    padding='post',
    truncating='post',
)

# One sigmoid output per input row, thresholded at 0.5 for the hard label.
prediction = model.predict(sample_padded)
predicted_class = (prediction > 0.5).astype(int)
print(prediction[0][0]*100)  # raw model output scaled to a percentage

# Report the class in words.
# NOTE(review): this assumes encoded class 0 is the non-suicide label —
# confirm against label_encoder_sd.classes_.
verdict = (
    "The model predicts: Non-Suicide"
    if predicted_class[0] == 0
    else "The model predicts: Suicide"
)
print(verdict)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
88.784658908844
The model predicts: Suicide
