In [13]:
# Import necessary libraries
import pandas as pd
from collections import Counter
#from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np
import nltk
nltk.download('punkt_tab')
# Download the VADER lexicon
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download("averaged_perceptron_tagger")
# Initialize VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [17]:
!pip install gradio
import gradio as gr

Collecting gradio
  Downloading gradio-5.7.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [2]:
emotion_model = None
suicide_model = None

In [3]:
# Train the emotion model and store it in global variables
def initialize_emotion_model(goemotions_df):
    global emotion_model

    # Train the emotion model
    features, trained_emotion_model = train_emotion_model(goemotions_df)

    # Store globally
    emotion_model = trained_emotion_model
    #feature_extractor = trained_feature_extractor

    print("Emotion model and feature extractor initialized.")
    return features


In [38]:


# 1. Extract POS tag features (counts of POS tags)
def extract_pos_tags(texts):
    pos_features = []
    for text in texts:
        pos_tags = pos_tag(word_tokenize(text))
        pos_count = Counter(tag for _, tag in pos_tags)
        pos_features.append(pos_count)
    pos_df = pd.DataFrame(pos_features).fillna(0)
    return pos_df

# 2. Extract pronoun features (First-person singular and plural)
def extract_pronoun_features(texts):
    fps_pronouns = {"i", "me", "my", "mine", "myself"}
    fpp_pronouns = {"we", "us", "our", "ours", "ourselves"}

    fps_count = []
    fpp_count = []

    for text in texts:
        tokens = word_tokenize(text.lower())
        fps_count.append(sum(1 for token in tokens if token in fps_pronouns))
        fpp_count.append(sum(1 for token in tokens if token in fpp_pronouns))

    return fps_count, fpp_count

# 3. Sentiment Features using VADER (SentimentIntensityAnalyzer)
def extract_sentiment_features(texts):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = [sid.polarity_scores(text) for text in texts]
    sentiment_df = pd.DataFrame(sentiment_scores)
    return sentiment_df

# 4. Simple word count feature (total number of words in the text)
'''def extract_word_count(texts):
    word_count = [len(word_tokenize(text)) for text in texts]
    return pd.DataFrame({'word_count': word_count})'''
   # 4. Absolutist Words
def extract_word_count(texts):
    absolutist_words = {"always", "never", "must", "all", "completely", "entirely"}
    absolutist_count = [
      sum(1 for word in word_tokenize(text.lower()) if word in absolutist_words)
      for text in texts]

    # Convert to DataFrame
    features_df = pd.DataFrame({'absolutist_count': absolutist_count})

    return features_df


In [39]:
# Main feature extraction function using simpler features
def extract_features_simpler(texts):
    features = pd.DataFrame()

    # Extract POS tags
    pos_df = extract_pos_tags(texts)
    features = pd.concat([features, pos_df], axis=1)

    # Extract pronouns (First person singular and plural)
    fps_count, fpp_count = extract_pronoun_features(texts)
    features['fps_count'] = fps_count
    features['fpp_count'] = fpp_count

    # Extract sentiment scores
    sentiment_df = extract_sentiment_features(texts)
    features = pd.concat([features, sentiment_df], axis=1)
    word_count_df=extract_word_count(texts)
    features = pd.concat([features, word_count_df], axis=1)

    # Extract word count
    '''word_count_df = extract_word_count(texts)
    features = pd.concat([features, word_count_df], axis=1)'''

    return features

In [6]:
# Train emotion model function using the new feature extraction
def train_emotion_model(goemotions_df):
    # Extract simple features from the GoEmotions dataset
    features = extract_features_simpler(goemotions_df['text'])

    # Emotion columns from GoEmotions dataset (consider all emotions)
    emotion_columns = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire',
                       'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
                       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

    y_emotions = goemotions_df[emotion_columns]

    # Train an emotion prediction model (SVC) using MultiOutputClassifier
    base_model = SVC(kernel='linear')  # Create a base SVC model
    emotion_model = MultiOutputClassifier(base_model)  # Wrap it with MultiOutputClassifier
    emotion_model.fit(features, y_emotions)

    return features,emotion_model

In [8]:
# Updated extract_features function for suicide detection model
def extract_features_suicide(texts, emotion_model):
    # 6. Emotion Features from the previously trained GoEmotions model (for suicide detection)
    features1=extract_features_simpler(texts)
    training_feature_names = emotion_model.estimators_[0].feature_names_in_

    # Reindex features1 to match the training data's feature names
    features1 = features1.reindex(columns=training_feature_names, fill_value=0)

    emotion_predictions = emotion_model.predict(features1)  # Get emotion scores (e.g., sadness, fear, etc.)
    emotion_columns = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride','realization','relief', 'remorse', 'sadness', 'surprise', 'neutral']

    # Combine predicted emotion features into the features DataFrame
    emotion_df = pd.DataFrame(emotion_predictions, columns=emotion_columns)

    return emotion_df

In [9]:
# Train and test the second model for suicide detection
def train_and_test_suicide_detection(goemotions_df, test_dataset_df):
    global suicide_model
    # Load only the first 500 samples from each dataset
    goemotions_df = goemotions_df.head(500)
    test_dataset_df = test_dataset_df.head(500)
    # Train the first emotion prediction model (on GoEmotions dataset)
    features =initialize_emotion_model(goemotions_df)

     # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(test_dataset_df['text'], test_dataset_df['label'], test_size=0.2, random_state=42)

    # Predict emotions on the suicide dataset using the trained emotion model
    suicide_features = extract_features_suicide(X_train,emotion_model)

    # Prepare the labels for suicide detection
    label_encoder = LabelEncoder()
    test_dataset_df['label'] = label_encoder.fit_transform(test_dataset_df['class'])

    # Train the suicide detection model
    suicide_model = SVC(kernel='linear')
    suicide_model.fit(suicide_features, y_train)
    X_test_features=extract_features_suicide(X_test,emotion_model)

    # Test the suicide detection model
    y_pred = suicide_model.predict(X_test_features)
    print(classification_report(y_test, y_pred))

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Mount Google Drive
from tqdm import tqdm  # Import tqdm for progress bar
# Paths to datasets
goemotions_path = "/content/drive/MyDrive/goemotions_1.csv"
test_dataset_path = "/content/drive/MyDrive/Suicide_Detection.csv"

# Load datasets
goemotions_df = pd.read_csv(goemotions_path)
test_dataset_df = pd.read_csv(test_dataset_path)

# Drop missing values
goemotions_df.dropna(inplace=True)
test_dataset_df.dropna(inplace=True)

# Apply LabelEncoder to the test dataset
label_encoder = LabelEncoder()
test_dataset_df['label'] = label_encoder.fit_transform(test_dataset_df['class'])

# Show an interactive progress bar for training
tqdm.pandas(desc="Training the Suicide Detection Model")

# Run the training and testing function
train_and_test_suicide_detection(goemotions_df, test_dataset_df)

Emotion model and feature extractor initialized.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_df['label'] = label_encoder.fit_transform(test_dataset_df['class'])


              precision    recall  f1-score   support

           0       0.78      0.52      0.62        48
           1       0.66      0.87      0.75        52

    accuracy                           0.70       100
   macro avg       0.72      0.69      0.69       100
weighted avg       0.72      0.70      0.69       100



In [42]:
def predict(text):
    # Generate features for the input text
    features = extract_features_suicide([text], emotion_model)

    # Predict emotions

    # Predict suicide detection
    prediction = suicide_model.predict(features)

    # Decode the label back to its original class name
    class_label = label_encoder.inverse_transform(prediction)[0]

    return f"Predicted Class: {class_label}"
text="I found the location. Next step is to get the rope, and hold off till the date comes"
print(predict(text))


Predicted Class: non-suicide


In [22]:
!pip install --upgrade gradio




In [45]:
interface = gr.Interface(
        fn=predict,
        inputs="text",
        outputs="text",
        title="Suicidal/not",
        description="Input a sentence to know whether suicidal text or not."
    )
interface.launch()


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

