In [25]:
import importlib
import re
from pprint import pprint
from string import punctuation

import nltk
import numpy as np
import pandas as pd
import shap
from datasets import Dataset, DatasetDict
from neattext.functions import clean_text
from nltk.tokenize import TreebankWordTokenizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline,
)

import utils

try:
    importlib.reload(utils)  # reload module
except NameError:
    pass

model_name = "distilbert-base-uncased"

# Load data

In [26]:
path = "input/Suicide_Ideation_Dataset(Twitter-based).csv"
df = pd.read_csv(path)
df.columns = df.columns.str.lower()
df = df.dropna(how="any")

print(df.shape)
df.head()

(1785, 2)


Unnamed: 0,tweet,suicide
0,making some lunch,Not Suicide post
1,@Alexia You want his money.,Not Suicide post
2,@dizzyhrvy that crap took me forever to put to...,Potential Suicide post
3,@jnaylor #kiwitweets Hey Jer! Since when did y...,Not Suicide post
4,Trying out &quot;Delicious Library 2&quot; wit...,Not Suicide post


In [27]:
df["suicide"].value_counts()

suicide
Not Suicide post           1126
Potential Suicide post      659
Name: count, dtype: int64

In [28]:
id2label = {0: "Not Suicide post", 1: "Potential Suicide post"}
label2id = {v: k for k, v in id2label.items()}
df["suicide"] = df["suicide"].str.strip().map(label2id).astype("int")

Split data into train, val and test.

In [29]:
x_train, x_val, x_test, y_train, y_val, y_test = utils.split_dataset(df["tweet"], df["suicide"])

# Text cleaning

In [30]:
for doc, label in zip(x_train[:10], y_train[:10]):
    print(id2label.get(label), ": ", doc)
    print("".join(["="] * 100))

Potential Suicide post :  RT @TGGuide: TransFact: Sadly, around 78% of transgender people who were bullied at school have thought about or attempted suicide. This neâ¦
Not Suicide post :  gonna go to school. be back around 4
Not Suicide post :  @Sassy1inVegas NP? Lol. hhmm will Danny and Jordan join in? Danny maybe Jordan, me hopes not.
Potential Suicide post :  RT @NICKBOYNTON24: "I donât want to die. But, you know, nothing is for certain. And Iâm tired of keeping quiet." @PlayersTribune @CarBombBoâ¦
Not Suicide post :  @seerysm i'm always cheerful when it's sunny at ATL, means I'll get the heck away on time
Potential Suicide post :  @hellomonette,buk - cdo,i dont want to be here anymore
Potential Suicide post :  I feel so lonely
Not Suicide post :  snuggling in bed with my little baby....ahhhh!
Not Suicide post :  @nellclothing @TdMPhotography Crikey! Check out the Manfrotto worshipers!
Potential Suicide post :  death cant be much worse i am 22 male my girlfriend left i have be

In [31]:
for doc in x_train[:20]:
    clean_doc = clean_text(
        doc,
        puncts=False,
        stopwords=False,
        urls=True,
        emails=True,
        numbers=True,
        emojis=True,
        special_char=False,
        phone_num=True,
        non_ascii=True,
        multiple_whitespaces=True,
        contractions=False,
        currency_symbols=True,
        custom_pattern=None,
    )
    clean_doc = re.sub("/", " ", clean_doc)  # Replace /
    clean_doc = re.sub("#", "", clean_doc)  # Replace #
    translator = str.maketrans("", "", punctuation)  # Replace punctuations
    clean_doc = clean_doc.translate(translator)
    print(clean_doc)
    print("".join(["="] * 100))

rt tgguide transfact sadly around  of transgender people who were bullied at school have thought about or attempted suicide this nea
gonna go to school be back around 
sassyinvegas np lol hhmm will danny and jordan join in danny maybe jordan me hopes not
rt nickboynton i donat want to die but you know nothing is for certain and iam tired of keeping quiet playerstribune carbombboa
seerysm im always cheerful when its sunny at atl means ill get the heck away on time
hellomonettebuk  cdoi dont want to be here anymore
i feel so lonely
snuggling in bed with my little babyahhhh
nellclothing tdmphotography crikey check out the manfrotto worshipers
death cant be much worse i am male my girlfriend left i have been staying late in the office for too long because i cant cope with the workload friends are non existent hate my family and they hate me i just want to die i dont see any other option at this point
aussieali heidiheartshugs toothpaste if you use the whitening stuff the bleach will burn a

In [32]:
def clean_doc(doc):
    clean_doc = clean_text(
        doc,
        puncts=False,
        stopwords=False,
        urls=True,
        emails=True,
        numbers=True,
        emojis=True,
        special_char=False,
        phone_num=True,
        non_ascii=True,
        multiple_whitespaces=True,
        contractions=False,
        currency_symbols=True,
        custom_pattern=None,
    )
    clean_doc = re.sub("/", " ", clean_doc)  # Replace /
    clean_doc = re.sub("#", "", clean_doc)  # Replace #
    translator = str.maketrans("", "", punctuation)  # Replace punctuations
    clean_doc = clean_doc.translate(translator)
    return clean_doc


vectorized_clean_doc = np.vectorize(clean_doc)

x_train_cleaned = vectorized_clean_doc(x_train)
x_val_cleaned = vectorized_clean_doc(x_val)
x_test_cleaned = vectorized_clean_doc(x_test)

# Text preprocessing

Now let us remove text that occuers too few times.

In [33]:
tokenizer = TreebankWordTokenizer()

for doc in x_train_cleaned[:5]:
    print(tokenizer.tokenize(doc))

['rt', 'tgguide', 'transfact', 'sadly', 'around', 'of', 'transgender', 'people', 'who', 'were', 'bullied', 'at', 'school', 'have', 'thought', 'about', 'or', 'attempted', 'suicide', 'this', 'nea']
['gon', 'na', 'go', 'to', 'school', 'be', 'back', 'around']
['sassyinvegas', 'np', 'lol', 'hhmm', 'will', 'danny', 'and', 'jordan', 'join', 'in', 'danny', 'maybe', 'jordan', 'me', 'hopes', 'not']
['rt', 'nickboynton', 'i', 'donat', 'want', 'to', 'die', 'but', 'you', 'know', 'nothing', 'is', 'for', 'certain', 'and', 'iam', 'tired', 'of', 'keeping', 'quiet', 'playerstribune', 'carbombboa']
['seerysm', 'im', 'always', 'cheerful', 'when', 'its', 'sunny', 'at', 'atl', 'means', 'ill', 'get', 'the', 'heck', 'away', 'on', 'time']


In [34]:
def tokenize_doc(doc):
    return tokenizer.tokenize(doc)


x_train_tokenized = [tokenize_doc(doc) for doc in x_train_cleaned]
x_val_tokenized = [tokenize_doc(doc) for doc in x_val_cleaned]
x_test_tokenized = [tokenize_doc(doc) for doc in x_test_cleaned]

for doc in x_train_tokenized[:10]:
    print(doc)

['rt', 'tgguide', 'transfact', 'sadly', 'around', 'of', 'transgender', 'people', 'who', 'were', 'bullied', 'at', 'school', 'have', 'thought', 'about', 'or', 'attempted', 'suicide', 'this', 'nea']
['gon', 'na', 'go', 'to', 'school', 'be', 'back', 'around']
['sassyinvegas', 'np', 'lol', 'hhmm', 'will', 'danny', 'and', 'jordan', 'join', 'in', 'danny', 'maybe', 'jordan', 'me', 'hopes', 'not']
['rt', 'nickboynton', 'i', 'donat', 'want', 'to', 'die', 'but', 'you', 'know', 'nothing', 'is', 'for', 'certain', 'and', 'iam', 'tired', 'of', 'keeping', 'quiet', 'playerstribune', 'carbombboa']
['seerysm', 'im', 'always', 'cheerful', 'when', 'its', 'sunny', 'at', 'atl', 'means', 'ill', 'get', 'the', 'heck', 'away', 'on', 'time']
['hellomonettebuk', 'cdoi', 'dont', 'want', 'to', 'be', 'here', 'anymore']
['i', 'feel', 'so', 'lonely']
['snuggling', 'in', 'bed', 'with', 'my', 'little', 'babyahhhh']
['nellclothing', 'tdmphotography', 'crikey', 'check', 'out', 'the', 'manfrotto', 'worshipers']
['death', 'c

In [35]:
from collections import Counter

all_words = [word for doc in x_train_tokenized for word in doc]
word_counts = Counter(all_words)

min_frequency = 3

# Remove words with a frequency below the threshold
filtered_vocab = {
    word: count for word, count in word_counts.items() if count >= min_frequency
}
filtered_vocab = filtered_vocab.keys()
x_train_tokenized = [
    [word for word in doc if word in filtered_vocab] for doc in x_train_tokenized
]

In [36]:
x_train_tokenized[:3]

[['rt',
  'sadly',
  'around',
  'of',
  'people',
  'who',
  'were',
  'bullied',
  'at',
  'school',
  'have',
  'thought',
  'about',
  'or',
  'attempted',
  'suicide',
  'this'],
 ['gon', 'na', 'go', 'to', 'school', 'be', 'back', 'around'],
 ['lol', 'will', 'and', 'join', 'in', 'maybe', 'me', 'not']]

# Dataset preparation

Prepare the data into format that is accepted by the Transformers pipeline. It will need to look like this:
```
>>> imdb["test"][0]
{
    "label": 0,
    "text": "I love sci-fi and am willing to put up with a lot.
}
```

In [37]:
# Merge datasets into one
x_train_final = np.array([" ".join(doc) for doc in x_train_tokenized])
x_val_final = np.array([" ".join(doc) for doc in x_val_tokenized])
x_test_final = np.array([" ".join(doc) for doc in x_test_tokenized])
dataset = {
    "train": {"label": y_train, "text": x_train_final},
    "val": {"label": y_val, "text": x_val_final},
    "test": {"label": y_test, "text": x_test_final},
}

# Initialize DatasetDict
dataset_dict = DatasetDict(
    {split_name: Dataset.from_dict(data) for split_name, data in dataset.items()}
)

print(type(dataset_dict))
print(dataset_dict["train"][0])

<class 'datasets.dataset_dict.DatasetDict'>
{'label': 1, 'text': 'rt sadly around of people who were bullied at school have thought about or attempted suicide this'}


Then we tokenize dataset using the model's pretrained tokenizer.

In [38]:
# Load pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Apply the preprocessing function over the entire dataset
# Use map to make it faster
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)


tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/1249 [00:00<?, ? examples/s]

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

In [39]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1249
    })
    val: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 179
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 357
    })
})

In [40]:
tokenized_dataset["train"][2]

{'label': 0,
 'text': 'lol will and join in maybe me not',
 'input_ids': [101,
  8840,
  2140,
  2097,
  1998,
  3693,
  1999,
  2672,
  2033,
  2025,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

# Training and validation

In [41]:
# Load pretrained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# Initialize data collator that will create batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Initialize EarlyStoppingCallback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)


# Initialize compute_metrics
def compute_metrics(p):
    # `p` is an instance of `EvalPrediction` containing 'predictions' and 'label_ids'
    # Get the index of the max logit (class prediction)
    preds = p.predictions.argmax(axis=1)
    labels = p.label_ids  # True labels
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": accuracy, "f1": f1}


# Training arguments
training_args = TrainingArguments(
    output_dir="output/model",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save model at the end of each epoch
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir="output/logs",  # Directory to store logs
    logging_steps=10,  # Log every 10 steps
    per_device_train_batch_size=8,  # Reduce this value to lower memory usage
    per_device_eval_batch_size=8,  # Reduce this value for evaluation
)


# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

# Train the model
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1570 [00:00<?, ?it/s]

{'loss': 0.6828, 'grad_norm': 2.7278103828430176, 'learning_rate': 1.9872611464968155e-05, 'epoch': 0.06}
{'loss': 0.6446, 'grad_norm': 2.9503328800201416, 'learning_rate': 1.9745222929936306e-05, 'epoch': 0.13}
{'loss': 0.5779, 'grad_norm': 2.6400654315948486, 'learning_rate': 1.961783439490446e-05, 'epoch': 0.19}
{'loss': 0.4966, 'grad_norm': 2.801243782043457, 'learning_rate': 1.9490445859872614e-05, 'epoch': 0.25}
{'loss': 0.4207, 'grad_norm': 5.3204755783081055, 'learning_rate': 1.9363057324840767e-05, 'epoch': 0.32}
{'loss': 0.2651, 'grad_norm': 7.164059162139893, 'learning_rate': 1.9235668789808918e-05, 'epoch': 0.38}
{'loss': 0.2645, 'grad_norm': 19.734285354614258, 'learning_rate': 1.910828025477707e-05, 'epoch': 0.45}
{'loss': 0.2159, 'grad_norm': 7.855677604675293, 'learning_rate': 1.8980891719745225e-05, 'epoch': 0.51}
{'loss': 0.5373, 'grad_norm': 5.742608547210693, 'learning_rate': 1.8853503184713376e-05, 'epoch': 0.57}
{'loss': 0.3072, 'grad_norm': 17.14903450012207, 'le

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.21032297611236572, 'eval_accuracy': 0.9217877094972067, 'eval_f1': 0.9215412642267836, 'eval_runtime': 13.3765, 'eval_samples_per_second': 13.382, 'eval_steps_per_second': 1.719, 'epoch': 1.0}
{'loss': 0.2041, 'grad_norm': 6.329930782318115, 'learning_rate': 1.796178343949045e-05, 'epoch': 1.02}
{'loss': 0.1671, 'grad_norm': 13.837716102600098, 'learning_rate': 1.78343949044586e-05, 'epoch': 1.08}
{'loss': 0.1912, 'grad_norm': 0.6914505958557129, 'learning_rate': 1.7707006369426754e-05, 'epoch': 1.15}
{'loss': 0.1446, 'grad_norm': 4.3280181884765625, 'learning_rate': 1.7579617834394907e-05, 'epoch': 1.21}
{'loss': 0.0225, 'grad_norm': 0.1481524407863617, 'learning_rate': 1.7452229299363058e-05, 'epoch': 1.27}
{'loss': 0.3547, 'grad_norm': 0.3064810037612915, 'learning_rate': 1.732484076433121e-05, 'epoch': 1.34}
{'loss': 0.039, 'grad_norm': 0.21108219027519226, 'learning_rate': 1.7197452229299365e-05, 'epoch': 1.4}
{'loss': 0.114, 'grad_norm': 34.87861251831055, 'learni

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.23871779441833496, 'eval_accuracy': 0.9217877094972067, 'eval_f1': 0.9215412642267836, 'eval_runtime': 13.3698, 'eval_samples_per_second': 13.388, 'eval_steps_per_second': 1.72, 'epoch': 2.0}
{'loss': 0.1208, 'grad_norm': 27.841747283935547, 'learning_rate': 1.5923566878980894e-05, 'epoch': 2.04}
{'loss': 0.1487, 'grad_norm': 0.8099344372749329, 'learning_rate': 1.5796178343949047e-05, 'epoch': 2.1}
{'loss': 0.1885, 'grad_norm': 2.997605562210083, 'learning_rate': 1.56687898089172e-05, 'epoch': 2.17}
{'loss': 0.0566, 'grad_norm': 0.30567216873168945, 'learning_rate': 1.5541401273885352e-05, 'epoch': 2.23}
{'loss': 0.0971, 'grad_norm': 0.10687977075576782, 'learning_rate': 1.5414012738853506e-05, 'epoch': 2.29}
{'loss': 0.1265, 'grad_norm': 2.8333566188812256, 'learning_rate': 1.528662420382166e-05, 'epoch': 2.36}
{'loss': 0.0943, 'grad_norm': 0.07663331925868988, 'learning_rate': 1.5159235668789811e-05, 'epoch': 2.42}
{'loss': 0.1116, 'grad_norm': 0.7553837299346924, 'l

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.255962997674942, 'eval_accuracy': 0.9273743016759777, 'eval_f1': 0.9272626631756299, 'eval_runtime': 13.3964, 'eval_samples_per_second': 13.362, 'eval_steps_per_second': 1.717, 'epoch': 3.0}
{'loss': 0.0021, 'grad_norm': 0.14132265746593475, 'learning_rate': 1.3885350318471338e-05, 'epoch': 3.06}
{'loss': 0.0782, 'grad_norm': 0.41849642992019653, 'learning_rate': 1.375796178343949e-05, 'epoch': 3.12}
{'loss': 0.0756, 'grad_norm': 0.030596381053328514, 'learning_rate': 1.3630573248407644e-05, 'epoch': 3.18}
{'loss': 0.1024, 'grad_norm': 0.2364545613527298, 'learning_rate': 1.3503184713375796e-05, 'epoch': 3.25}
{'loss': 0.0686, 'grad_norm': 10.24595832824707, 'learning_rate': 1.337579617834395e-05, 'epoch': 3.31}
{'loss': 0.0034, 'grad_norm': 0.09166549146175385, 'learning_rate': 1.3248407643312102e-05, 'epoch': 3.38}
{'loss': 0.0143, 'grad_norm': 0.05140702798962593, 'learning_rate': 1.3121019108280256e-05, 'epoch': 3.44}
{'loss': 0.0618, 'grad_norm': 0.0338482148945331

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.2982326149940491, 'eval_accuracy': 0.9385474860335196, 'eval_f1': 0.9386373020313445, 'eval_runtime': 12.6975, 'eval_samples_per_second': 14.097, 'eval_steps_per_second': 1.811, 'epoch': 4.0}
{'train_runtime': 1654.825, 'train_samples_per_second': 7.548, 'train_steps_per_second': 0.949, 'train_loss': 0.17309945671089516, 'epoch': 4.0}


TrainOutput(global_step=628, training_loss=0.17309945671089516, metrics={'train_runtime': 1654.825, 'train_samples_per_second': 7.548, 'train_steps_per_second': 0.949, 'total_flos': 279199880304768.0, 'train_loss': 0.17309945671089516, 'epoch': 4.0})

# Testing

In [46]:
# Run inference on the test dataset
predictions = trainer.predict(tokenized_dataset["test"])
predicted_labels = predictions.predictions.argmax(axis=1)

pprint(predictions.metrics)

  0%|          | 0/45 [00:00<?, ?it/s]

{'test_accuracy': 0.9187675070028011,
 'test_f1': 0.9188484179811213,
 'test_loss': 0.23127254843711853,
 'test_runtime': 25.3598,
 'test_samples_per_second': 14.077,
 'test_steps_per_second': 1.774}


In [43]:
for text, true, predicted in zip(x_test_cleaned[:50], y_test, predicted_labels):
    print(f"True: {true}, predicted: {predicted}, text: {text}")

True: 0, predicted: 0, text: she is not tired
True: 1, predicted: 0, text: omg the cringe i cant
True: 0, predicted: 0, text: benjamenus oh blimey its not you is it dont worry i wouldnt say anything here that i wouldnt say to them personally
True: 1, predicted: 1, text: tsos tails or not if he wanted to kill him head be dead itachi couldave killed him but he didnat want toida 
True: 1, predicted: 1, text: its fucking cold i hate working at am tf did i sign up for why did i do this to myself damn it
True: 0, predicted: 0, text: priyankacharya jpnadda world has got the right path celebrating d matri pitri pujan diwas started by bapuji
True: 0, predicted: 0, text: i want to be a businessman in futer
True: 1, predicted: 1, text: so like if the man i end up spending the rest of my days with aint him i feel like im going to hate my life
True: 0, predicted: 0, text: shruticute hmmm my mom well thanni thelichu vettutaanga
True: 1, predicted: 1, text: aand iam so sick of love songs so tired of 

# SHAP

In [74]:
# Load model
classifier_pipeline = pipeline("sentiment-analysis", model=trainer.model, tokenizer=tokenizer)

# Define SHAP
explainer = shap.Explainer(classifier_pipeline)
samples = [
    "i just cannot see a way out of this dark place i want to end myself",
    "last few hours before monday ugh i hate working",
    "im tired of everything",
]
# Do inference
classifier_pipeline(samples)

Device set to use cpu


[{'label': 'Potential Suicide post', 'score': 0.9840720295906067},
 {'label': 'Not Suicide post', 'score': 0.5917704701423645},
 {'label': 'Potential Suicide post', 'score': 0.7530813813209534}]

In [89]:
 # Explain prediction of samples
shap_values = explainer(samples)

  0%|          | 0/306 [00:00<?, ?it/s]

PartitionExplainer explainer: 4it [00:12,  6.29s/it]               


In [90]:
shap.plots.text(shap_values[:, :, "Potential Suicide post"])