In [5]:
!pip3 install xgboost



In [6]:
import numpy as np 
import pandas as pd
import pickle
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, roc_auc_score, classification_report

import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the pre-processed dataset and prepare it for modelling in one idempotent chain
# (no in-place mutation, so re-running the cell always starts from the CSV):
#   1. encode the string labels as integers: "suicide" -> 1, "non-suicide" -> 0
#   2. cast the label column explicitly instead of relying on `replace` inferring an
#      integer dtype from an object column (newer pandas deprecates that silent
#      downcast); this raises if an unexpected label value is left unmapped
#   3. drop the original `text` column and expose `cleaned_text` under the name `text`
suicide_detection_df = (
    pd.read_csv('Data/Suicide_Detection_Final.csv', header=0)
    .replace({"class": {"suicide": 1, "non-suicide": 0}})
    .astype({"class": "int64"})
    .drop(columns=['text'])
    .rename(columns={"cleaned_text": "text"})
)
suicide_detection_df

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not get affected compliment come someone...
2,0,finally almost never hear bad year ever swear ...
3,1,need help help cry hard
4,1,end tonight can not anymore quit
...,...,...
174219,0,something today go sled friend may not seem li...
174220,0,not like rock not go get anything go
174221,0,tell many friend not lonely everything deprive...
174222,0,pee probably taste like salty tea someone drin...


In [8]:
# Features are the cleaned post text; the target is the 0/1 encoded class label.
X, y = suicide_detection_df['text'], suicide_detection_df['class']

# Sanity check: both series must have the same number of rows.
(X.shape, y.shape)

((174224,), (174224,))

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Sizes of (X_train, y_train, X_test, y_test).
tuple(len(part) for part in (X_train, y_train, X_test, y_test))

(139379, 139379, 34845, 34845)

In [10]:
# Display names for the classification reports, listed in label order:
# index 0 -> class 0 ("non-suicide"), index 1 -> class 1 ("suicide").
target_names = [
    'non suicidal',
    'suicidal',
]

### First pipeline

In [129]:
# First candidate: token counts -> TF-IDF weighting -> gradient-boosted trees
# (80 trees, depth 7, learning rate 0.1).
# `use_label_encoder` is deliberately not passed: it is deprecated in recent XGBoost
# releases and only produces an unused-parameter warning there; the labels are
# already encoded as 0/1 integers when the data is loaded.
pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        eval_metric='auc',
    )),
])

In [132]:
# Fit the vectoriser, the TF-IDF transformer and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)

In [74]:
# Predict on both splits so the gap between train and test accuracy (overfitting) is visible.
# `y_pred` is reused by the classification-report cell that follows.
y_pred = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

# Message typo fixed: "accuray" -> "accuracy".
print(
    "pipe1: train accuracy %f, test accuracy %f"
    % (accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred))
)

pipe1: train accuracy 0.899217, test accuray 0.887530


In [13]:
# Per-class precision / recall / F1 on the held-out split. This uses whatever `y_pred`
# was assigned last, so run it right after the matching prediction cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

non suicidal       0.88      0.94      0.91     21449
    suicidal       0.90      0.80      0.84     13396

    accuracy                           0.89     34845
   macro avg       0.89      0.87      0.88     34845
weighted avg       0.89      0.89      0.89     34845



### Second pipeline

In [14]:
# Second candidate: same text features, but a larger model
# (100 trees, depth 10, learning rate 0.1).
# `use_label_encoder` is deliberately not passed: it is deprecated in recent XGBoost
# releases and only produces an unused-parameter warning there; the labels are
# already encoded as 0/1 integers when the data is loaded.
pipe2 = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=10,
        n_estimators=100,
        eval_metric='auc',
    )),
])

In [16]:
# Fit the vectoriser, the TF-IDF transformer and the classifier on the training split only.
pipe2.fit(X=X_train, y=y_train)

In [75]:
# Predict on both splits so the gap between train and test accuracy (overfitting) is visible.
# `y_pred` is reused by the classification-report cell that follows.
y_pred = pipe2.predict(X_test)
y_pred_train = pipe2.predict(X_train)

# Message typo fixed: "accuray" -> "accuracy".
print(
    "pipe2: train accuracy %f, test accuracy %f"
    % (accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred))
)

pipe2: train accuracy 0.921531, test accuray 0.897891


In [19]:
# Per-class precision / recall / F1 on the held-out split. This uses whatever `y_pred`
# was assigned last, so run it right after the matching prediction cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

non suicidal       0.90      0.94      0.92     21449
    suicidal       0.90      0.83      0.86     13396

    accuracy                           0.90     34845
   macro avg       0.90      0.88      0.89     34845
weighted avg       0.90      0.90      0.90     34845



### Third pipeline

In [21]:
# Third candidate: slower learning rate (0.05), shallower trees (depth 6), plus row and
# column subsampling (80% each) per tree.
# `use_label_encoder` is deliberately not passed: it is deprecated in recent XGBoost
# releases and only produces an unused-parameter warning there; the labels are
# already encoded as 0/1 integers when the data is loaded.
pipe3 = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        learning_rate=0.05,
        max_depth=6,
        n_estimators=100,
        eval_metric='auc',
        subsample=0.8,
        colsample_bytree=0.8,
    )),
])

In [22]:
# Fit the vectoriser, the TF-IDF transformer and the classifier on the training split only.
pipe3.fit(X=X_train, y=y_train)

In [76]:
# Predict on both splits so the gap between train and test accuracy (overfitting) is visible.
# `y_pred` is reused by the classification-report cell that follows.
y_pred = pipe3.predict(X_test)
y_pred_train = pipe3.predict(X_train)

# Message typo fixed: "accuray" -> "accuracy".
print(
    "pipe3: train accuracy %f, test accuracy %f"
    % (accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred))
)

pipe3: train accuracy 0.879867, test accuray 0.873698


In [31]:
# Per-class precision / recall / F1 on the held-out split. This uses whatever `y_pred`
# was assigned last, so run it right after the matching prediction cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

non suicidal       0.86      0.94      0.90     21449
    suicidal       0.89      0.76      0.82     13396

    accuracy                           0.87     34845
   macro avg       0.88      0.85      0.86     34845
weighted avg       0.88      0.87      0.87     34845



### Fourth pipeline

In [11]:
# Fourth candidate: many more trees (300) at a slow learning rate (0.05), depth 8,
# row/column subsampling (80% each) and explicit L1 (reg_alpha) / L2 (reg_lambda)
# regularisation.
# `use_label_encoder` is deliberately not passed: it is deprecated in recent XGBoost
# releases and only produces an unused-parameter warning there; the labels are
# already encoded as 0/1 integers when the data is loaded.
pipe4 = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', xgb.XGBClassifier(
        learning_rate=0.05,
        max_depth=8,
        n_estimators=300,
        eval_metric='auc',
        subsample=0.8,
        reg_alpha=0.1,
        reg_lambda=1,
        colsample_bytree=0.8,
    )),
])


In [12]:
# Fit the vectoriser, the TF-IDF transformer and the classifier on the training split only.
pipe4.fit(X=X_train, y=y_train)

In [13]:
# Predict on both splits so the gap between train and test accuracy (overfitting) is visible.
# `y_pred` is reused by the classification-report cell that follows.
y_pred = pipe4.predict(X_test)
y_pred_train = pipe4.predict(X_train)

# Message typo fixed: "accuray" -> "accuracy".
print(
    "pipe4: train accuracy %f, test accuracy %f"
    % (accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred))
)

pipe4: train accuracy 0.918661, test accuray 0.901593


In [14]:
# Per-class precision / recall / F1 on the held-out split. This uses whatever `y_pred`
# was assigned last, so run it right after the matching prediction cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

non suicidal       0.90      0.95      0.92     21449
    suicidal       0.90      0.83      0.87     13396

    accuracy                           0.90     34845
   macro avg       0.90      0.89      0.89     34845
weighted avg       0.90      0.90      0.90     34845



In [15]:
# Save the best model (pipe4 has the highest test accuracy of the four pipelines).
# NOTE: the file is a pickled scikit-learn Pipeline despite the ".h5" extension; the
# path is kept unchanged so anything that already reads it keeps working.
# Only ever unpickle this file from a trusted source.
os.makedirs('Models', exist_ok=True)  # a fresh checkout may not have the output directory yet
# The context manager guarantees the handle is flushed and closed even if pickling fails.
with open('Models/XGBoost_TfIDF.h5', 'wb') as model_file:
    pickle.dump(pipe4, model_file)