# PROJET: DETECTION DE TWEET A CARACTERE SUICIDAIRE

Le but de se projet est de développer un algorithme capable de prédire des tweets suicidaires.

In [109]:
import pandas as pd

import numpy as np

import spacy

In [110]:
nlp = spacy.load('en_core_web_lg')

# 1. Chargement des données

In [111]:
tweet = pd.read_csv('/home/paul/Projets/Datasets/suicide/Suicide_Ideation_Dataset(Twitter-based).csv')

In [112]:
print(tweet.head(10))

                                               Tweet                  Suicide
0                                  making some lunch         Not Suicide post
1                        @Alexia You want his money.         Not Suicide post
2  @dizzyhrvy that crap took me forever to put to...  Potential Suicide post 
3  @jnaylor #kiwitweets Hey Jer! Since when did y...         Not Suicide post
4  Trying out &quot;Delicious Library 2&quot; wit...         Not Suicide post
5  @ValenValdez Oh, that's good to hear. But is i...         Not Suicide post
6  @mcm180 u've got a list for fellow #hotties? Y...         Not Suicide post
7  @jakepaul @jamescharles @LoganPaul Just becaus...  Potential Suicide post 
8          time for some warsaw beer garden chilling         Not Suicide post
9  I hate my life lmao I hope I die soon or sumn ...  Potential Suicide post 


In [113]:
tweet.Suicide.unique()

array(['Not Suicide post', 'Potential Suicide post '], dtype=object)

In [114]:
tweet['Tweet'] = tweet['Tweet'].str.lower()

In [115]:
tweet['label'] = tweet['Suicide'].map({'Not Suicide post':0,"Potential Suicide post ":1})

In [116]:
print(tweet['label'])

0       0
1       0
2       1
3       0
4       0
       ..
1782    0
1783    0
1784    0
1785    0
1786    0
Name: label, Length: 1787, dtype: int64


In [117]:
tweet['Suicide'].value_counts()

Suicide
Not Suicide post           1127
Potential Suicide post      660
Name: count, dtype: int64

In [118]:
tweet.isnull().sum()

Tweet      2
Suicide    0
label      0
dtype: int64

In [119]:
tweet.isnull().sum()

Tweet      2
Suicide    0
label      0
dtype: int64

In [120]:
tweet = tweet.dropna()

In [121]:
tweet.isnull().sum()

Tweet      0
Suicide    0
label      0
dtype: int64

**Suppression des caractères inutiles et de la ponctuation**

In [122]:
import re

***Test***

In [123]:
pattern = r"[^\w\s]"

text = "I am free #."

re.sub(pattern, "", text)

'I am free '

In [124]:
def remove_no_word(text):

    pattern = r"[^\w\s]"

    return re.sub(pattern, "", text)

In [125]:
tweet['Tweet_clean'] = tweet['Tweet'].apply(remove_no_word)

tweet['Tweet_clean'] = tweet['Tweet_clean'].str.lower()

In [126]:
print(tweet['Tweet_clean'])

0                                       making some lunch
1                               alexia you want his money
2       dizzyhrvy that crap took me forever to put tog...
3       jnaylor kiwitweets hey jer since when did you ...
4       trying out quotdelicious library 2quot with mi...
                              ...                        
1782       i have forgotten how much i love my nokia n951
1783    starting my day out with a positive attitude t...
1784    belledame222 hey its 5 amgive a girl some cred...
1785    2 drunken besties stumble into my room and we ...
1786    dancingbonita quoti friggin love youquot ron b...
Name: Tweet_clean, Length: 1785, dtype: object


**Tokenization, suppression des stops words et lemmatization**

In [127]:
def tokanize_stop_word_lemmatize(text):

    doc = nlp(text)

    return [word.lemma_ for word in doc if not word.is_stop]

In [128]:
tweet['Tweet_clean'] = tweet['Tweet_clean'].apply(tokanize_stop_word_lemmatize)

In [129]:
print(tweet['Tweet_clean'][0])

['make', 'lunch']


**Embedding**

In [130]:
def embedding(text):

    doc = nlp(" ".join(text))

    return doc.vector

In [131]:
embedding(tweet['Tweet_clean'][0]).shape

(300,)

In [132]:
tweet['Tweet_clean'] = tweet['Tweet_clean'].apply(embedding)

In [133]:
print(tweet['Tweet_clean'])

0       [-3.364215, 1.01506, -0.6452, -3.24055, 1.2228...
1       [0.051306564, 2.21501, -4.7852464, -0.50859994...
2       [1.0736338, 1.7380373, -3.4406252, -0.31427503...
3       [0.7017867, 0.46629003, -0.220475, 0.38841334,...
4       [1.0777183, 1.4754522, -0.45207277, 0.8051967,...
                              ...                        
1782    [0.69080245, -0.311, -1.6120825, -2.5361698, -...
1783    [0.34875855, 1.8456343, -2.3566685, -0.61416, ...
1784    [1.1211486, -1.4620672, -2.6991572, -0.2523457...
1785    [0.41769776, -1.4369525, -0.38016158, 2.233723...
1786    [-0.030727778, 0.15775912, -1.76967, -1.003467...
Name: Tweet_clean, Length: 1785, dtype: object


# 2. Entrainement 

In [141]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [139]:
X_train, X_test, y_train, y_test = train_test_split(np.stack(tweet.Tweet_clean.values), tweet.label, train_size= 0.8)

# 2.1 Modèle 1: Boosting

In [151]:
model1 = XGBClassifier()

model1.fit(X_train, y_train)

print(f"Accuracy: {model1.score(X_test, y_test)}")

Accuracy: 0.9047619047619048


**Grille de recherche des meilleurs hyperparametres**

In [158]:
from sklearn.model_selection import GridSearchCV

model1 = XGBClassifier()

grille_param = {'n_estimators': [300], 'learning_rate': [0.01,0.1], 'max_depth': [5,10], 'min_child_weight': [1]}

grille = GridSearchCV(model1, param_grid= grille_param, cv = 5)

grille.fit(X_train, y_train)

model1_best = grille.best_estimator_

model1_best.fit(X_train, y_train)

print(f"Meilleur modèle: {grille.best_params_} <-> Score de validation: {grille.best_score_} <-> Score sur le train: {model1_best.score(X_test, y_test)}")

Meilleur modèle: {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 300} <-> Score de validation: 0.8823530855109801 <-> Score sur le train: 0.9187675070028011


**Métriques avancées**

In [165]:
y_pred = model1_best.predict(X_test)

print(classification_report(y_test, y_pred, target_names= ['Not Suicide post',"Potential Suicide post"]))

                        precision    recall  f1-score   support

      Not Suicide post       0.92      0.96      0.94       229
Potential Suicide post       0.92      0.84      0.88       128

              accuracy                           0.92       357
             macro avg       0.92      0.90      0.91       357
          weighted avg       0.92      0.92      0.92       357



**Exemple de prédiction**

In [244]:
def predict_suicide(text):

    doc = remove_no_word(text)

    doc = tokanize_stop_word_lemmatize(text)

    doc = embedding(doc)

    result = (model1_best.predict(doc.reshape(1,-1)))

    if result == 0:

        print("Not suicide post")

    else:

        print("Potential Suicide post")


predict_suicide("I tired of this life")

Potential Suicide post


## 2.2 Modèle 1: Les machines à vecteurs supports

In [None]:
model1 = SVC(kernel= 'rbf', C= 2)

model1.fit(X_train, y_train)

model1.score(X_test, y_test)

0.9159663865546218

In [245]:
def predict_suicide(text):

    doc = remove_no_word(text)

    doc = tokanize_stop_word_lemmatize(text)

    doc = embedding(doc)

    result = (model1.predict(doc.reshape(1,-1)))

    if result == 0:

        print("Not suicide post")

    else:

        print("Suicide post")


predict_suicide("I'm tired of this life")

Suicide post
