In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="archive/twitter-suicidal_data.csv")
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


In [3]:
# Missing values per column -- the recorded output shows zero for both.
data.isna().sum()

tweet        0
intention    0
dtype: int64

In [4]:
# Class sizes as (label 1, label 0). The recorded run shows 3998 vs 5121,
# i.e. roughly 44% / 56% -- close to balanced, but not exactly.
tuple(len(data[data["intention"] == label]) for label in (1, 0))

(3998, 5121)

In [5]:
# Browse the tweets labelled 0.
data.loc[data["intention"].eq(0)]

Unnamed: 0,tweet,intention
3325,i wish i got to watch it with you i miss you ...,0
3326,i want to go to promote gear and groove but u...,0
3327,oh manwas ironing fave top to wear to a meetin...,0
3328,sadly though i ve never gotten to experience t...,0
3329,wonders why someone that u like so much can ma...,0
...,...,...
9103,if you want you can always talk to me,0
9104,people don t die from suicide they die from sa...,0
9106,she finally let go of her fake smile and tears...,0
9108,wil could ever love the girl with scars,0


In [6]:
import string
import nltk
import re
from nltk.corpus import stopwords

In [7]:
# Fetch the NLTK resources used by the cleaning step: the stop-word lists and
# the WordNet data behind WordNetLemmatizer.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance; clean_text() refers to it by the name `wl`.
wl = WordNetLemmatizer()

In [9]:
def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise raw tweets into lower-cased, lemmatized, stop-word-free strings.

    Each document is split on whitespace; every token has its ASCII
    punctuation removed and is lower-cased, stop words and tokens that end up
    empty are dropped, and the survivors are lemmatized and re-joined with
    single spaces. Any remaining character outside ``[a-zA-Z0-9]`` is then
    replaced by a space.

    Args:
        text: A sequence of strings (list, NumPy array, ...). A single ``str``
            is treated as one document rather than iterated character by
            character.
        stop_words: Optional iterable of lower-case words to remove. Defaults
            to ``stopwords.words('english')``.
        lemmatizer: Optional object exposing ``lemmatize(word) -> str``.
            Defaults to the module-level ``wl``.

    Returns:
        A 1-D ``np.ndarray`` of cleaned strings, one per input document.
    """
    # A bare string would otherwise be processed one character at a time.
    if isinstance(text, str):
        text = [text]

    # Resolve the defaults once per call. Evaluating stopwords.words() for
    # every token is wasteful, and membership tests on a list are linear.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)
    if lemmatizer is None:
        lemmatizer = wl
    translator = str.maketrans('', '', string.punctuation)

    cleaned_text = []
    for document in text:
        # Lower-case BEFORE the stop-word test: the stop-word list is lower
        # case, so "The" must be compared as "the" to be filtered out.
        tokens = (w.translate(translator).lower() for w in document.split())
        # `t and ...` drops tokens that were pure punctuation.
        words = [lemmatizer.lemmatize(t) for t in tokens if t and t not in stop_words]
        sentence = re.sub('[^a-zA-Z0-9]', ' ', " ".join(words))
        cleaned_text.append(sentence)
    return np.array(cleaned_text)

In [10]:
# Raw tweets -> cleaned strings, one entry per row of `data`.
x = clean_text(np.asarray(data["tweet"]))

In [12]:
# Labels as a NumPy array, in the same row order as the tweets.
y = data["intention"].to_numpy(copy=True)
# Spot-check one label.
y[1]

1

In [13]:
# Features and labels should have the same number of entries.
x.shape, y.shape

((9119,), (9119,))

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
# Word-level tokenizer with no vocabulary cap; '<unknown>' is registered as the
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=None, oov_token='<unknown>')
# Build the word index from the cleaned tweets.
tokenizer.fit_on_texts(texts=x)

In [16]:
# One slot more than the number of word_index entries: Keras word indices start
# at 1, leaving index 0 free (it is the value the padding step fills with).
vocab_size = 1 + len(tokenizer.word_index)
print(f'the vocab size is {vocab_size}')

the vocab size is 22624


In [17]:
# Replace each cleaned tweet with its list of integer word indices.
# NOTE(review): `x` is overwritten here, so re-running this cell without first
# re-running the cleaning cell feeds integer lists back into the tokenizer.
x = tokenizer.texts_to_sequences(texts=x)
# Inspect one encoded tweet.
x[1]

[9876,
 208,
 14,
 613,
 70,
 5,
 305,
 254,
 2917,
 10,
 50,
 378,
 109,
 131,
 329,
 91,
 47,
 319,
 218,
 409,
 5]

In [18]:
# Find the longest tokenized tweet and its position in `x`.
lengths = [len(sequence) for sequence in x]
max_lenghth = max(lengths, default=0)
# Same outcome as a running maximum with a strict ">" test: the first longest
# entry wins, and no index is reported when every sequence is empty.
index = lengths.index(max_lenghth) if max_lenghth > 0 else None
print(f"the maximum length of a sentence is {max_lenghth} in the index {index}")

the maximum length of a sentence is 2147 in the index 6710


In [19]:
# Look at the longest tweet. Use the position computed in the previous cell
# instead of a number copied by hand from its output, so this stays correct if
# the data or the cleaning step changes.
data.iloc[index]

tweet        my diary log as of so far okay my first time w...
intention                                                    1
Name: 6710, dtype: object

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Cap every sequence at 100 tokens so the model trains faster (the longest
# tweet is far longer than that). Shorter sequences are zero-padded at the end;
# longer ones lose their leading tokens -- truncating='pre' is the Keras
# default, spelled out here to make that visible.
max_lenghth = 100
x = pad_sequences(sequences=x, maxlen=max_lenghth, padding='post', truncating='pre')

In [21]:
# One padded row: the word indices followed by trailing zeros up to the fixed length.
x[1]

array([9876,  208,   14,  613,   70,    5,  305,  254, 2917,   10,   50,
        378,  109,  131,  329,   91,   47,  319,  218,  409,    5,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [22]:
# (number of tweets, max_lenghth) after padding.
x.shape

(9119, 100)

In [23]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [24]:
# Empty sequential container; the layers are added in the next cell.
model = Sequential()

In [25]:
# Embedding -> dropout -> bidirectional LSTM -> dropout -> single sigmoid unit.
# NOTE(review): the padding index 0 is not masked (mask_zero is left at its
# default), so the LSTM also reads the padded positions -- confirm this is intended.
for layer in (
    Embedding(input_dim=vocab_size, output_dim=50, input_length=x.shape[1]),
    Dropout(rate=0.3),
    Bidirectional(LSTM(units=128)),
    Dropout(rate=0.3),
    Dense(units=1, activation='sigmoid'),
):
    model.add(layer)

In [26]:
# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [27]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. A fixed seed makes the split (and so
# the reported metrics) repeatable across kernel restarts; stratifying on the
# label keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

In [46]:
# Train on the training partition.
# NOTE(review): the recorded output of this cell shows "Epoch 1/5 ... 5/5",
# which does not match epochs=20 -- the cell looks edited after its last run.
# Re-running the cell also continues training the already-fitted model.
model.fit(x=X_train, y=y_train, batch_size=64, epochs=20)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x29993a04df0>

In [47]:
# Score the held-out partition; the result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.3269117772579193, 0.8960526585578918]

In [48]:
# Persist the full model and, separately, the weights alone.
model.save("model.h5")
model.save_weights("model.weights.h5")
# The saved model is only usable for inference with the exact word -> index
# mapping it was trained on, so store the fitted tokenizer next to it.
# Reload with tensorflow.keras.preprocessing.text.tokenizer_from_json.
with open("tokenizer.json", "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

In [49]:
def predict(text, threshold=0.5):
    """Classify one or more raw texts and print a verdict for each.

    The texts go through the same pipeline as the training data: clean_text,
    the fitted tokenizer, then post-padding to ``max_lenghth``.

    Args:
        text: A sequence of strings, or a single string (treated as one text).
        threshold: Model outputs strictly above this value are reported as
            suicidal. Defaults to 0.5.

    Prints one line per input text, in input order.
    """
    # Accept a bare string as a one-element batch.
    if isinstance(text, str):
        text = [text]
    cleaned = clean_text(text)
    sequences = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(sequences, padding='post', maxlen=max_lenghth)
    probabilities = model.predict(padded)
    # Test each text separately: `if array > 0.5` raises "truth value of an
    # array is ambiguous" as soon as more than one text is passed in.
    for probability in np.ravel(probabilities):
        if probability > threshold:
            print("this is a suicidal text")
        else:
            print("this is not a suicidal text")
        

In [50]:
# Sanity check: explicit wish to die (recorded output: flagged as suicidal).
test_string = [
    'i feel like no one cares i just want to die i am not happy',
]
predict(test_string)

this is a suicidal text


In [51]:
# Sanity check: sadness about a sports transfer (recorded output: not flagged).
test_string2 = [
    'messi is leaving to paris and this is very sad for me i want him to stay',
]
predict(test_string2)

this is not a suicidal text


In [52]:
# Sanity check: mention of an overdose (recorded output: flagged as suicidal).
test_string3 = [
    'the last month i overdose i was near dead',
]
predict(test_string3)

this is a suicidal text


In [53]:
# Sanity check: an accidental sports injury (recorded output: not flagged).
test_string4 = [
    'the last month i played football and hurt my self for 2 weeks',
]
predict(test_string4)

this is not a suicidal text
