## Import libraries

In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

## Load the dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv')

## Reconnaissance

In [3]:
df.head()

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


In [4]:
df['intention'].value_counts()

0    5121
1    3998
Name: intention, dtype: int64

## Clean the data

In [5]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-64oi7a8s
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-64oi7a8s
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.0.3-cp36-none-any.whl size=4514 sha256=e1fd41383c90de216997c73452e693c43f8055c8a0dc3a0450b77792adf3f6b0
  Stored in directory: /tmp/pip-ephem-wheel-cache-8kxmdj6r/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.0.3


In [6]:
import re
import preprocess_kgptalkie as ps

In [7]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_rt(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [8]:
df['tweet'] = df['tweet'].apply(lambda x:get_clean(x))

In [9]:
df.head()

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


## Preparing data for model

In [16]:
tfdf = TfidfVectorizer(max_features = 20000, ngram_range= (1, 3), analyzer = 'char')

In [17]:
X = tfdf.fit_transform(df['tweet'])
y = df['intention']

In [18]:
X.shape

(9119, 10440)

In [19]:
y.shape

(9119,)

## Train test split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## MOdel build up

In [21]:
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Prediction and evaluation

In [22]:
y_pred = clf.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1060
           1       0.91      0.91      0.91       764

    accuracy                           0.93      1824
   macro avg       0.92      0.92      0.92      1824
weighted avg       0.93      0.93      0.93      1824



## Testing on real data

In [31]:
x = 'Currently, suicide has become an art. Art of hanging their entity so artistically'
x = get_clean(x)
vec = tfdf.transform([x])
clf.predict(vec)

array([1])