### Importações necessárias

In [149]:
import unidecode
import re
import string
import pandas as pd
import nltk
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from gensim.models import Word2Vec

#### Análise de dataset

In [2]:
df = pd.read_csv("Dataset_final.csv")

df.head()

Unnamed: 0,texts,labels
0,Como cometer suicídio sem dx a mãe triste,0
1,Estou nova demais p cometer esse suicídio,0
2,"""Nunca dirijo meu carro por cima de uma ponte ...",0
3,Nem imagino como seria minha vida sem esse gat...,0
4,A vida tá me testando para ver qual vou comet...,0


In [3]:
df.describe()

Unnamed: 0,labels
count,1063.0
mean,0.503293
std,0.500225
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1063 entries, 0 to 1062
Data columns (total 2 columns):
texts     1063 non-null object
labels    1063 non-null int64
dtypes: int64(1), object(1)
memory usage: 16.7+ KB


#### Limpeza de dados

In [5]:
stopwords = nltk.corpus.stopwords.words('portuguese')
stemmer = nltk.stem.RSLPStemmer()

def clean_text(text):
    
    # Normalize to Lower
    text = text.lower()
    
    # Remove Accent
    text = unidecode.unidecode(text)
    
    # Split ponctuation
    text = ' '.join(re.findall(r"[\w]+|[" + string.punctuation + r"]",text))
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove Stopwords
#     text = ' '.join([word for word in text.split() if word not in stopwords])
    
    return text.strip()

In [6]:
new_df = df

new_df['texts'] = new_df['texts'].apply(clean_text)
new_df['lenght'] = new_df['texts'].str.split().str.len()

new_df = new_df.drop_duplicates()

In [7]:
new_df.describe()

Unnamed: 0,labels,lenght
count,1031.0,1031.0
mean,0.516004,10.636275
std,0.499986,6.765174
min,0.0,2.0
25%,0.0,6.0
50%,1.0,9.0
75%,1.0,13.0
max,1.0,57.0


In [8]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031 entries, 0 to 1062
Data columns (total 3 columns):
texts     1031 non-null object
labels    1031 non-null int64
lenght    1031 non-null int64
dtypes: int64(2), object(1)
memory usage: 32.2+ KB


In [9]:
texts = new_df['texts']
labels = new_df['labels']

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(texts, labels, test_size=0.33, random_state=42)

#### Treinamento

In [129]:
parameters = {'gamma':['scale', 'auto']}
svc = svm.SVC(kernel='rbf', probability=True)

clf = GridSearchCV(svc, parameters, cv=10)

In [130]:
folds = StratifiedKFold(n_splits=10)

print("=== Treino com TF-IDF ===\n")
for i, (train, test) in enumerate(folds.split(X_train, Y_train)):
    
    vectorizer = CountVectorizer()
    train_data = vectorizer.fit_transform(X_train.iloc[train])
    
    clf = clf.fit(train_data, Y_train.iloc[train])
    
    test_data = vectorizer.transform(X_train.iloc[test])

    print(f"Fold {i} | Acurácia: {clf.score(test_data, Y_train.iloc[test]):.2f}%")

=== Treino com TF-IDF ===

Fold 0 | Acurácia: 0.80%
Fold 1 | Acurácia: 0.77%
Fold 2 | Acurácia: 0.78%
Fold 3 | Acurácia: 0.83%
Fold 4 | Acurácia: 0.86%
Fold 5 | Acurácia: 0.80%
Fold 6 | Acurácia: 0.84%
Fold 7 | Acurácia: 0.74%
Fold 8 | Acurácia: 0.72%
Fold 9 | Acurácia: 0.85%


In [143]:
wv_texts = [text.split() for text in new_df['texts']]
vocab = ' '.join(new_df['texts']).split()

embedding_size = 250
model = Word2Vec(size=embedding_size, window=3, min_count=1, workers=4, sg=0)
model.build_vocab(wv_texts, progress_per=10000)


model.train(wv_texts, total_examples=model.corpus_count, epochs=30)

(214969, 328980)

In [144]:
model.wv.most_similar(positive=["morrer"])

[('so', 0.9992678165435791),
 ('ultimamente', 0.9985920190811157),
 ('quero', 0.9985347986221313),
 ('deus', 0.9984744191169739),
 ('posso', 0.9983900189399719),
 ('sera', 0.9983232021331787),
 ('deitar', 0.9983208179473877),
 ('eu', 0.9982663989067078),
 ('chorar', 0.9982332587242126),
 ('sono', 0.9981982707977295)]

In [145]:
model.wv.most_similar(positive=["desisto"])

[('tudo', 0.9971909523010254),
 ('de', 0.9957691431045532),
 ('serio', 0.995643138885498),
 ('juro', 0.9954697489738464),
 ('ai', 0.9950079321861267),
 ('amorosa', 0.9949986338615417),
 ('especie', 0.9948625564575195),
 ('olha', 0.9947174787521362),
 ('literalmente', 0.9947168827056885),
 ('namoral', 0.9945694208145142)]

In [146]:
model.wv.most_similar(positive=["cansei"])

[('solteira', 0.9961459040641785),
 ('de', 0.9960996508598328),
 ('serio', 0.9958281517028809),
 ('estudante', 0.9952219724655151),
 ('sofrida', 0.9951193332672119),
 ('affs', 0.9946573376655579),
 ('ai', 0.9945566654205322),
 ('desanimo', 0.9945402145385742),
 ('conversar', 0.994526207447052),
 ('dormi', 0.9944660067558289)]

In [147]:
index2word = model.wv.index2word

def avg_feat_vector(sentece):
    words = sentece.split()
    feat_vec = np.zeros((embedding_size, ), dtype="float")
    n_words = 0
    
    for word in words:
        if word in index2word:
            n_words += 1
            feat_vec = np.add(feat_vec, model.wv.__getitem__(word))
    if (n_words > 0):
        feat_vec = np.divide(feat_vec, n_words)
    
    return feat_vec

In [148]:
print("=== Treino com Word2Vec ===\n")
for i, (train, test) in enumerate(folds.split(X_train, Y_train)):
    
    train_data = X_train.iloc[train].apply(avg_feat_vector)
    
    clf = clf.fit(list(train_data), Y_train.iloc[train])
    
    test_data = list(X_train.iloc[test].apply(avg_feat_vector))

    print(f"Fold {i} | Acurácia: {clf.score(test_data, Y_train.iloc[test]):.2f}%")

=== Treino com Word2Vec ===

Fold 0 | Acurácia: 0.80%
Fold 1 | Acurácia: 0.66%
Fold 2 | Acurácia: 0.72%
Fold 3 | Acurácia: 0.70%
Fold 4 | Acurácia: 0.65%
Fold 5 | Acurácia: 0.70%
Fold 6 | Acurácia: 0.58%
Fold 7 | Acurácia: 0.54%
Fold 8 | Acurácia: 0.71%
Fold 9 | Acurácia: 0.66%
