In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
import nltk

In [3]:
df_validation = pd.read_csv('./dataset_val_v0.csv')

# Carregando os dados
df = pd.read_csv("dataset_v0.csv", sep=';')

# Removendo as instâncias de validação
df = df[~df['id_sms'].isin(df_validation['id_sms'])].copy()
print("Conjunto de treino e teste: ",df.shape)
print("Conjunto de validação: ",df_validation.shape)

Conjunto de treino e teste:  (14708, 3)
Conjunto de validação:  (3000, 3)


In [4]:
df = df.dropna(subset=['sms'])

In [5]:
nltk.download('stopwords')
nltk.download('rslp')

# Stemmer em português
stemmer = RSLPStemmer()
def stemmize(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

# Obtendo stopwords em português
portuguese_stopwords = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/david/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [6]:
# Removendo stop words e aplicando stemming
df['sms'] = df['sms'].apply(lambda x: ' '.join([stemmize(word) for word in x.split() if word not in (portuguese_stopwords)]))

In [7]:
# Definindo o vetorizador
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['sms'])

# Definindo os rótulos
y = df['response']

In [8]:
# Dividindo o conjunto em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Aplicando o SMOTE no conjunto de treinamento
smote = SMOTE(random_state=123)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [10]:
# Verificando o balanceamento
print(pd.Series(y_train_smote).value_counts())

response
0    10643
1    10643
Name: count, dtype: int64


In [11]:
# Definindo o pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(use_idf=False)),     # Experimentando desligando o uso de IDF
    ('clf', svm.SVC(probability=True)),             # SVM com probabilidade ativada
])

In [12]:
# Treinando o modelo
pipeline.fit(X_train_smote, y_train_smote)

In [13]:
# Avaliando o modelo
y_pred = pipeline.predict(X_test)

In [14]:
# Imprimindo o relatório de classificação
print(classification_report(y_test, y_pred))
# Imprimindo a matriz de confusão
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2654
           1       0.64      0.58      0.61       285

    accuracy                           0.93      2939
   macro avg       0.80      0.77      0.78      2939
weighted avg       0.92      0.93      0.93      2939

[[2560   94]
 [ 120  165]]


In [16]:
# Validação cruzada
scores = cross_val_score(pipeline, X_train_smote, y_train_smote, cv=5)  # 5 é o número de folds na validação cruzada
print("Acurácia de validação cruzada: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Acurácia de validação cruzada: 0.94 (+/- 0.10)
