In [1]:
#Import libraries
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics #for accuracy calculation
#import os
#import pandas as pd 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit

import medical_text_classifier

from nltk.corpus import stopwords
from string import punctuation

import matplotlib.pyplot as plt
import seaborn as sns

### 1. Load dataset

In [2]:
# Load Data
path = "../data/raw_test/"
df = medical_text_classifier.format_dataset(path)
df.head()

Unnamed: 0,file_name,category,text
0,29.txt,neurologia,Introdução. Após um episódio de Acidente Vascu...
1,15.txt,neurologia,A Doença de Parkinson é considerada uma patolo...
2,14.txt,neurologia,A psiquiatria e a neurologia são especialidad...
3,28.txt,neurologia,Objetivo. Caracterizar o perfil da postura cor...
4,16.txt,neurologia,"Após o acidente vascular cerebral (AVC), a rec..."


### 2. Preprocessing

In [3]:
## TO-DO
## Em text_preprocess criar um parametro que recebe um array settando as tecnicas de pre-processamento
## Transformar categorias de string para numero no dataframe

df = medical_text_classifier.text_preprocess(df[['category', 'text']].values)

In [4]:
print(df.shape)
df.groupby('category').count()

(112, 2)


Unnamed: 0_level_0,text
category,Unnamed: 1_level_1
neurologia,56
pediatria,56


### 3. Split train and test dataset

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df['text'].values, 
                                                    df['category'].values, 
                                                    test_size=0.20,
                                                    random_state=10)

In [8]:
print('Tamanho do dataset de treino: {}'.format(len(y_train)))
print('Tamanho do dataset de teste: {}'.format(len(y_test)))

Tamanho do dataset de treino: 89
Tamanho do dataset de teste: 23


### 4. Text enconding

In [11]:
stopword = medical_text_classifier.get_stopwords()

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=(1,2),
                        stop_words=stopword,
                        max_df=0.8,
                        max_features=100)
                        
features_train = tfidf.fit_transform(x_train)
labels_train = y_train

features_test = tfidf.transform(x_test)
labels_test = y_test

(89, 100)
(23, 100)


In [None]:
print(features_train.shape)
print(features_test.shape)

### 5. Model

In [14]:
C = [.001, .01, .1, 1, 10]
degree = [1, 2, 3, 4, 5]
gamma = [0.001, 0.01, 0.1, 1, 10, 100]
probability = [True, False]
kernels = ['linear', 'poly', 'rbf']

param_grid = {'C': C, 'kernel':kernels, 'probability':probability, 'gamma':gamma, 'degree':degree}

svc = svm.SVC(random_state=8)

grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=10,
                           verbose=1)

grid_search.fit(features_train, labels_train)

Fitting 10 folds for each of 900 candidates, totalling 9000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 9000 out of 9000 | elapsed:   47.8s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'degree': [1, 2, 3, 4, 5],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                         'kernel': ['linear', 'poly', 'rbf'],
                         'probability': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [15]:
print("Melhores hiper parametros:")
print(grid_search.best_params_)
print("Acurácia:")
print(grid_search.best_score_)

Melhores hiper parametros:
{'C': 0.001, 'degree': 2, 'gamma': 100, 'kernel': 'poly', 'probability': True}
Acurácia:
0.8426966292134831


In [16]:
best_model = grid_search.best_estimator_
best_model.fit(features_train, labels_train)

SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma=100, kernel='poly',
    max_iter=-1, probability=True, random_state=8, shrinking=True, tol=0.001,
    verbose=False)

In [18]:
y_predict = best_model.predict(features_test)
print("Acurácia: {}".format(accuracy_score(labels_test, y_predict)))
print(classification_report(labels_test,y_predict))
print(confusion_matrix(labels_test, y_predict, labels=['pediatria', 'neurologia']))

Acurácia: 0.8695652173913043
              precision    recall  f1-score   support

  neurologia       0.93      0.88      0.90        16
   pediatria       0.75      0.86      0.80         7

    accuracy                           0.87        23
   macro avg       0.84      0.87      0.85        23
weighted avg       0.88      0.87      0.87        23

[[ 6  1]
 [ 2 14]]


###  com parâmetros pré-definidos

Para rodar esses resultados, é necessário setar o parametro ind_stemmer em notebooks/medical_text_classifier.py como 1 (com stemmer) ou 0 (sem stemmer) e rodar novamente o código

In [21]:
print('Sem stemmer')
best_model = svm.SVC(kernel='linear', C=.3, probability=True)
best_model.fit(features_train, labels_train)
y_predict = best_model.predict(features_test)

print('C=0.3')
print("Acurácia: {}".format(accuracy_score(labels_test, y_predict)))
print(classification_report(labels_test,y_predict))
print(confusion_matrix(labels_test, y_predict, labels=['pediatria', 'neurologia']))

best_model = svm.SVC(kernel='linear', C=1.0, probability=True)
best_model.fit(features_train, labels_train)
y_predict = best_model.predict(features_test)

print('\nC=1.0')
print("Acurácia: {}".format(accuracy_score(labels_test, y_predict)))
print(classification_report(labels_test,y_predict))
print(confusion_matrix(labels_test, y_predict, labels=['pediatria', 'neurologia']))

Sem stemmer
C=0.3
Acurácia: 0.6086956521739131
              precision    recall  f1-score   support

  neurologia       1.00      0.44      0.61        16
   pediatria       0.44      1.00      0.61         7

    accuracy                           0.61        23
   macro avg       0.72      0.72      0.61        23
weighted avg       0.83      0.61      0.61        23

[[7 0]
 [9 7]]

C=1.0
Acurácia: 0.8260869565217391
              precision    recall  f1-score   support

  neurologia       0.93      0.81      0.87        16
   pediatria       0.67      0.86      0.75         7

    accuracy                           0.83        23
   macro avg       0.80      0.83      0.81        23
weighted avg       0.85      0.83      0.83        23

[[ 6  1]
 [ 3 13]]


In [12]:
print('Com stemmer')
best_model = svm.SVC(kernel='linear', C=.3, probability=True)
best_model.fit(features_train, labels_train)
y_predict = best_model.predict(features_test)

print('C=0.3')
print("Acurácia: {}".format(accuracy_score(labels_test, y_predict)))
print(classification_report(labels_test,y_predict))
print(confusion_matrix(labels_test, y_predict, labels=['pediatria', 'neurologia']))

best_model = svm.SVC(kernel='linear', C=1.0, probability=True)
best_model.fit(features_train, labels_train)
y_predict = best_model.predict(features_test)

print('\nC=1.0')
print("Acurácia: {}".format(accuracy_score(labels_test, y_predict)))
print(classification_report(labels_test,y_predict))
print(confusion_matrix(labels_test, y_predict, labels=['pediatria', 'neurologia']))

Com stemmer
C=0.3
Acurácia: 0.5652173913043478
              precision    recall  f1-score   support

  neurologia       0.88      0.44      0.58        16
   pediatria       0.40      0.86      0.55         7

    accuracy                           0.57        23
   macro avg       0.64      0.65      0.56        23
weighted avg       0.73      0.57      0.57        23

[[6 1]
 [9 7]]

C=1.0
Acurácia: 0.8260869565217391
              precision    recall  f1-score   support

  neurologia       0.93      0.81      0.87        16
   pediatria       0.67      0.86      0.75         7

    accuracy                           0.83        23
   macro avg       0.80      0.83      0.81        23
weighted avg       0.85      0.83      0.83        23

[[ 6  1]
 [ 3 13]]
