In [5]:
#Import libraries
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics #for accuracy calculation
#import os
#import pandas as pd 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit

import medical_text_classifier

### 1. Load dataset

In [6]:
# Load Data
path = "../data/raw_test/"
df = medical_text_classifier.format_dataset(path)
df.head()

Unnamed: 0,file_name,category,text
0,15.txt,neurologia,A Doença de Parkinson é considerada uma patolo...
1,14.txt,neurologia,A psiquiatria e a neurologia são especialidad...
2,16.txt,neurologia,"Após o acidente vascular cerebral (AVC), a rec..."
3,13.txt,neurologia,nfectologia: Desenvolve habilidades para soluç...
4,12.txt,neurologia,A epilepsia é a mais prevalente doença neuroló...


### 2. Preprocessing

In [7]:
## TO-DO
## Em text_preprocess criar um parametro que recebe um array settando as tecnicas de pre-processamento
## Transformar categorias de string para numero no dataframe

df = medical_text_classifier.text_preprocess(df[['category', 'text']].values)
enumerated_categories = medical_text_classifier.enumerate_category(path)

In [9]:
print(enumerated_categories)
df.head()

{'neurologia': 1, 'geriatria': 2, 'pediatria': 3}


Unnamed: 0,category,text
0,neurologia,doenc parkinson consider patolog neurodegener ...
1,neurologia,psiquiatr neurolog especial entrelacadas. avan...
2,neurologia,"apo acid vascul cerebr (avc), recuperaca funca..."
3,neurologia,nfectologia: desenvolv habil soluca adequ situ...
4,neurologia,"epileps preval doenc neurolog cronica, acomet ..."


### 3. Split train and test dataset

In [11]:
x_train, x_test, y_train, y_test = train_test_split(df['text'].values, 
                                                    df['category'].values, 
                                                    test_size=0.10,
                                                    random_state=10)

In [12]:
print('Tamanho do dataset de treino: {}'.format(len(y_train)))
print('Tamanho do dataset de teste: {}'.format(len(y_test)))

Tamanho do dataset de treino: 99
Tamanho do dataset de teste: 12


### text enconding

In [101]:
# Parameter selection
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 200

In [102]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(x_train)
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(x_test)
labels_test = y_test
print(features_test.shape)

(99, 197)
(12, 197)


In [103]:
from sklearn.feature_selection import chi2
import numpy as np

for description, category_id in sorted(label_code.items()):
    features_chi2 = chi2(features_train, labels_train == description)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(description))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'geriatria' category:
  . Most correlated unigrams:
. prevencao
. se
. area
. trabalho
. idosos
  . Most correlated bigrams:
. ic 95
. criancas adolescentes

# 'neurologia' category:
  . Most correlated unigrams:
. presente
. retrospectivo
. sendo
. se
. clinico
  . Most correlated bigrams:
. ic 95
. criancas adolescentes

# 'pediatria' category:
  . Most correlated unigrams:
. adolescentes
. relacionadas
. criancas
. se
. idosos
  . Most correlated bigrams:
. ic 95
. criancas adolescentes



In [104]:
len(unigrams)

190

In [105]:
[]

[]

In [106]:
print(unigrams)

['quais', 'analises', 'media', 'literatura', 'tempo', 'menos', 'embora', 'parte', 'clinicas', 'anos', 'analise', 'achados', 'outras', 'ate', '19', 'individuos', 'forma', 'identificar', 'cada', 'sintomas', 'objetivos', 'qualidade', 'porem', 'acompanhamento', 'geral', '30', 'periodo', 'durante', 'menores', 'metodos', 'sindrome', 'investigar', 'grande', 'resultado', 'prevalencia', 'teste', 'assim', 'mostrou', 'estudo', 'fator', 'realizada', 'pacientes', 'sexo', 'causa', 'atividade', 'correlacao', 'conclusao', 'apenas', 'ambos', 'paciente', 'doenca', 'saude', 'atraves', 'outros', 'medidas', 'devem', 'relacao', 'apos', 'caracteristicas', 'uso', 'base', 'controle', 'tratamento', 'numero', 'doencas', 'disso', 'presenca', 'significativa', 'quanto', 'intervencoes', 'ou', 'questionario', 'inicio', 'eficacia', 'presente', 'casos', 'tipo', 'conhecimento', 'feito', 'acordo', 'ser', 'avaliar', 'total', 'diagnostico', 'sobre', 'conclusoes', 'variaveis', 'frequencia', 'devido', '60', 'intervencao', 'm

### svm model

In [107]:
svc_0 =svm.SVC(random_state=8)

print('Parameters currently in use:\n')
print(svc_0.get_params())

Parameters currently in use:

{'C': 1.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto_deprecated', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 8, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [108]:
# C
C = [.0001, .001, .01]

# gamma
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']

# probability
probability = [True]

# Create the random grid
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree,
              'probability': probability
             }

print(random_grid)

{'C': [0.0001, 0.001, 0.01], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'degree': [1, 2, 3, 4, 5], 'probability': [True]}


In [109]:
# First create the base model to tune
svc = svm.SVC(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    0.9s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=8, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=50, n_jobs=None,
                   param_distributions={'C': [0.0001, 0.001, 0.01],
                                        'degree': [1, 2, 3, 4, 5],
                                        'gamma': [0.0001, 0.001, 0.01, 0.1, 1,
                                                  10, 100],
                                        'kernel': ['linear', 'rbf', 'poly'],
                                        'probability': [True]},
                   pre_dispatch='2*n_jobs', random

In [110]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

The best hyperparameters from Random Search are:
{'probability': True, 'kernel': 'poly', 'gamma': 0.001, 'degree': 4, 'C': 0.01}

The mean accuracy of a model with these hyperparameters is:
0.6868686868686869


In [111]:
# Create the parameter grid based on the results of random search 
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

param_grid = [
  {'C': C, 'kernel':['linear'], 'probability':probability},
  {'C': C, 'kernel':['poly'], 'degree':degree, 'probability':probability},
  {'C': C, 'kernel':['rbf'], 'gamma':gamma, 'probability':probability}
]

# Create a base model
svc = svm.SVC(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    0.7s finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='wa...one,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability': [True]}],
             pre_

In [112]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Grid Search are:
{'C': 0.0001, 'kernel': 'linear', 'probability': True}

The mean accuracy of a model with these hyperparameters is:
0.7070707070707071


In [117]:
best_svc = grid_search.best_estimator_

In [118]:
best_svc.fit(features_train, labels_train)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=8,
    shrinking=True, tol=0.001, verbose=False)

In [119]:
svc_pred = best_svc.predict(features_test)

In [120]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_svc.predict(features_train)))
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, svc_pred))

The training accuracy is: 
0.6868686868686869
The test accuracy is: 
0.9166666666666666


In [125]:
df

[['15.txt',
  'neurologia',
  'A Doença de Parkinson é considerada uma patologia neurodegenerativa que afeta principalmente idosos, podendo ser manifestada de outras formas em indivíduos mais jovens, sendo caracterizada pela diminuição de produção de dopamina resultando em tremores involuntários, bradicinesia e perda de equilíbrio. O diagnóstico da doença é complexo e é realizado basicamente pelo quadro clínico do paciente. A detecção do Parkinson de forma precoce é um desafio relevante, o que gerou novos estudos e desenvolvimento de novas ferramentas de diagnóstico para prever a doença e impedir o seu avanço. As técnicas de imagem são exames importantes que podem ser aplicados para o estadiamento do indivíduo. Este trabalho consiste em uma revisão bibliográfica narrativa, com o objetivo de apresentar o uso de técnicas de medicina nuclear capazes de identificar a patologia de forma precoc'],
 ['14.txt',
  'neurologia',
  ' A psiquiatria e a neurologia são especialidades entrelaçadas. O

In [126]:
df_ = transform_to_pandas_dataframe(df)

In [127]:
df_

Unnamed: 0,file_name,category,content
0,15.txt,neurologia,A Doença de Parkinson é considerada uma patolo...
1,14.txt,neurologia,A psiquiatria e a neurologia são especialidad...
2,16.txt,neurologia,"Após o acidente vascular cerebral (AVC), a rec..."
3,13.txt,neurologia,nfectologia: Desenvolve habilidades para soluç...
4,12.txt,neurologia,A epilepsia é a mais prevalente doença neuroló...
5,Texto08.txt,neurologia,Identificar alterações nos músculos paraverteb...
6,Texto09.txt,neurologia,A crise convulsiva inaugural é uma causa frequ...
7,Texto07.txt,neurologia,"A epilepsia do lobo temporal (ELT), além de se..."
8,Texto06.txt,neurologia,A Reserva Cognitiva (RC) enquanto capacidade d...
9,Texto04.txt,neurologia,A prevalência de esclerose múltipla (EM) no Br...


In [131]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

#aux_df = df_.category.unique().sort_values('Category_Code')
conf_matrix = confusion_matrix(labels_test, svc_pred)
plt.figure(figsize=(12.8,6))
sns.heatmap(conf_matrix, 
            annot=True,
            xticklabels=df_.category.unique(), 
            yticklabels=df_.category.unique(),
            cmap="Blues")
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.title('Confusion matrix')
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'category'

<Figure size 921.6x432 with 0 Axes>

In [115]:
base_model = svm.SVC(random_state = 8)
base_model.fit(features_train, labels_train)
accuracy_score(labels_test, base_model.predict(features_test))



0.9166666666666666

In [116]:
best_svc.fit(features_train, labels_train)
accuracy_score(labels_test, best_svc.predict(features_test))

NameError: name 'best_svc' is not defined

### SVM example

In [2]:
#Load dataset
cancer = datasets.load_breast_cancer()

In [3]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test

In [4]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [5]:
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.9649122807017544
Precision: 0.9811320754716981
Recall: 0.9629629629629629
