In [74]:
#Import libraries
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics #for accuracy calculation
import os
import pandas as pd 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import RSLPStemmer
from unicodedata import normalize

### create dataframe

In [42]:
#### The final dataframes columns is: file_name, category and content
def create_dataframe(path):
    # ignore hidden files
    categories = [f for f in os.listdir(path) if not f.startswith('.')]
    dataset = []
    for category in categories:
        for abstract in os.listdir(path+category):
            if not abstract.startswith("."):
                text = ""
                row = []
                with open("{}/{}".format(path+category, abstract)) as f: 
                        for line in f: 
                            text += line 
                row.append(abstract) #file_name
                row.append(category) #category
                row.append(text)
                dataset.append(row)          
    return dataset

def transform_to_pandas_dataframe(matrix, column_names=["file_name", "category", "content"]):
    return pd.DataFrame(matrix, columns = column_names)

### text data preprocessing

In [82]:
def tokenizer(text):  #input = string
    text = word_tokenize(text)
    return text

def remove_stopwords(text): #input = list
    stopword = set(stopwords.words('portuguese') + list(punctuation))
    text = [word for word in text if word not in stopword]
    return text

def stemmer(text):
    stemmer = RSLPStemmer()
    new_text = []
    for word in text:
        new_text.append(stemmer.stem(word.lower()))
    return new_text

def remove_special_character(text):
    text = normalize('NFKD', text).encode('ASCII','ignore').decode('ASCII')
    return text

- Remover acentuação
- Colocar texto em minusculo
- Transformar em tokens
- Remover stop words
- Lemmatizations

### main

In [84]:
path = "../data/raw/"
df = create_dataframe(path)

In [89]:
output = []
text_position = 2
category_position = 1

ind_lower = True
ind_remove_special_character = True
ind_tokenizer = True
ind_remove_stopwords = True
ind_stemmer = True

for row in df:
    text = row[text_position]
    category = row[category_position]
    
    if ind_lower:
        # input type: string
        text = text.lower()
    
    if ind_remove_special_character:
        # input type: string
        text = remove_special_character(text)
    
    if ind_tokenizer:
        # input type: string
        text = tokenizer(text)
    
    if ind_remove_stopwords:
        # input type: list
        text = remove_stopwords(text)
        
    if ind_stemmer:
        # input type: list
        text = stemmer(text)
        
    output.append([category, text])

In [91]:
df[0]

['15.txt',
 'neurologia',
 'A Doença de Parkinson é considerada uma patologia neurodegenerativa que afeta principalmente idosos, podendo ser manifestada de outras formas em indivíduos mais jovens, sendo caracterizada pela diminuição de produção de dopamina resultando em tremores involuntários, bradicinesia e perda de equilíbrio. O diagnóstico da doença é complexo e é realizado basicamente pelo quadro clínico do paciente. A detecção do Parkinson de forma precoce é um desafio relevante, o que gerou novos estudos e desenvolvimento de novas ferramentas de diagnóstico para prever a doença e impedir o seu avanço. As técnicas de imagem são exames importantes que podem ser aplicados para o estadiamento do indivíduo. Este trabalho consiste em uma revisão bibliográfica narrativa, com o objetivo de apresentar o uso de técnicas de medicina nuclear capazes de identificar a patologia de forma precoc']

In [93]:
print(output[0])

['neurologia', ['doenc', 'parkinson', 'consider', 'patolog', 'neurodegener', 'afet', 'princip', 'idos', 'pod', 'ser', 'manifest', 'outr', 'form', 'individu', 'jov', 'send', 'caracter', 'diminuica', 'produca', 'dopamin', 'result', 'trem', 'involuntari', 'bradicines', 'perd', 'equilibri', 'diagnost', 'doenc', 'complex', 'realiz', 'basic', 'quadr', 'clin', 'paci', 'detecca', 'parkinson', 'form', 'precoc', 'desafi', 'relev', 'ger', 'nov', 'estud', 'desenvolv', 'nov', 'ferrament', 'diagnost', 'prev', 'doenc', 'imped', 'avanc', 'tecn', 'imag', 'sao', 'exam', 'import', 'pod', 'ser', 'aplic', 'estad', 'individu', 'trabalh', 'cons', 'revisa', 'bibliograf', 'narr', 'obje', 'apresent', 'uso', 'tecn', 'medicin', 'nucle', 'capaz', 'identific', 'patolog', 'form', 'precoc']]


In [77]:
# nltk.download('punkt')
# nltk.download('stopwords')
#nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     /Users/danielaalmeida/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


True

### SVM example

In [11]:
train_test_split?

In [None]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test

In [2]:
#Load dataset
cancer = datasets.load_breast_cancer()

In [3]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test

In [4]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [5]:
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.9649122807017544
Precision: 0.9811320754716981
Recall: 0.9629629629629629
