# Detecção de textos falando sobre Influenza a partir dos sintomas encontrados

In [13]:
from ontologies.symptom import flu_symptoms, symptoms, all_flu_symptoms
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

## Corpus a ser analisado
- Insira na variável **corpus** o texto que deve ser analisado

In [2]:
# One-sentence sample input. The following cell assigns `corpus` again, so on a
# top-to-bottom run this value is overwritten before it is used.
corpus = "I have a lot of cough and fever."

In [2]:
# Longer sample text describing common flu signs and symptoms; this assignment
# replaces any value previously stored in `corpus`.
corpus = """
Initially, the flu may seem like a common cold with a runny nose, sneezing and sore throat. But colds usually develop slowly, whereas the flu tends to come on suddenly. And although a cold can be a nuisance, you usually feel much worse with the flu.

Common signs and symptoms of the flu include:

Fever over 100.4 F (38 C)
Aching muscles
Chills and sweats
Headache
Dry, persistent cough
Fatigue and weakness
Nasal congestion
Sore throat
"""

## Etapa de pré-processamento

Nesta etapa o texto é processado de modo a remover stop-words e criar n-grams (2-grams), para que os sintomas presentes possam ser identificados.

In [3]:
def preproc(text):
    """Turn raw text into a list of candidate terms for symptom lookup.

    The text is lowercased and split into ``\\w+`` tokens, English stop words
    are dropped, and every pair of adjacent remaining tokens is joined with a
    single space to form a bigram.

    Returns a list containing the remaining single tokens followed by the
    bigrams.
    """
    english_stop_words = set(stopwords.words('english'))
    word_tokenizer = RegexpTokenizer(r'\w+')

    unigrams = [
        word
        for word in word_tokenizer.tokenize(text.lower())
        if word not in english_stop_words
    ]
    # Bigrams are built from the already-filtered tokens, so words that were
    # separated only by stop words or punctuation end up adjacent.
    bigrams = [' '.join(pair) for pair in ngrams(unigrams, n=2)]

    return unigrams + bigrams

In [4]:
# Stop-word-free single tokens followed by their bigrams (see `preproc`).
words = preproc(corpus)
# Print the full token list for inspection.
print(words)

['initially', 'flu', 'may', 'seem', 'like', 'common', 'cold', 'runny', 'nose', 'sneezing', 'sore', 'throat', 'colds', 'usually', 'develop', 'slowly', 'whereas', 'flu', 'tends', 'come', 'suddenly', 'although', 'cold', 'nuisance', 'usually', 'feel', 'much', 'worse', 'flu', 'common', 'signs', 'symptoms', 'flu', 'include', 'fever', '100', '4', 'f', '38', 'c', 'aching', 'muscles', 'chills', 'sweats', 'headache', 'dry', 'persistent', 'cough', 'fatigue', 'weakness', 'nasal', 'congestion', 'sore', 'throat', 'initially flu', 'flu may', 'may seem', 'seem like', 'like common', 'common cold', 'cold runny', 'runny nose', 'nose sneezing', 'sneezing sore', 'sore throat', 'throat colds', 'colds usually', 'usually develop', 'develop slowly', 'slowly whereas', 'whereas flu', 'flu tends', 'tends come', 'come suddenly', 'suddenly although', 'although cold', 'cold nuisance', 'nuisance usually', 'usually feel', 'feel much', 'much worse', 'worse flu', 'flu common', 'common signs', 'signs symptoms', 'symptoms

## Encontrando os sintomas com base em ontologias 
- Ontologias utilizadas: SymptomOntology e FluOntology

In [10]:
# Pass the preprocessed tokens to the two lookup helpers imported from
# ontologies.symptom; judging by the printed results, each returns a set of
# matched symptom names.
# NOTE(review): the two calls use different n_threads values (8 vs. 1); the
# reason is not visible here — confirm it is intentional.
symp = symptoms(words, n_threads=8)
flu_symp = flu_symptoms(words, n_threads=1)

print(symp)
print(flu_symp)

{'runny nose', 'sneezing', 'fever', 'headache', 'congestion', 'nasal congestion', 'fatigue', 'weakness', 'cough', 'chills'}
{'runny nose', 'headache', 'fever', 'nasal congestion', 'fatigue', 'cough'}


### Aplicação da similaridade de Jaccard entre os sintomas

In [15]:
def jaccard(a, b):
    """Return the Jaccard similarity of two collections.

    The similarity is the size of the intersection divided by the size of the
    union of the distinct items in ``a`` and ``b``.

    Parameters
    ----------
    a, b : iterable of hashable items
        Sets, frozensets, lists, tuples, ... Duplicates are ignored because
        both inputs are converted to sets first.

    Returns
    -------
    float
        A value between 0.0 and 1.0. Two empty collections are treated as
        identical and yield 1.0.
    """
    # Coerce to sets so that callers may pass lists or other iterables,
    # not only objects exposing .intersection()/.union().
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 1.
    return len(set_a & set_b) / union_size

- Similaridade entre todos os sintomas encontrados no texto e os sintomas de influenza contidos no texto

In [16]:
# Jaccard similarity between the `symp` and `flu_symp` results computed above.
jaccard(symp, flu_symp)

0.6

- Similaridade entre os sintomas de influenza contidos no texto e todos presentes na ontologia

In [None]:
# Compare the flu symptoms found in the text against every flu symptom known
# to the ontology.
# NOTE(review): `all_flu_symptoms` is imported from ontologies.symptom; whether
# it is a collection or a zero-argument function is not visible in this
# notebook, so both cases are handled here — confirm and simplify.
flu_ontology_symptoms = all_flu_symptoms() if callable(all_flu_symptoms) else all_flu_symptoms
jaccard(flu_symp, set(flu_ontology_symptoms))