# Detecção de textos falando sobre Influenza a partir dos sintomas encontrados

In [1]:
from ontologies.symptom import flu_symptoms, symptoms, all_flu_symptoms
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

## Corpus a ser analisado
- Insira na variável **corpus** o texto que deve ser analisado

In [2]:
corpus = "I have a lot of cough and fever."

In [2]:
corpus = """
Initially, the flu may seem like a common cold with a runny nose, sneezing and sore throat. But colds usually develop slowly, whereas the flu tends to come on suddenly. And although a cold can be a nuisance, you usually feel much worse with the flu.

Common signs and symptoms of the flu include:

Fever over 100.4 F (38 C)
Aching muscles
Chills and sweats
Headache
Dry, persistent cough
Fatigue and weakness
Nasal congestion
Sore throat
"""

In [2]:
corpus = """
Tessa has the swine flu. Luckily not much worse than a cough and on/off fever.  Getting better!  Time feels weird. Been outta work 2 days.
"""

## Etapa de pré-processamento

Nesta etapa será processado o texto de modo a remover stop-words e criar n-grams (2-grams) para ser identificado os sintomas presentes.

In [3]:
def preproc(text):
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    
    text = text.lower()
    tokens = list(filter(lambda w: w not in stop_words, tokenizer.tokenize(text)))
    n_grams_tokens = list(map(lambda s: ' '.join(s), ngrams(tokens, n=2)))
    tokens += n_grams_tokens
    
    return tokens

In [4]:
words = preproc(corpus)
print(words)

['initially', 'flu', 'may', 'seem', 'like', 'common', 'cold', 'runny', 'nose', 'sneezing', 'sore', 'throat', 'colds', 'usually', 'develop', 'slowly', 'whereas', 'flu', 'tends', 'come', 'suddenly', 'although', 'cold', 'nuisance', 'usually', 'feel', 'much', 'worse', 'flu', 'common', 'signs', 'symptoms', 'flu', 'include', 'fever', '100', '4', 'f', '38', 'c', 'aching', 'muscles', 'chills', 'sweats', 'headache', 'dry', 'persistent', 'cough', 'fatigue', 'weakness', 'nasal', 'congestion', 'sore', 'throat', 'initially flu', 'flu may', 'may seem', 'seem like', 'like common', 'common cold', 'cold runny', 'runny nose', 'nose sneezing', 'sneezing sore', 'sore throat', 'throat colds', 'colds usually', 'usually develop', 'develop slowly', 'slowly whereas', 'whereas flu', 'flu tends', 'tends come', 'come suddenly', 'suddenly although', 'although cold', 'cold nuisance', 'nuisance usually', 'usually feel', 'feel much', 'much worse', 'worse flu', 'flu common', 'common signs', 'signs symptoms', 'symptoms

## Encontrando os sintomas com base em ontologias 
- Utilizado as ontologias: SymptomOntology e FluOntology

In [5]:
flu_symp = flu_symptoms(words, n_threads=1)
symp = symptoms(words, n_threads=1)

print(symp)
print(flu_symp)

{'sneezing', 'fatigue', 'fever', 'cough', 'congestion', 'runny nose', 'nasal congestion', 'headache', 'chills', 'weakness'}
{'fatigue', 'fever', 'cough', 'runny nose', 'nasal congestion', 'headache'}


### Aplicação da similaridade de Jaccard entre os sintomas

In [6]:
def jaccard(a, b):
    i = len(a.intersection(b))
    u = len(a.union(b))
    return i / u if u > 0 else 1.

- Similaridade entre todos os sintomas do texto com os sintomas de influenza contidos no texto

In [7]:
jaccard(symp, flu_symp)

0.6

- Similaridade entre os sintomas de influenza contidos no texto e todos presentes na ontologia

In [8]:
jaccard(flu_symp, all_flu_symptoms())

0.75

## Analisando mensagens do twitter coletadas

In [9]:
twitter_texts = []
twitter_fp = '../twitter-data/tweets.csv'
with open(twitter_fp, 'r') as fin:
    lines = map(lambda l: l.split(','), fin.read().split('\n')[1:-1])
    tweets = list(map(lambda l: l[1] if len(l) >= 2 else l[0], lines))

In [10]:
print('number of tweets: {}'.format(len(tweets)))

number of tweets: 4523


- Para cada twitter calcular as duas similaridades apresentadas anteriormente entre os sintomas

In [11]:
tweets_metrics = []
all_flu_symps = all_flu_symptoms()
for i, tweet in enumerate(tweets):
    words = preproc(tweet)
    flu_symp = flu_symptoms(words)
    symp = symptoms(words)
    
    flu_symptoms_sim = jaccard(flu_symp, symp)
    all_flu_symptoms_sim = jaccard(flu_symp, all_flu_symps)
    tweets_metrics.append((tweet, flu_symptoms_sim, all_flu_symptoms_sim))

In [12]:
tweets_metrics.sort(key=lambda t: (t[1], t[2]), reverse=True)

In [13]:
for t, fs, afs in tweets_metrics[:10]:
    print('Tweet: {} Flu Sim: {} All Flu Sim: {}'.format(t, fs, afs))

Tweet: Tessa has the swine flu. Luckily not much worse than a cough and on/off fever.  Getting better!  Time feels weird. Been outta work 2 days. Flu Sim: 1.0 All Flu Sim: 0.25
Tweet: 's whole family is freaking out thinking that I have the swine flu.  since when is a headache and a fever the swine flu? Flu Sim: 1.0 All Flu Sim: 0.25
Tweet: "I hav a headache and a hard cough where u cough (not often) but wen u do Flu Sim: 1.0 All Flu Sim: 0.25
Tweet: "@GraceyJones I've been hugging her most of the day!  Motrin's just reducing fever Flu Sim: 1.0 All Flu Sim: 0.125
Tweet: H1N1 Flu Update: Better Not Cough! Santas Want Swine Flu Shots: Santas across the nation worry .. http://bit.ly/2RS6Xn Flu Sim: 1.0 All Flu Sim: 0.125
Tweet: "headache from hell Flu Sim: 1.0 All Flu Sim: 0.125
Tweet: I have to take a shot of cough syrup. This coughing is getting to be a bit much. This 2day flu is no joke. At least I hope its 2day flu. :-) Flu Sim: 1.0 All Flu Sim: 0.125
Tweet: Hagan official has the flu