In [2]:
import pandas as pd
import numpy as np
import nltk
from IPython.core.display import display, HTML

In [5]:
# Load the corpus of case reports into a DataFrame.
# Alternative remote location (GitHub) for what appears to be the same CSV, kept for reference:
#data_url = 'https://github.com/drbonis/icomem-ia-mar-2020/raw/master/data/coronavirus_case_reports.csv'
data_url = './data/coronavirus_case_reports.csv'
df = pd.read_csv(data_url)

In [6]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,src,coronavirus,length
0,BACKGROUND: Dengue is the most important human...,0,1634
1,"A 21 year-old man, HIV infected, and with poor...",0,550
2,BACKGROUND: Keratomycosis is one of the most p...,0,1306
3,Congenital toxoplasmosis continues to be a pub...,0,1047
4,The incidence of prosthetic shoulder replaceme...,0,563


In [8]:
# Build the bag-of-words vectorizer that will turn the texts into count vectors.
from sklearn.feature_extraction.text import CountVectorizer
# Keep at most 100000 terms; per the scikit-learn docs, min_df=5 drops terms found in fewer than
# 5 documents and max_df=0.8 drops terms found in more than 80% of them. No stop-word list is used.
vectorizer = CountVectorizer(max_features=100000, min_df=5, max_df=0.8, stop_words=None)

In [9]:
# Fit the vocabulary on the 'src' column and convert every text into a dense row of token counts.
X = vectorizer.fit_transform(list(df['src'])).toarray()

In [10]:
# Peek at the first ten (token, column index) pairs stored in the fitted vocabulary.
list(vectorizer.vocabulary_.items())[:10]

[('background', 474),
 ('dengue', 1051),
 ('most', 2519),
 ('important', 1948),
 ('human', 1844),
 ('viral', 4180),
 ('disease', 1166),
 ('transmitted', 3981),
 ('by', 598),
 ('it', 2136)]

In [11]:
# Report how many distinct tokens the fitted vocabulary holds.
vocabulary_size = len(vectorizer.vocabulary_)
print(f"Tamaño del diccionario de palabras generado: {vocabulary_size}")


Tamaño del diccionario de palabras generado: 4273


In [12]:
# Render one case report as HTML, with the words 'accompanied', 'abdominal' and 'Abundant'
# wrapped in coloured <span> tags so they stand out in the displayed text.
html_code = """<div style='font-size: 1.2em'>BACKGROUND: Dengue is the most important human viral disease transmitted by mosquitoes. 
It can be asymptomatic or it can present in any of its 3clinical forms: Dengue fever, dengue haemorrhagic 
fever and dengue shock syndrome. However, some atypical manifestations have been reported in surgical emergencies 
caused by acute appendicitis in patients with dengue fever. CLINICAL CASE: We report the case of an 18-year-old 
Mexican male who presented to the emergency department of the General Hospital of Culiacan, Sinaloa, with symptoms 
of dengue fever, <span style='background-color: #b3b3ff'>accompanied</span> by crampy <span style='background-color: #bbff99'>abdominal</span> pain with positive Rovsing and Dunphy signs. Dengue infection was 
confirmed by a positive NS1 antigen test performed by enzyme-linked immunosorbent assay. An <span style='background-color: #bbff99'>abdominal</span> ultrasound 
revealed an appendicular process; as the <span style='background-color: #bbff99'>abdominal</span> pain in the right side kept increasing, an open appendectomy 
was performed. <span style='background-color: #ff9999'>Abundant</span> inflammatory liquid was observed during the surgery, and the pathology laboratory 
reported an oedematous appendix with fibrinopurulent plaques, which agreed with acute ulcerative appendicitis. 
The patient was discharged fully recovered without complications during the follow-up period. CONCLUSIONS: Acute 
<span style='background-color: #bbff99'>abdominal</span> pain can be caused in some cases by dengue infection. This can be confusing, which can lead to unnecessary 
surgical interventions, creating additional morbidities and costs for the patient. This unusual and coincident 
acute appendicitis with dengue highlights the importance of performing careful clinical studies for appropriate 
decision making, especially in dengue endemic regions during an outbreak of this disease.</div>"""
display(HTML(html_code))

In [13]:
# Count vector of the first document in the corpus.
X[0]

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
# Invert the vocabulary so that a column index maps back to its token.
voc_inverse = {column: token for token, column in vectorizer.vocabulary_.items()}

# For columns 104..130 print the column index, its token and the count in the first document.
for i in range(104, 131):
    print(i, voc_inverse[i], X[0, i])

104 abdomen 0
105 abdominal 4
106 ability 0
107 ablation 0
108 able 0
109 abnormal 0
110 abnormalities 0
111 abnormality 0
112 abortion 0
113 about 0
114 above 0
115 abscess 0
116 abscesses 0
117 absence 0
118 absent 0
119 abstinence 0
120 abundant 1
121 accelerated 0
122 acceptable 0
123 accepted 0
124 access 0
125 accessory 0
126 accident 0
127 accidental 0
128 accidentally 0
129 accidents 0
130 accompanied 1


In [20]:
# (number of documents, vocabulary size)
X.shape

(2093, 4273)

In [21]:
# Label vector: the 'coronavirus' column (coronavirus yes/no) as a NumPy array.
y = np.array(df['coronavirus'])
y.shape

(2093,)

In [22]:
# Hold out 20% of the rows for testing; random_state=0 makes the shuffle repeatable.
# NOTE(review): no stratify= argument is passed, so the share of positive labels in each
# split is left to chance — consider stratify=y if the positive class is rare.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
# Size of the training partition.
X_train.shape

(1674, 4273)

In [24]:
# Size of the held-out test partition.
X_test.shape

(419, 4273)

In [25]:
# Train a logistic regression model to classify the coronavirus cases,
# using the 1674 rows of the training set.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Use the model to predict the 419 held-out cases kept for validation.
y_pred = classifier.predict(X_test)

In [27]:
# Predicted labels (first 50).
y_pred[0:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0])

In [28]:
# True labels (first 50), for comparison with the predictions.
y_test[0:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0])

In [29]:
# Compute evaluation metrics for the predictions on the held-out set.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix first, then the per-class report, then the overall accuracy.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

[[410   0]
 [  1   8]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       410
           1       1.00      0.89      0.94         9

    accuracy                           1.00       419
   macro avg       1.00      0.94      0.97       419
weighted avg       1.00      1.00      1.00       419

Accuracy: 0.9976133651551312


In [30]:
# Try some new cases that are not part of the initial corpus.
# The text is model input and is kept exactly as written.
diabetes_example = """Diabetic patient with frequent urination, increased thirst, and increased hunger. 
Admited to the hospital due to a diabetic ketoacidosis episode. Personal historiy of cardiovascular disease, 
stroke, chronic kidney disease,  foot ulcers and diabetic retinopathy."""


In [31]:
# Vectorize with the already-fitted vectorizer (transform only, so the vocabulary is unchanged).
X_diabetes = vectorizer.transform([diabetes_example]).toarray()
# Classify with the trained classifier: row 0 is the single example, column 1 is the
# probability of the second class (presumably label 1 = coronavirus; see classifier.classes_).
classifier.predict_proba(X_diabetes)[0][1]

0.005254680927595615

In [32]:
# Now a patient description in which the word "coronavirus" appears.

coronavirus_example = """76 years old women with fever, cough, flu-like symptons, 
since 3 days admitted to te emergency room.
She has had contact with patients with coronavirus disease 1 week ago. In the
Xray presents evidence of bilateral neumonia. During her stay in the ED she
developed severe respiratory distress and hypoxemia.
A chest TC revealed multifocal nodular 
consolidations with ground-glass opacity halo and mixed consolidation, 
mainly in the peripheral areas.
"""

# Token counts for the example, using the already-fitted vectorizer.
X_coronavirus = vectorizer.transform([coronavirus_example]).toarray()
# Disabled TF-IDF step: `tfidfconverter` is only created in the optional TF-IDF section later on.
#X_coronavirus = tfidfconverter.transform(X_coronavirus).toarray()

# Probability in column 1 (second class) for the single example row.
classifier.predict_proba(X_coronavirus)[0][1]

0.183553450875725

In [33]:
# Now a patient description in which the word "coronavirus" does not appear, but
# which describes a clinical picture compatible with coronavirus.

clinical_example = """76 years old women with fever, cough, flu-like symptons, 
since 3 days admitted to te emergency room.
In the Xray presents evidence of bilateral neumonia. During her stay in the ED she
developed severe respiratory distress and hypoxemia.
A chest TC revealed multifocal nodular 
consolidations with ground-glass opacity halo and mixed consolidation, 
mainly in the peripheral areas.
"""

# Token counts for the example, using the already-fitted vectorizer.
X_clinical = vectorizer.transform([clinical_example]).toarray()
# Disabled TF-IDF step: `tfidfconverter` is only created in the optional TF-IDF section later on.
#X_clinical = tfidfconverter.transform(X_clinical).toarray()

# Probability in column 1 (second class) for the single example row.
classifier.predict_proba(X_clinical)[0][1]

0.065765072563565

### Sección TF-IDF (opcional)

In [0]:
# Re-weight the token counts with TF-IDF and store the dense result in X.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
# Recompute the counts from the text with the fitted vectorizer instead of reading the current X.
# Feeding X back into itself would apply TF-IDF on top of already-weighted values whenever this
# cell is run more than once; this form gives the same result on every run.
X = tfidfconverter.fit_transform(vectorizer.transform(list(df['src']))).toarray()

In [0]:
# Repeat the split, the model fit and the evaluation using the current X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Confusion matrix, per-class report and overall accuracy on the held-out rows.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[[410   0]
 [  2   7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       410
           1       1.00      0.78      0.88         9

    accuracy                           1.00       419
   macro avg       1.00      0.89      0.94       419
weighted avg       1.00      1.00      0.99       419

Accuracy: 0.9952267303102625


In [0]:
# Token counts for the diabetes example, then the fitted TF-IDF weighting applied on top.
diabetes_counts = vectorizer.transform([diabetes_example]).toarray()
X_diabetes = tfidfconverter.transform(diabetes_counts).toarray()

In [0]:
# Probability in column 1 (second class) for the diabetes example.
classifier.predict_proba(X_diabetes)[0][1]

0.02135859364691269

In [0]:
# Token counts for the coronavirus example, then the fitted TF-IDF weighting applied on top.
coronavirus_counts = vectorizer.transform([coronavirus_example]).toarray()
X_coronavirus = tfidfconverter.transform(coronavirus_counts).toarray()

# Probability in column 1 (second class) for the single example row.
classifier.predict_proba(X_coronavirus)[0][1]


0.1126128529052829

In [0]:
# Token counts for the clinical example, then the fitted TF-IDF weighting applied on top.
clinical_counts = vectorizer.transform([clinical_example]).toarray()
X_clinical = tfidfconverter.transform(clinical_counts).toarray()

# Probability in column 1 (second class) for the single example row.
classifier.predict_proba(X_clinical)[0][1]

0.06511478083078727