In [154]:
# load the training/validation resources and ontology data from AWS
from boto.s3.connection import S3Connection, Location
import datetime
import os
import pickle
import diagnosis
from diagnosis.KeywordExtractor import *
import numpy as np
import re
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from diagnosis.utils import group_by, flatten
import warnings
import pymongo
from DataSet import fetch_datasets

In [2]:
with open('ontologies.p') as f:
    keywords = pickle.load(f)

In [3]:
categories = set([
    'hm/disease',
    'biocaster/pathogens',
    'biocaster/diseases',
    'biocaster/symptoms',
    'symp/symptoms',
    'eha/symptom',
    'eha/mode of transmission',
    'eha/environmental factors',
    'eha/vector',
    'eha/occupation',
    'eha/control measures',
    'eha/description of infected',
    'eha/disease category',
    'eha/host',
    'eha/host use',
    'eha/symptom',
    'eha/disease',
    'eha/location', 
    'eha/transmission',
    'eha/zoonotic type',
    'eha/risk',
    'wordnet/season',
    'wordnet/climate',
    'wordnet/pathogens',
    'wordnet/hosts',
    'wordnet/mod/severe',
    'wordnet/mod/painful',
    'wordnet/mod/large',
    'wordnet/mod/rare',
    'doid/has_symptom',
    'doid/symptoms',
    'doid/transmitted_by',
    'doid/located_in',
    'doid/diseases',
    'doid/results_in',
    'doid/has_material_basis_in',
    'usgs/terrain'
])

In [4]:
keyword_array = [
    keyword_obj for keyword_obj in keywords
    if keyword_obj['category'] in categories
]

In [5]:
feature_extractor = Pipeline([
    ('kwext', KeywordExtractor(keyword_array)),
    ('link', LinkedKeywordAdder(keyword_array)),
    ('limit', LimitCounts(1)),
])

Refresh dataset pickles:

In [None]:
# time_offset_test_set, mixed_test_set, training_set = fetch_datasets()
# pickle_dir = 'pickles'
# with open(os.path.join(pickle_dir, 'time_offset_test_set.p'), 'wb') as f:
#     pickle.dump(time_offset_test_set, f)
# with open(os.path.join(pickle_dir, 'mixed_test_set.p'), 'wb') as f:
#     pickle.dump(mixed_test_set, f)
# with open(os.path.join(pickle_dir, 'training_set.p'), 'wb') as f:
#     pickle.dump(training_set, f)

Load dataset pickles:

In [6]:
pickle_dir = 'pickles'
with open(os.path.join(pickle_dir, 'time_offset_test_set.p')) as f:
    time_offset_test_set = pickle.load(f)
with open(os.path.join(pickle_dir, 'mixed_test_set.p')) as f:
    mixed_test_set = pickle.load(f)
with open(os.path.join(pickle_dir, 'training_set.p')) as f:
    training_set = pickle.load(f)

In [7]:
time_offset_test_set.feature_extractor =\
mixed_test_set.feature_extractor =\
training_set.feature_extractor = feature_extractor

In [8]:
my_dict_vectorizer = DictVectorizer(sparse=False).fit(training_set.get_feature_dicts())

In [9]:
time_offset_test_set.dict_vectorizer = \
mixed_test_set.dict_vectorizer = \
training_set.dict_vectorizer = my_dict_vectorizer

time_offset_test_set.remove_zero_feature_vectors()
mixed_test_set.remove_zero_feature_vectors()
training_set.remove_zero_feature_vectors()

Articles removed because of zero feature vectors:
0 / 4453
Articles removed because of zero feature vectors:
0 / 3911
Articles removed because of zero feature vectors:
0 / 11717


In [132]:
my_classifier = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)

In [150]:
my_classifier = AdaBoostClassifier(DecisionTreeClassifier())

In [125]:
feature_array = np.array(training_set.get_feature_vectors())

In [126]:
label_array = np.array(training_set.get_labels())

## Dimensions

Feature array: 3812 x 11717

Label array: 1 x 11717



In [151]:
my_classifier.fit(feature_array, label_array)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)

Testing:

In [147]:
vectors = mixed_test_set.get_feature_vectors()
my_classifier.predict(vectors)

array([u'Measles', u'Measles', u'Measles', u'Measles', u'West Nile Virus',
       u'West Nile Virus', u'West Nile Virus', u'West Nile Virus',
       u'West Nile Virus', u'Measles', u'West Nile Virus',
       u'West Nile Virus', u'West Nile Virus', u'West Nile Virus',
       u'Measles', u'West Nile Virus', u'West Nile Virus',
       u'West Nile Virus', u'West Nile Virus', u'Measles',
       u'West Nile Virus', u'West Nile Virus', u'West Nile Virus',
       u'West Nile Virus', u'West Nile Virus', u'West Nile Virus',
       u'Measles', u'West Nile Virus', u'West Nile Virus', u'Measles',
       u'West Nile Virus', u'Measles', u'West Nile Virus', u'Measles',
       u'West Nile Virus', u'Measles', u'Measles', u'West Nile Virus',
       u'West Nile Virus', u'Measles', u'West Nile Virus',
       u'West Nile Virus', u'Measles', u'Measles', u'West Nile Virus',
       u'West Nile Virus', u'West Nile Virus', u'Measles',
       u'West Nile Virus', u'West Nile Virus', u'Measles',
       u'West Nile 

In [115]:
y_true = training_set.get_labels()[0:20]
y_pred = my_classifier.predict(training_set.get_feature_vectors())
# y_pred = [[y] for y in y_pred_flat]
print y_pred

[(u'Influenza',), (u'Measles',), (u'West Nile Virus',), (u'Influenza',), (u'Waterborne Illness',), (u'West Nile Virus',), (), (u'Measles',), (u'Measles',), (u'Measles',), (), (u'Waterborne Illness',), (u'Measles',), (), (u'Influenza',), (u'West Nile Virus',), (u'Influenza',), (u'Waterborne Illness',), (u'West Nile Virus',), (u'Waterborne Illness',), (u'Influenza',), (u'West Nile Virus',), (u'Measles',), (), (), (u'Measles',), (u'Influenza',), (u'West Nile Virus',), (), (u'Waterborne Illness',), (u'Influenza',), (), (), (u'Measles',), (u'West Nile Virus',), (u'Measles',), (u'Influenza',), (u'West Nile Virus',), (), (), (), (), (), (), (), (), (u'Waterborne Illness',), (), (u'Influenza',), (u'Measles',), (u'Influenza',), (u'West Nile Virus',), (u'Waterborne Illness',), (u'Influenza',), (), (), (u'Influenza',), (u'West Nile Virus',), (u'West Nile Virus',), (u'Waterborne Illness',), (u'West Nile Virus',), (u'Influenza',), (u'West Nile Virus',), (u'West Nile Virus',), (), (), (), (), (), (u

In [153]:
for data_set, ds_label, print_label_breakdown in [
    (training_set, "Training set", False),
    (time_offset_test_set, "Time offset set", True),
    (mixed_test_set, "Mixed test set", False),
]:
    y_pred_flat = my_classifier.predict(data_set.get_feature_vectors())
    y_pred = [[y] for y in y_pred_flat]
    print (ds_label + "\n"
        "precision: %s recall: %s f-score: %s") %\
        sklearn.metrics.precision_recall_fscore_support(
            data_set.get_labels(),
            y_pred,
            average='micro')[0:3]

Training set
precision: 0.0976359136298 recall: 0.0896411220812 f-score: 0.0934678704195
Time offset set
precision: 0.211767347855 recall: 0.198861239983 f-score: 0.205111473627
Mixed test set
precision: 0.100230120174 recall: 0.0904684975767 f-score: 0.0950994662785


### Logistic regression (old diagnosis method):

Training set (micro avg):
precision: 0.939472644119 recall: 0.940428418018 f-score: 0.939950288103

Time offset test set (micro avg):
precision: 0.801928783383 recall: 0.846382712183 f-score: 0.823556300472

Mixed test set (micro avg):
precision: 0.864066193853 recall: 0.856975381008 f-score: 0.860506180106

# m = 5
## OneVsMany(SVC)
Training set
precision: 0.108560211658 recall: 0.0996708979784 f-score: 0.103925813963

Time offset set
precision: 0.180327868852 recall: 0.169337832138 f-score: 0.174660141381

Mixed test set
precision: 0.11454870877 recall: 0.103392568659 f-score: 0.108685104318

## OneVsMany(Logistic Regression)
Training set
precision: 0.10745071264 recall: 0.0986522488638 f-score: 0.102863679072

Time offset set
precision: 0.169773186616 recall: 0.159426402362 f-score: 0.164437194127

Mixed test set
precision: 0.111991817949 recall: 0.101084698823 f-score: 0.106259097525

## Decision Tree Regression
Training set
precision: 0.0847486557993 recall: 0.0778091208275 f-score: 0.0811307651456

Time offset set
precision: 0.12194026499 recall: 0.114508646141 f-score: 0.11810766721

Mixed test set
precision: 0.0915366913833 recall: 0.0826217401339 f-score: 0.0868510431829

## AdaBoost(Decision Tree Regression)
Training set
precision: 0.0976359136298 recall: 0.0896411220812 f-score: 0.0934678704195

Time offset set
precision: 0.211767347855 recall: 0.198861239983 f-score: 0.205111473627

Mixed test set
precision: 0.100230120174 recall: 0.0904684975767 f-score: 0.0950994662785
