In [4]:
# load the training/validation resources and ontology data from AWS
# Standard library
import datetime
import os
import pickle
import re
import time
import warnings

# Third-party
import numpy as np
import pymongo
import sklearn
import sklearn.metrics
from boto.s3.connection import S3Connection, Location
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Project-local
import diagnosis
import disease_label_table
from diagnosis.KeywordExtractor import *
from diagnosis.utils import group_by, flatten
from DataSet import fetch_datasets

In [5]:
# Load the ontology keyword entries from a local pickle file.
# Pickle data is binary, so the file is opened with 'rb': text mode can
# corrupt binary pickle protocols on Windows and fails outright on Python 3.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load 'ontologies.p' if it comes from a trusted source.
with open('ontologies.p', 'rb') as f:
    keywords = pickle.load(f)

In [6]:
# Whitelist of ontology categories whose keywords are used as features.
# Entries are "<source>/<category>" strings; they are compared against each
# keyword's 'category' value. 'eha/symptom' was previously listed twice; the
# redundant entry has been dropped (the resulting set is unchanged).
categories = {
    # hm
    'hm/disease',
    # biocaster
    'biocaster/pathogens',
    'biocaster/diseases',
    'biocaster/symptoms',
    # symp
    'symp/symptoms',
    # eha
    'eha/symptom',
    'eha/mode of transmission',
    'eha/environmental factors',
    'eha/vector',
    'eha/occupation',
    'eha/control measures',
    'eha/description of infected',
    'eha/disease category',
    'eha/host',
    'eha/host use',
    'eha/disease',
    'eha/location',
    'eha/transmission',
    'eha/zoonotic type',
    'eha/risk',
    # wordnet
    'wordnet/season',
    'wordnet/climate',
    'wordnet/pathogens',
    'wordnet/hosts',
    'wordnet/mod/severe',
    'wordnet/mod/painful',
    'wordnet/mod/large',
    'wordnet/mod/rare',
    # doid
    'doid/has_symptom',
    'doid/symptoms',
    'doid/transmitted_by',
    'doid/located_in',
    'doid/diseases',
    'doid/results_in',
    'doid/has_material_basis_in',
    # usgs
    'usgs/terrain',
}

In [7]:
# Keep only the keyword entries whose 'category' is in the whitelist,
# preserving their original order.
keyword_array = []
for keyword_obj in keywords:
    if keyword_obj['category'] in categories:
        keyword_array.append(keyword_obj)

In [8]:
# Feature-extraction pipeline shared by every data set. The three stages run
# in the order listed; the first two are built from the filtered keyword list.
# NOTE(review): judging by the names, 'limit' caps each keyword count at 1 --
# confirm against diagnosis.KeywordExtractor.
feature_extractor = Pipeline(
    steps=[
        ('kwext', KeywordExtractor(keyword_array)),
        ('link', LinkedKeywordAdder(keyword_array)),
        ('limit', LimitCounts(1)),
    ]
)

In [9]:
def best_guess(classifier, X, cutoff_ratio=0.65):
    """Pick the most probable labels for one sample and add their parent labels.

    Only the first row returned by ``classifier.predict_proba(X)`` is used, so
    ``X`` should describe a single sample.

    A class is selected when its probability is at least ``cutoff_ratio`` times
    the highest class probability. For every selected class, each classifier
    class that appears in ``disease_label_table.get_inferred_labels(label)``
    is selected as well, scored with the larger of its own probability and the
    probability of the class that implied it.

    Args:
        classifier: fitted estimator exposing ``predict_proba`` and ``classes_``.
        X: input forwarded unchanged to ``classifier.predict_proba``.
        cutoff_ratio: fraction of the top probability a class must reach to be
            selected. Defaults to 0.65, the value previously hard-coded.

    Returns:
        A list of ``(class_index, score)`` pairs, where ``class_index`` indexes
        into ``classifier.classes_``. Order is not significant.
    """
    probs = classifier.predict_proba(X)[0]
    classes = classifier.classes_
    threshold = max(probs) * cutoff_ratio
    result = {}

    for i, p in enumerate(probs):
        if p >= threshold:
            result[i] = max(p, result.get(i, 0))
            # Parent labels are only needed for classes that pass the cutoff,
            # so the lookup is done here rather than once per class.
            parents = disease_label_table.get_inferred_labels(classes[i])
            for i2, label in enumerate(classes):
                if label in parents:
                    result[i2] = max(p, probs[i2], result.get(i2, 0))
    # Materialize so callers get a list on both Python 2 and Python 3.
    return list(result.items())

Fetch Datasets:

In [11]:
# fetch_datasets() returns three data sets, unpacked positionally: the
# time-offset test set, the mixed test set, and the training set.
time_offset_test_set, mixed_test_set, training_set = fetch_datasets()

time_offset_test_set size 4748  | rejected items: 1540
mixed_test_set size 4017  | rejected items: 855
training_set size 12070  | rejected items: 2427


In [12]:
# Attach the same feature-extraction pipeline object to all three data sets.
time_offset_test_set.feature_extractor = feature_extractor
mixed_test_set.feature_extractor = feature_extractor
training_set.feature_extractor = feature_extractor

In [13]:
# Build a dense-output DictVectorizer and fit it on the training set's
# feature dicts only.
my_dict_vectorizer = DictVectorizer(sparse=False)
my_dict_vectorizer.fit(training_set.get_feature_dicts())

In [14]:
# Attach the fitted vectorizer to every data set first ...
time_offset_test_set.dict_vectorizer = my_dict_vectorizer
mixed_test_set.dict_vectorizer = my_dict_vectorizer
training_set.dict_vectorizer = my_dict_vectorizer

# ... then call remove_zero_feature_vectors() on each of them, in the same
# order as before so the printed report keeps its order.
for _current_set in (time_offset_test_set, mixed_test_set, training_set):
    _current_set.remove_zero_feature_vectors()
del _current_set

Articles removed because of zero feature vectors:
295 / 4748
Articles removed because of zero feature vectors:
106 / 4017
Articles removed because of zero feature vectors:
353 / 12070


In [41]:
# Training feature vectors as a NumPy array; passed to fit() as the samples.
feature_array = np.array(training_set.get_feature_vectors())

In [42]:
# Training labels as a NumPy array; passed to fit() as the targets.
label_array = np.array(training_set.get_labels())

Testing:

In [26]:
# Fixed seed for every estimator that accepts one, so that repeated runs of
# the benchmark produce the same models and scores.
RANDOM_SEED = 0

# Each entry is (estimator, display label, add_parents). When add_parents is
# True the evaluation loop predicts through best_guess(); otherwise it calls
# the estimator's own predict().
classifiers = [
    (
        OneVsRestClassifier(LogisticRegression(random_state=RANDOM_SEED), n_jobs=-1),
        "OneVsRest(Logistic Regression)",
        True,
    ),
    (
        DecisionTreeClassifier(random_state=RANDOM_SEED),
        "Decision Tree Classifier",
        False,
    ),
    (
        AdaBoostClassifier(
            DecisionTreeClassifier(random_state=RANDOM_SEED),
            random_state=RANDOM_SEED,
        ),
        "AdaBoost(Decision Tree Classifier)",
        False,
    ),
    (
        OneVsRestClassifier(
            SVC(probability=True, random_state=RANDOM_SEED), n_jobs=-1
        ),
        "OneVsRest(SVC)",
        True,
    ),
]

In [43]:
# Train every configured classifier on the training arrays, score it on the
# mixed test set (micro-averaged precision / recall / f-score) and report how
# long training and testing took.
with warnings.catch_warnings():
    # The updated version of scikit will spam warnings here.
    warnings.simplefilter("ignore")

    for (my_classifier, classifier_label, add_parents) in classifiers:
        print("### " + classifier_label)
        # Wall-clock timing. time.clock() was removed in Python 3.8 and, on
        # Unix, only counted CPU time of this process, which misses work done
        # in the n_jobs=-1 worker processes.
        before_time = time.time()
        my_classifier.fit(feature_array, label_array)
        after_time = time.time()
        print("Training time: " + str(after_time - before_time) + "\n")

        before_time = time.time()
        data_set = mixed_test_set.get_feature_vectors()

        y_true = mixed_test_set.get_labels()
        if add_parents:
            # Each vector is wrapped in a list so predict_proba receives a
            # one-row 2-D batch; best_guess reads the first (only) row.
            guesses = [best_guess(my_classifier, [vector]) for vector in data_set]
            # Translate the selected class indices back into labels.
            y_pred = [
                [my_classifier.classes_[i] for (i, p) in guess]
                for guess in guesses
            ]
        else:
            y_pred = my_classifier.predict(data_set)
        # First three values are precision, recall and f-score; the fourth
        # (support) is dropped.
        scores = sklearn.metrics.precision_recall_fscore_support(
            y_true, y_pred, average='micro')[0:3]
        # Format inside the call so this works both as a Python 2 print
        # statement and as the Python 3 print function.
        print(("Mixed test set" + " (micro) \n"
               "precision: %s recall: %s f-score: %s\n") % scores)
        after_time = time.time()
        print("Testing time: " + str(after_time - before_time) + '\n')


### OneVsRest(Logistic Regression)
Training time: 5.552573

Mixed test set (micro) 
precision: 0.610424323421 recall: 0.843295638126 f-score: 0.708208159705

Testing time: 48.357955

### Decision Tree Classifier
Training time: 16.90565

Mixed test set (micro) 
precision: 0.762287334594 recall: 0.744518809139 f-score: 0.753298307064

Testing time: 1.461998

### AdaBoost(Decision Tree Classifier)
Training time: 1370.665474

Mixed test set (micro) 
precision: 0.812804284323 recall: 0.770597738288 f-score: 0.7911384907

Testing time: 7.755878

### OneVsRest(SVC)
Training time: 13.489025

Mixed test set (micro) 
precision: 0.580855461918 recall: 0.843064851142 f-score: 0.687817736773

Testing time: 504.577304



## m = 500

### OneVsRest(Logistic Regression)
Training time: 6.332283

Mixed test set (micro) 
precision: 0.400906953309 recall: 0.550888529887 f-score: 0.464080878779

Testing time: 20.615556

### Decision Tree Classifier
Training time: 0.12318

Mixed test set (micro) 
precision: 0.476247600768 recall: 0.458112162474 f-score: 0.467003881896

Testing time: 1.44014

### AdaBoost(Decision Tree Classifier)
Training time: 6.071795

Mixed test set (micro) 
precision: 0.490306748466 recall: 0.461112393261 f-score: 0.475261655566

Testing time: 2.950545

### OneVsRest(SVC)
Training time: 7.336626

Mixed test set (micro) 
precision: 0.395055499495 recall: 0.54211862451 f-score: 0.457048351007

Testing time: 61.903635

## m = 1000

### OneVsRest(Logistic Regression)
Training time: 14.271383

Mixed test set (micro) 
precision: 0.447527749748 recall: 0.614124163397 f-score: 0.517754645394

Testing time: 29.604462

### Decision Tree Classifier
Training time: 0.407042

Mixed test set (micro) 
precision: 0.481132075472 recall: 0.506115855066 f-score: 0.493307839388

Testing time: 1.418815

### AdaBoost(Decision Tree Classifier)
Training time: 23.979863

Mixed test set (micro) 
precision: 0.514231945425 recall: 0.50450034618 f-score: 0.509319664492

Testing time: 3.718768

### OneVsRest(SVC)
Training time: 15.213411

Mixed test set (micro) 
precision: 0.443290190736 recall: 0.600738518348 f-score: 0.510142087212

Testing time: 105.059295

## m = 1500

### OneVsRest(Logistic Regression)
Training time: 22.132444

Mixed test set (micro) 
precision: 0.455324760963 recall: 0.637433648742 f-score: 0.53120492355

Testing time: 31.525444

### Decision Tree Classifier
Training time: 0.730307

Mixed test set (micro) 
precision: 0.549645390071 recall: 0.536579736903 f-score: 0.543033983417

Testing time: 1.496526

### AdaBoost(Decision Tree Classifier)
Training time: 42.194042

Mixed test set (micro) 
precision: 0.565987384765 recall: 0.538426032772 f-score: 0.551862803075

Testing time: 3.969732

### OneVsRest(SVC)
Training time: 24.355333

Mixed test set (micro) 
precision: 0.438128140704 recall: 0.643895684283 f-score: 0.521446593776

Testing time: 127.603342

## m = 5000
### OneVsRest(Logistic Regression)
Training time: 3.940271

Mixed test set (micro) 
precision: 0.592518703242 recall: 0.822524809601 f-score: 0.688828759181

Testing time: 38.07443

### Decision Tree Classifier
Training time: 4.146893

Mixed test set (micro) 
precision: 0.741920152091 recall: 0.720516962843 f-score: 0.731061936541

Testing time: 1.408146

### AdaBoost(Decision Tree Classifier)
Training time: 269.452073

Mixed test set (micro) 
precision: 0.774327628362 recall: 0.730902377106 f-score: 0.751988602636

Testing time: 5.184815

### OneVsRest(SVC)
Training time: 7.266914

Mixed test set (micro) 
precision: 0.564147348365 recall: 0.819986152781 f-score: 0.668422537861

Testing time: 281.247301
