In [51]:
# load the training/validation resources and ontology data from AWS
from boto.s3.connection import S3Connection, Location
import datetime
import os
import time
import pickle
import diagnosis
from diagnosis.KeywordExtractor import *
import numpy as np
import re
import sklearn
import disease_label_table
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from diagnosis.utils import group_by, flatten
import warnings
import pymongo
from DataSet import fetch_datasets

In [3]:
# Load the pickled ontology keywords.
# Pickle streams are binary data, so the file is opened in 'rb' mode: text mode
# can corrupt the stream on platforms that translate line endings and fails
# outright on Python 3.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load 'ontologies.p' from a trusted source.
with open('ontologies.p', 'rb') as f:
    keywords = pickle.load(f)

In [4]:
# Ontology categories whose keywords are kept as classifier features.
# Entries are grouped by the ontology prefix they come from; membership is
# tested with `in`, so ordering is irrelevant.
categories = {
    # hm
    'hm/disease',
    # biocaster
    'biocaster/pathogens',
    'biocaster/diseases',
    'biocaster/symptoms',
    # symp
    'symp/symptoms',
    # eha
    'eha/symptom',
    'eha/mode of transmission',
    'eha/environmental factors',
    'eha/vector',
    'eha/occupation',
    'eha/control measures',
    'eha/description of infected',
    'eha/disease category',
    'eha/host',
    'eha/host use',
    'eha/disease',
    'eha/location',
    'eha/transmission',
    'eha/zoonotic type',
    'eha/risk',
    # wordnet
    'wordnet/season',
    'wordnet/climate',
    'wordnet/pathogens',
    'wordnet/hosts',
    'wordnet/mod/severe',
    'wordnet/mod/painful',
    'wordnet/mod/large',
    'wordnet/mod/rare',
    # doid
    'doid/has_symptom',
    'doid/symptoms',
    'doid/transmitted_by',
    'doid/located_in',
    'doid/diseases',
    'doid/results_in',
    'doid/has_material_basis_in',
    # usgs
    'usgs/terrain',
}

In [5]:
# Keep only the ontology keyword entries whose 'category' is one of the
# selected categories, preserving their original order.
keyword_array = []
for keyword_obj in keywords:
    if keyword_obj['category'] in categories:
        keyword_array.append(keyword_obj)

In [6]:
# Feature-extraction pipeline applied to every data set. The three stages come
# from diagnosis.KeywordExtractor and run in the order listed; the first two
# are built from the filtered keyword list.
# NOTE(review): LimitCounts(1) presumably caps each keyword count at 1 — confirm.
feature_extractor = Pipeline(
    steps=[
        ('kwext', KeywordExtractor(keyword_array)),
        ('link', LinkedKeywordAdder(keyword_array)),
        ('limit', LimitCounts(1)),
    ]
)

In [7]:
def best_guess(classifier, X, cutoff_ratio=0.65, get_inferred_labels=None):
    """Return the most probable class indices for one sample, plus their parents.

    A class is selected when its predicted probability is at least
    ``cutoff_ratio`` times the highest predicted probability. For every
    selected class, each class in ``classifier.classes_`` that appears among
    its inferred labels is selected as well and receives the larger of the
    child's probability, its own probability and any value already recorded.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba`` and ``classes_``.
    X : a single feature vector (1-D) or a 2-D array; only the first row of the
        prediction is used.
    cutoff_ratio : fraction of the top probability a class must reach to be
        selected. Defaults to 0.65.
    get_inferred_labels : optional callable mapping a label to a container of
        its inferred (parent) labels. Defaults to
        ``disease_label_table.get_inferred_labels``.

    Returns
    -------
    list of ``(class_index, probability)`` pairs.
    """
    if get_inferred_labels is None:
        get_inferred_labels = disease_label_table.get_inferred_labels

    # Newer scikit-learn releases reject 1-D input to predict_proba, so a
    # single feature vector is promoted to a one-row 2-D array.
    if np.ndim(X) == 1:
        X = np.asarray(X).reshape(1, -1)

    probs = classifier.predict_proba(X)[0]
    labels = classifier.classes_
    threshold = max(probs) * cutoff_ratio

    result = {}
    for i, p in enumerate(probs):
        if p >= threshold:
            result[i] = max(p, result.get(i, 0))
            # The parent lookup is only needed for classes that pass the cutoff.
            parents = get_inferred_labels(labels[i])
            for i2, label in enumerate(labels):
                if label in parents:
                    result[i2] = max(p, probs[i2], result.get(i2, 0))
    # Materialise the pairs so callers get a list on Python 2 and Python 3 alike.
    return list(result.items())

Fetch Datasets:

In [8]:
# Unpack the three data sets returned by fetch_datasets() (imported from DataSet):
# the time-offset test set, the mixed test set and the training set, in that order.
time_offset_test_set, mixed_test_set, training_set = fetch_datasets()

time_offset_test_set size 4748  | rejected items: 1540
mixed_test_set size 4017  | rejected items: 855
training_set size 12070  | rejected items: 2427


Load dataset pickles:

In [9]:
# Attach the shared feature-extraction pipeline to each data set
# (time-offset test set first, then mixed test set, then training set).
time_offset_test_set.feature_extractor = feature_extractor
mixed_test_set.feature_extractor = feature_extractor
training_set.feature_extractor = feature_extractor

In [10]:
# Fit a dense (sparse=False) DictVectorizer on the training set's feature dicts
# only, so the feature columns are determined by the training data.
my_dict_vectorizer = DictVectorizer(sparse=False).fit(training_set.get_feature_dicts())

In [12]:
# Share the vectorizer fitted on the training data with every data set
# (time-offset test set first, then mixed test set, then training set).
time_offset_test_set.dict_vectorizer = my_dict_vectorizer
mixed_test_set.dict_vectorizer = my_dict_vectorizer
training_set.dict_vectorizer = my_dict_vectorizer

# Drop the articles whose feature vector is all zeros from each data set.
time_offset_test_set.remove_zero_feature_vectors()
mixed_test_set.remove_zero_feature_vectors()
training_set.remove_zero_feature_vectors()

Articles removed because of zero feature vectors:
295 / 4748
Articles removed because of zero feature vectors:
106 / 4017
Articles removed because of zero feature vectors:
353 / 12070


In [88]:
# Classifier option: one-vs-rest logistic regression.
# Every classifier cell rebinds my_classifier, so only the last one executed
# before fitting takes effect.
my_classifier = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)

In [80]:
# Classifier option: one-vs-rest SVC.
# Every classifier cell rebinds my_classifier, so only the last one executed
# before fitting takes effect.
# NOTE(review): the evaluation code calls predict_proba; SVC() with default
# arguments may not provide probability estimates — confirm.
my_classifier = OneVsRestClassifier(SVC(), n_jobs=-1)

In [83]:
# Classifier option: a single decision tree.
# Every classifier cell rebinds my_classifier, so only the last one executed
# before fitting takes effect.
my_classifier = DecisionTreeClassifier()

In [91]:
# Classifier option: AdaBoost over decision trees.
# Every classifier cell rebinds my_classifier, so only the last one executed
# before fitting takes effect.
# NOTE(review): no random_state is set, so results may vary between runs.
my_classifier = AdaBoostClassifier(DecisionTreeClassifier())

In [93]:
# Training feature matrix passed to my_classifier.fit.
# NOTE(review): the [0:10] slice keeps only the first 10 training vectors, so
# the classifier is fitted on 10 samples — this looks like a debugging
# leftover; confirm and drop the slice (here and on label_array) for a full run.
feature_array = np.array(training_set.get_feature_vectors()[0:10])

In [94]:
# Training labels passed to my_classifier.fit.
# NOTE(review): the [0:10] slice keeps only the first 10 labels and must stay
# in sync with the slice on feature_array — looks like a debugging leftover.
# NOTE(review): the fit output shows a column_or_1d warning, which suggests this
# array is a 2-D column rather than 1-D — confirm whether it should be flattened.
label_array = np.array(training_set.get_labels()[0:10])

## Dimensions

Feature array (full training set): 11717 x 3812 (articles x features)

Label array (full training set): 11717 x 1



In [95]:
# Fit the selected classifier and report how long training took.
# time.time() measures elapsed wall-clock time. time.clock() reported CPU time
# of the current process on Unix (so work done in n_jobs worker processes was
# not counted) and no longer exists in Python 3.8+.
before_time = time.time()
my_classifier.fit(feature_array, label_array)
after_time = time.time()
# The last expression is the cell's displayed output.
"Training time: " + str(after_time - before_time)

  y = column_or_1d(y, warn=True)


'Training time: 0.101526'

Testing:

In [97]:
# Evaluate the fitted classifier on each data set: predict a label set for every
# feature vector with best_guess, then print micro-averaged precision, recall
# and f-score against the true labels (with parent labels added), followed by
# the elapsed wall-clock time.
# NOTE(review): print_label_breakdown is unpacked but never used in this loop.
for data_set, ds_label, print_label_breakdown in [
    (training_set, "Training set", False),
    (time_offset_test_set, "Time offset set", True),
    (mixed_test_set, "Mixed test set", False),
]:
    # time.time() gives wall-clock time; time.clock() measured CPU time on Unix
    # and was removed in Python 3.8.
    before_time = time.time()
    guesses = [best_guess(my_classifier, vector) for vector in data_set.get_feature_vectors()]
    # Map each guessed class index back to its label; the probability is not needed.
    y_pred = []
    for guess in guesses:
        y_pred.append([my_classifier.classes_[i] for (i, p) in guess])
    # Keep only (precision, recall, f-score); the fourth element is dropped.
    scores = sklearn.metrics.precision_recall_fscore_support(
        data_set.get_labels(add_parents=True),
        y_pred,
        average='micro')[0:3]
    # Format the string before printing so the statement behaves the same as a
    # Python 2 print statement and a Python 3 print() call.
    print((ds_label + "\n"
           "precision: %s recall: %s f-score: %s") % scores)
    after_time = time.time()
    print("Testing time: " + str(after_time - before_time))



Training set
precision: 0.119911240079 recall: 0.0794099361329 f-score: 0.095545732744
Testing time: 3.001529
Time offset set
precision: 0.179878733438 recall: 0.125430629502 f-score: 0.14779961251




Testing time: 1.109504
Mixed test set
precision: 0.116594221427 recall: 0.0763691174008 f-score: 0.0922890103218
Testing time: 0.999387




### Logistic regression (old diagnosis method):

Training set (micro avg):
precision: 0.939472644119 recall: 0.940428418018 f-score: 0.939950288103

Time offset test set (micro avg):
precision: 0.801928783383 recall: 0.846382712183 f-score: 0.823556300472

Mixed test set (micro avg):
precision: 0.864066193853 recall: 0.856975381008 f-score: 0.860506180106

## OneVsRest(SVC)
Training time: 8.834822

Training set
precision: 0.108560211658 recall: 0.0996708979784 f-score: 0.103925813963

Time offset set
precision: 0.180327868852 recall: 0.169337832138 f-score: 0.174660141381

Mixed test set
precision: 0.11454870877 recall: 0.103392568659 f-score: 0.108685104318

## OneVsRest(Logistic Regression)
Training time: 4.495964

Training set
precision: 0.935211109867 recall: 0.943932628723 f-score: 0.939551630053

Time offset set
precision: 0.794883040936 recall: 0.851393673661 f-score: 0.822168456071

Mixed test set
precision: 0.858002004678 recall: 0.860157427567 f-score: 0.859078364138

## Decision Tree Classifier
Training time: 17.307937

Training set
precision: 0.0847486557993 recall: 0.0778091208275 f-score: 0.0811307651456

Time offset set
precision: 0.12194026499 recall: 0.114508646141 f-score: 0.11810766721

Mixed test set
precision: 0.0915366913833 recall: 0.0826217401339 f-score: 0.0868510431829

## AdaBoost(Decision Tree Classifier)
Training set
precision: 0.0976359136298 recall: 0.0896411220812 f-score: 0.0934678704195

Time offset set
precision: 0.211767347855 recall: 0.198861239983 f-score: 0.205111473627

Mixed test set
precision: 0.100230120174 recall: 0.0904684975767 f-score: 0.0950994662785
