In [18]:
# load the training/validation resources and ontology data from AWS
from boto.s3.connection import S3Connection, Location
import datetime
import os
import pickle
import diagnosis
from diagnosis.KeywordExtractor import *
from diagnosis.Diagnoser import Diagnoser
import numpy as np
import re
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from diagnosis.utils import group_by, flatten
import warnings
import pymongo
from DataSet import fetch_datasets

In [2]:
# Load the pickled ontology keywords.
# Pickles are binary data, so the file is opened with 'rb': text mode breaks
# on Python 3 and can corrupt the stream on platforms that translate newlines.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('ontologies.p', 'rb') as f:
    keywords = pickle.load(f)

In [3]:
# Ontology categories whose keywords are kept as features, grouped by their
# source prefix. 'eha/symptom' used to be listed twice; the duplicate entry
# was redundant inside a set and has been dropped (membership is unchanged).
categories = set([
    # hm
    'hm/disease',
    # biocaster
    'biocaster/pathogens',
    'biocaster/diseases',
    'biocaster/symptoms',
    # symp
    'symp/symptoms',
    # eha
    'eha/symptom',
    'eha/mode of transmission',
    'eha/environmental factors',
    'eha/vector',
    'eha/occupation',
    'eha/control measures',
    'eha/description of infected',
    'eha/disease category',
    'eha/host',
    'eha/host use',
    'eha/disease',
    'eha/location',
    'eha/transmission',
    'eha/zoonotic type',
    'eha/risk',
    # wordnet
    'wordnet/season',
    'wordnet/climate',
    'wordnet/pathogens',
    'wordnet/hosts',
    'wordnet/mod/severe',
    'wordnet/mod/painful',
    'wordnet/mod/large',
    'wordnet/mod/rare',
    # doid
    'doid/has_symptom',
    'doid/symptoms',
    'doid/transmitted_by',
    'doid/located_in',
    'doid/diseases',
    'doid/results_in',
    'doid/has_material_basis_in',
    # usgs
    'usgs/terrain',
])

In [4]:
# Keep only the ontology keywords whose category is one of the selected ones,
# preserving their original order.
keyword_array = []
for keyword_obj in keywords:
    if keyword_obj['category'] in categories:
        keyword_array.append(keyword_obj)

In [5]:
# Feature-extraction pipeline shared by all data sets; the steps run in the
# order listed and are all built from the filtered keyword list.
feature_extractor = Pipeline([
    # keyword extraction over the selected ontology keywords
    ('kwext', KeywordExtractor(keyword_array)),
    # presumably adds keywords linked to the ones found - TODO confirm in diagnosis.KeywordExtractor
    ('link', LinkedKeywordAdder(keyword_array)),
    # presumably caps every keyword count at 1 - TODO confirm
    ('limit', LimitCounts(1)),
])

Refresh dataset pickles:

In [None]:
# Uncomment to re-fetch the data sets and overwrite the cached pickles that
# the "Load dataset pickles" cell reads.
# NOTE: `pickle_dir` is only assigned in that later cell, so it must be
# defined before this code can run on a fresh kernel.
# time_offset_test_set, mixed_test_set, training_set = fetch_datasets()
# with open(os.path.join(pickle_dir, 'time_offset_test_set.p'), 'wb') as f:
#     pickle.dump(time_offset_test_set, f)
# with open(os.path.join(pickle_dir, 'mixed_test_set.p'), 'wb') as f:
#     pickle.dump(mixed_test_set, f)
# with open(os.path.join(pickle_dir, 'training_set.p'), 'wb') as f:
#     pickle.dump(training_set, f)

Load dataset pickles:

In [6]:
# Directory holding the cached data set pickles.
pickle_dir = 'pickles'

# The pickles are written in binary mode ('wb'), so they are read back with
# 'rb'; text mode fails on Python 3 and is unsafe where newlines are translated.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(os.path.join(pickle_dir, 'time_offset_test_set.p'), 'rb') as f:
    time_offset_test_set = pickle.load(f)
with open(os.path.join(pickle_dir, 'mixed_test_set.p'), 'rb') as f:
    mixed_test_set = pickle.load(f)
with open(os.path.join(pickle_dir, 'training_set.p'), 'rb') as f:
    training_set = pickle.load(f)

In [7]:
# Every data set uses the same keyword feature-extraction pipeline.
time_offset_test_set.feature_extractor = feature_extractor
mixed_test_set.feature_extractor = feature_extractor
training_set.feature_extractor = feature_extractor

In [8]:
# Dense (sparse=False) DictVectorizer whose feature vocabulary is learned
# from the training set's feature dicts only.
my_dict_vectorizer = DictVectorizer(sparse=False)
my_dict_vectorizer.fit(training_set.get_feature_dicts())

In [9]:
# Share the vectorizer fitted on the training data with every data set.
time_offset_test_set.dict_vectorizer = my_dict_vectorizer
mixed_test_set.dict_vectorizer = my_dict_vectorizer
training_set.dict_vectorizer = my_dict_vectorizer

# Drop articles whose feature vector is all zeros; each call reports how many
# articles it removed.
time_offset_test_set.remove_zero_feature_vectors()
mixed_test_set.remove_zero_feature_vectors()
training_set.remove_zero_feature_vectors()

Articles removed because of zero feature vectors:
0 / 4453
Articles removed because of zero feature vectors:
0 / 3911
Articles removed because of zero feature vectors:
0 / 11717


In [10]:
# Alternative model: one-vs-rest logistic regression, currently disabled.
# The metrics recorded under "Logistic regression" at the end of the notebook
# are labelled as coming from this kind of model.
# my_classifier = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)

In [37]:
# One-vs-rest decision trees, one per label, trained in parallel (n_jobs=-1).
# The base estimator is a classifier rather than DecisionTreeRegressor:
# OneVsRestClassifier.predict_proba calls predict_proba on every fitted base
# estimator, and a regressor only provides predict.
# random_state is fixed so that re-running the notebook grows the same trees.
# NOTE: the metrics recorded at the end of the notebook are labelled
# "Logistic regression", so they do not describe this model.
my_classifier = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)

In [11]:
# Collect the training set's feature vectors into a single NumPy array; this
# is the X argument passed to my_classifier.fit.
feature_array = np.array(training_set.get_feature_vectors())

In [35]:
# Training labels as a NumPy array; this is the y argument passed to
# my_classifier.fit.
# NOTE(review): presumably aligned entry-for-entry with feature_array - confirm in DataSet.
label_array = np.array(training_set.get_labels())

## Dimensions

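Feature array: 3812 features x 11717 articles (the array passed to `fit` has one row per article)

Label array: 1 x 11717 (one label entry per article)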



In [None]:
# Train the one-vs-rest classifier on the training features and labels.
my_classifier.fit(feature_array, label_array)

Testing:

In [14]:
# Bundle the classifier, the fitted DictVectorizer and the filtered keyword
# list into the Diagnoser used by the evaluation cell below.
my_diagnoser = Diagnoser(
    my_classifier,
    my_dict_vectorizer,
    keyword_array=keyword_array,
    # NOTE(review): presumably a score threshold applied by best_guess - confirm in diagnosis.Diagnoser
    cutoff_ratio=.7
)

In [15]:
# Print micro-averaged precision / recall / F-score for each data set.
with warnings.catch_warnings():
    # Silence library warnings so that only the metric lines are printed.
    warnings.simplefilter("ignore")
    train_label_set = set(flatten(training_set.get_labels(), 1))
    for data_set, ds_label, print_label_breakdown in [
        (training_set, "Training set", False),
        (time_offset_test_set, "Time offset set", True),
        (mixed_test_set, "Mixed test set", False),
    ]:
        if len(data_set) == 0:
            continue

        # Labels of this data set that never occur in the training labels.
        # NOTE(review): not_in_train and print_label_breakdown are computed
        # but not used anywhere in this cell.
        validation_label_set = set(flatten(data_set.get_labels(), 1))
        not_in_train = [
            label for label in validation_label_set
            if (label not in train_label_set)
        ]

        # For every article, map the class indices returned by best_guess
        # to the classifier's class labels.
        predictions = [
            tuple([
                my_diagnoser.classifier.classes_[i]
                for i, p in my_diagnoser.best_guess(X)
            ])
            for X in data_set.get_feature_vectors()
        ]

        # Keep only (precision, recall, f-score); the 4th element is support.
        scores = sklearn.metrics.precision_recall_fscore_support(
            data_set.get_labels(add_parents=True),
            predictions,
            average='micro')[0:3]

        # Head each result with the data set's own label; the previous fixed
        # "Validation set" header made the three results indistinguishable.
        # The single parenthesised argument prints identically on Python 2
        # and is also valid Python 3.
        print(("%s (micro avg):\n"
               "precision: %s recall: %s f-score: %s") %
              ((ds_label,) + tuple(scores)))

Validation set (micro avg):
precision: 0.939472644119 recall: 0.940428418018 f-score: 0.939950288103
Validation set (micro avg):
precision: 0.801928783383 recall: 0.846382712183 f-score: 0.823556300472
Validation set (micro avg):
precision: 0.864066193853 recall: 0.856975381008 f-score: 0.860506180106


### Logistic regression:

Training set (micro avg):
precision: 0.939472644119 recall: 0.940428418018 f-score: 0.939950288103

Time offset test set (micro avg):
precision: 0.801928783383 recall: 0.846382712183 f-score: 0.823556300472

Mixed test set (micro avg):
precision: 0.864066193853 recall: 0.856975381008 f-score: 0.860506180106