In [10]:
# load the training/validation resources and ontology data from AWS
from boto.s3.connection import S3Connection, Location
import datetime
import os
import pickle
import diagnosis
from diagnosis.KeywordExtractor import *
from diagnosis.Diagnoser import Diagnoser
import numpy as np
import re
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
from diagnosis.utils import group_by, flatten
import warnings
import pymongo
import test_classifier
from DataSet import fetch_datasets

In [2]:
# Load the ontology keyword objects from the local pickle file.
# Pickle data is a byte stream, so the file is opened in binary mode ('rb');
# text mode only happens to work on Python 2 / POSIX and can corrupt or
# reject the stream elsewhere.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load an ontologies.p that comes from a trusted source.
with open('ontologies.p', 'rb') as f:
    keywords = pickle.load(f)

In [3]:
# Keyword categories that are allowed to contribute features, grouped by the
# ontology source that forms the prefix of each category name.
categories = {
    # HealthMap
    'hm/disease',
    # BioCaster
    'biocaster/pathogens',
    'biocaster/diseases',
    'biocaster/symptoms',
    # Symptom ontology
    'symp/symptoms',
    # EHA
    'eha/symptom',
    'eha/mode of transmission',
    'eha/environmental factors',
    'eha/vector',
    'eha/occupation',
    'eha/control measures',
    'eha/description of infected',
    'eha/disease category',
    'eha/host',
    'eha/host use',
    'eha/disease',
    'eha/location',
    'eha/transmission',
    'eha/zoonotic type',
    'eha/risk',
    # WordNet
    'wordnet/season',
    'wordnet/climate',
    'wordnet/pathogens',
    'wordnet/hosts',
    'wordnet/mod/severe',
    'wordnet/mod/painful',
    'wordnet/mod/large',
    'wordnet/mod/rare',
    # Disease ontology
    'doid/has_symptom',
    'doid/symptoms',
    'doid/transmitted_by',
    'doid/located_in',
    'doid/diseases',
    'doid/results_in',
    'doid/has_material_basis_in',
    # USGS
    'usgs/terrain',
}

In [4]:
# Keep only the keyword objects whose category is in the whitelist above,
# preserving their original order.
keyword_array = []
for keyword_obj in keywords:
    if keyword_obj['category'] in categories:
        keyword_array.append(keyword_obj)

In [5]:
# Three-stage feature extraction pipeline built from the filtered keywords.
# NOTE(review): the stage classes come from diagnosis.KeywordExtractor via a
# star import; their exact behavior is not visible here -- presumably keyword
# matching, adding linked keywords, then capping each count at 1. Confirm.
feature_extractor = Pipeline(
    steps=[
        ('kwext', KeywordExtractor(keyword_array)),
        ('link', LinkedKeywordAdder(keyword_array)),
        ('limit', LimitCounts(1)),
    ]
)

In [6]:
# fetch_datasets() returns the three splits in this order.
(time_offset_test_set,
 mixed_test_set,
 training_set) = fetch_datasets()

# Every split uses the same feature-extraction pipeline object.
time_offset_test_set.feature_extractor = feature_extractor
mixed_test_set.feature_extractor = feature_extractor
training_set.feature_extractor = feature_extractor

time_offset_test_set size 4748  | rejected items: 1540
mixed_test_set size 4017  | rejected items: 855
training_set size 12070  | rejected items: 2427


In [7]:
# Fit the feature-dict -> dense-vector mapping on the training split only;
# the same fitted vectorizer is later shared with the test splits.
my_dict_vectorizer = DictVectorizer(
    sparse=False,
).fit(
    training_set.get_feature_dicts()
)

In [8]:
# Share the single fitted vectorizer with every split ...
time_offset_test_set.dict_vectorizer = my_dict_vectorizer
mixed_test_set.dict_vectorizer = my_dict_vectorizer
training_set.dict_vectorizer = my_dict_vectorizer

# ... then drop the articles whose feature vector is all zeros. Each call
# reports how many articles it removed.
time_offset_test_set.remove_zero_feature_vectors()
mixed_test_set.remove_zero_feature_vectors()
training_set.remove_zero_feature_vectors()

Articles removed because of zero feature vectors:
295 / 4748
Articles removed because of zero feature vectors:
106 / 4017
Articles removed because of zero feature vectors:
353 / 12070


In [11]:
# One-vs-rest wrapper around a linear regression base estimator (one fitted
# estimator per label); n_jobs=-1 lets scikit-learn use all available cores.
my_classifier = OneVsRestClassifier(
    estimator=LinearRegression(),
    n_jobs=-1,
)

In [12]:
# Materialize the training split's feature vectors as a NumPy array; this is
# the X argument passed to my_classifier.fit below.
feature_array = np.array(training_set.get_feature_vectors())

In [None]:
# Materialize the training split's labels as a NumPy array; this is the y
# argument passed to my_classifier.fit below.
# NOTE(review): labels look multi-valued per article (they are flattened
# elsewhere in this notebook) -- confirm the resulting array shape/dtype.
label_array = np.array(training_set.get_labels())

In [None]:
# Train the one-vs-rest classifier on the training split.
my_classifier.fit(
    X=feature_array,
    y=label_array,
)



Testing:

In [None]:
# Wrap the trained classifier and fitted vectorizer in a Diagnoser, which the
# evaluation cell below uses for its best_guess() predictions.
# NOTE(review): the meaning of cutoff_ratio is defined by Diagnoser and is not
# visible in this notebook -- confirm before tuning it.
my_diagnoser = Diagnoser(
    my_classifier,
    my_dict_vectorizer,
    keyword_array=keyword_array,
    cutoff_ratio=0.7,
)

In [None]:
# Evaluate the trained diagnoser on each data set and report micro-averaged
# precision / recall / F-score, one report per data set.

# Imported explicitly: a bare `import sklearn` does not guarantee that the
# sklearn.metrics submodule has been loaded as an attribute.
import sklearn.metrics

with warnings.catch_warnings():
    # Suppress every warning raised while predicting and scoring.
    warnings.simplefilter("ignore")
    train_label_set = set(flatten(training_set.get_labels(), 1))
    for data_set, ds_label, print_label_breakdown in [
        (training_set, "Training set", False),
        (time_offset_test_set, "Time offset set", True),
        (mixed_test_set, "Mixed test set", False),
    ]:
        # Nothing to score for an empty split.
        if len(data_set) == 0:
            continue

        # Labels that occur in this data set but were never seen in training.
        # NOTE(review): not_in_train and print_label_breakdown are not used in
        # the visible part of this cell; they are kept in case code beyond
        # this point relies on them.
        validation_label_set = set(flatten(data_set.get_labels(), 1))
        not_in_train = [
            label for label in validation_label_set
            if (label not in train_label_set)
        ]

        # For each article, map the class indices returned by best_guess()
        # back to the classifier's class labels.
        predictions = [
            tuple([
                my_diagnoser.classifier.classes_[i]
                for i, p in my_diagnoser.best_guess(X)
            ])
            for X in data_set.get_feature_vectors()
        ]

        # Keep (precision, recall, f-score); the fourth element (support) is
        # dropped.
        scores = sklearn.metrics.precision_recall_fscore_support(
            data_set.get_labels(add_parents=True),
            predictions,
            average='micro')[0:3]

        # Header names the data set being scored (it was previously always
        # "Validation set"). The whole message is formatted before printing,
        # so this works both as a Python 2 print statement and as a Python 3
        # print() call.
        print(
            "%s (micro avg):\n"
            "precision: %s recall: %s f-score: %s"
            % ((ds_label,) + tuple(scores))
        )