This notebook trains a random forest (RF) agent and a naive Bayes (NB) agent on the extended data.

A separate dataset would be used to train the RL agent.

The downside of this approach is that the RL agent relies heavily on the RF and NB models, so its performance is bounded by their accuracy.

In [1]:
# Earlier value of module_dir, kept for reference.
#module_dir = "/Users/teliov/TUD/symcat-to-synthea/output/module_ai_med_extended"
# Directory handed to process.get_symptom_condition_map in a later cell.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
module_dir = "/home/gzr/文档/medvice/thesis/data/NLICE"

In [1]:
from thesislib.utils.ml import process
from sklearn.model_selection import StratifiedShuffleSplit
import pathlib
import json
import os
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models
import pandas as pd

In [None]:
# Build the symptom and condition maps from module_dir; both maps are
# written to JSON files in later cells.
symptom_map, condition_map = process.get_symptom_condition_map(module_dir)
print(symptom_map)

In [2]:
import pathlib

In [None]:
# Earlier value of data_dir, kept for reference.
#data_dir = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/06_18_nlice_plus/extended"
# Base output directory for the JSON maps and the split data.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_dir = "/home/gzr/文档/medvice/thesis/data/basic"
# Create the directory (and any missing parents); no error if it already exists.
pathlib.Path(data_dir).mkdir(exist_ok=True, parents=True)

In [3]:
import json
import os

In [None]:
# Persist the symptom map as indented JSON inside data_dir.
# NOTE(review): this writes "symptom_db.json", while later cells read
# "./data/basic/symptoms_db.json" — confirm which file name is intended.
symptom_map_file = os.path.join(data_dir, "symptom_db.json")
with open(symptom_map_file, "w") as fp:
    json.dump(symptom_map, fp, indent=4)

In [None]:
# Persist the condition map as indented JSON inside data_dir.
# NOTE(review): this writes "condition_db.json", while later cells read
# "./data/basic/conditions_db.json" — confirm which file name is intended.
condition_map_file = os.path.join(data_dir, "condition_db.json")
with open(condition_map_file, "w") as fp:
    json.dump(condition_map, fp, indent=4)

In [None]:
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models
import pandas as pd

In [None]:
# Earlier value of data_csv, kept for reference.
#data_csv = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/06_18_nlice_plus/ai/output_med_ai_ext/symptoms/csv/symptoms.csv"
# Symptoms CSV that split_data divides into train and test files.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_csv = "/home/gzr/文档/medvice/thesis/data/nlice100k/symptoms/csv/symptoms.csv"

In [None]:
def split_data(symptom_file, output_path, use_headers=False, train_split=0.8, random_state=None):
    """Split a symptoms CSV into stratified train and test CSV files.

    The file is loaded, its first row (index label 0) is discarded, and the
    remaining rows are split once with ``StratifiedShuffleSplit`` using the
    ``PATHOLOGY`` column as the stratification label. The two parts are
    written to ``train.csv`` and ``test.csv`` inside ``output_path``.

    :param symptom_file: path of the CSV file to split.
    :param output_path: directory for the output files; created if missing.
    :param use_headers: when True the file is read as comma-separated with the
        fixed column names listed below; otherwise it is read as tab-separated
        with pandas' default header handling.
    :param train_split: value passed as ``train_size`` to the splitter.
    :param random_state: optional seed passed to the splitter so the split can
        be reproduced; ``None`` keeps the previous non-deterministic behaviour.
    :return: tuple ``(train_csv_path, test_csv_path)``.

    NOTE(review): row 0 is dropped in both read modes. With ``use_headers=True``
    that row is presumably the file's own header line read as data; with
    ``use_headers=False`` it may be a genuine record — confirm.
    """
    symptom_columns = ['PATIENT', 'GENDER', 'RACE', 'ETHNICITY', 'AGE_BEGIN', 'AGE_END',
                       'PATHOLOGY', 'NUM_SYMPTOMS', 'SYMPTOMS']

    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

    if use_headers:
        df = pd.read_csv(symptom_file, names=symptom_columns)
    else:
        df = pd.read_csv(symptom_file, sep="\t")
    df.index.name = "Index"

    # Rows and labels must come from the same frame so that they stay aligned.
    datas = df.drop(0)
    labels = datas["PATHOLOGY"]
    print(labels)

    splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_split, random_state=random_state)
    train_index, test_index = next(splitter.split(datas, labels))

    # The splitter returns positions relative to `datas`; indexing the
    # un-dropped `df` here would shift every selected row by one and could
    # bring the discarded first row back into the output.
    train_df = datas.iloc[train_index]
    test_df = datas.iloc[test_index]

    train_op = os.path.join(output_path, "train.csv")
    test_op = os.path.join(output_path, "test.csv")
    train_df.to_csv(train_op)
    test_df.to_csv(test_op)
    return train_op, test_op


In [None]:
# Directory that receives train.csv and test.csv.
op_data_dir = os.path.join(data_dir, "data")
# split into train and test
# The third positional argument (True) is use_headers; train_split is passed on
# to the splitter as train_size.
train_file, test_file = split_data(data_csv, op_data_dir,True,train_split=0.9)

# Output directory for parsed files (a later cell rebinds this name to a
# relative path).
parsed_data_dir = os.path.join(op_data_dir, "parsed")

In [None]:
import pandas as pd
import numpy as np

In [10]:
#parse the train set
RACE_CODE = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
def parse_data_nlice_adv(
        filepath,
        conditions_db_json,
        symptoms_db_json,
        output_path,
        body_parts_json,
        excitation_enc_json,
        frequency_enc_json,
        nature_enc_json,
        vas_enc_json,
        onset_json,
        duration_json
        ):
    """Encode a symptoms CSV into the sparse NLICE format and save it.

    Reads ``filepath``, maps PATHOLOGY, RACE and GENDER to integer codes,
    renames AGE_BEGIN to AGE, rewrites every SYMPTOMS cell with
    ``transform_symptoms_nlice_adv`` and writes the columns
    LABEL, GENDER, RACE, AGE, SYMPTOMS to ``<output_path>/<file name>_sparse.csv``.

    :param filepath: CSV file to parse ("/"-separated path).
    :param conditions_db_json: JSON file mapping pathology name to label.
    :param symptoms_db_json: JSON file mapping symptom name to index.
    :param output_path: output directory; created if missing.
    :param body_parts_json: JSON file with the location encoding.
    :param excitation_enc_json: JSON file with the excitation encoding.
    :param frequency_enc_json: JSON file with the frequency encoding.
    :param nature_enc_json: JSON file with the nature encoding.
    :param vas_enc_json: JSON file with the intensity encoding.
    :param onset_json: JSON file with the onset encoding.
    :param duration_json: JSON file with the duration encoding.
    :return: path of the CSV file that was written.
    """
    def _read_json(json_path):
        # Load one JSON document from disk.
        with open(json_path) as handle:
            return json.load(handle)

    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

    symptoms_db = _read_json(symptoms_db_json)
    conditions_db = _read_json(conditions_db_json)
    # Keys are the keyword names expected by transform_symptoms_nlice_adv.
    encoders = {
        "body_parts": _read_json(body_parts_json),
        "excitation_enc": _read_json(excitation_enc_json),
        "frequency_enc": _read_json(frequency_enc_json),
        "nature_enc": _read_json(nature_enc_json),
        "vas_enc": _read_json(vas_enc_json),
        "onset_enc": _read_json(onset_json),
        "duration_enc": _read_json(duration_json),
    }

    frame = pd.read_csv(
        filepath,
        usecols=['GENDER', 'RACE', 'AGE_BEGIN', 'PATHOLOGY', 'NUM_SYMPTOMS', 'SYMPTOMS'],
    )
    source_name = filepath.rsplit("/", 1)[-1]

    # Rows without symptoms are not filtered out here.
    frame['LABEL'] = frame.PATHOLOGY.apply(lambda pathology: conditions_db.get(pathology))
    frame['RACE'] = frame.RACE.apply(lambda race: RACE_CODE.get(race))
    # 'F' becomes 0, every other value becomes 1.
    frame['GENDER'] = frame.GENDER.apply(lambda g: int(g != 'F'))
    frame = frame.rename(columns={'AGE_BEGIN': 'AGE'})
    frame['SYMPTOMS'] = frame.SYMPTOMS.apply(
        transform_symptoms_nlice_adv,
        symptom_db=symptoms_db,
        **encoders
    )

    frame = frame[['LABEL', 'GENDER', 'RACE', 'AGE', 'SYMPTOMS']]
    frame.index.name = "Index"

    output_file = os.path.join(output_path, "%s_sparse.csv" % source_name)
    frame.to_csv(output_file)
    return output_file

def transform_symptoms_nlice_adv(
        symptom_str,
        symptom_db,
        body_parts,
        excitation_enc,
        frequency_enc,
        nature_enc,
        vas_enc,
        onset_enc,
        duration_enc
):
    """Convert one raw SYMPTOMS cell into its sparse ``index|value`` encoding.

    ``symptom_str`` holds symptom definitions separated by ";". Each definition
    has nine ":"-separated fields:
    ``symptom:nature:location:intensity:duration:onset:excitation:frequency:<ignored>``.
    Definitions with a different number of fields are skipped.

    Every symptom owns eight consecutive slots starting at
    ``symptom_db[symptom] * 8``: presence (+0, always 1), nature (+1),
    location (+2), intensity (+3), duration (+4), onset (+5), excitation (+6)
    and frequency (+7). Duration and onset are emitted only when non-zero.

    Blank fields take a default (1, or 0 for duration/onset); "other" is also
    the default for nature and location. A value missing from its encoding map
    falls back to the same default instead of being written as ``None``.

    :param symptom_str: raw cell value; a non-string value (e.g. the NaN pandas
        produces for an empty cell) or an empty string yields ``""``.
    :param symptom_db: mapping symptom name -> integer index.
    :param body_parts: location encoding map.
    :param excitation_enc: excitation encoding map.
    :param frequency_enc: frequency encoding map.
    :param nature_enc: nature encoding map.
    :param vas_enc: intensity encoding map.
    :param onset_enc: onset encoding map.
    :param duration_enc: duration encoding map.
    :return: ";"-joined string of ``index|value`` pairs.
    :raises KeyError: if a symptom name is not in ``symptom_db``.
    """
    # An empty CSV cell arrives as float NaN, which has no .split().
    if not isinstance(symptom_str, str) or not symptom_str:
        return ""

    def _encode(raw, encoding, default, default_for=("",)):
        # Look up `raw`, using `default` for blank, listed or unknown values.
        if raw in default_for:
            return default
        value = encoding.get(raw)
        return default if value is None else value

    transformed_symptoms = []
    for _symp_def in symptom_str.split(";"):
        fields = _symp_def.split(":")
        if len(fields) != 9:
            continue
        (_symptom, _nature, _location, _intensity,
         _duration, _onset, _excitation, _frequency, _) = fields

        _symptom_idx = symptom_db[_symptom] * 8

        _nature_val = _encode(_nature, nature_enc, 1, ("", "other"))
        _location_val = _encode(_location, body_parts, 1, ("", "other"))
        _intensity_val = _encode(_intensity, vas_enc, 1)
        _duration_val = _encode(_duration, duration_enc, 0)
        _onset_val = _encode(_onset, onset_enc, 0)
        _excitation_val = _encode(_excitation, excitation_enc, 1)
        _frequency_val = _encode(_frequency, frequency_enc, 1)

        # The always-present slots come first; the optional duration and onset
        # slots are appended afterwards, as in the existing output format.
        pairs = [
            (_symptom_idx, 1),
            (_symptom_idx + 1, _nature_val),
            (_symptom_idx + 2, _location_val),
            (_symptom_idx + 3, _intensity_val),
            (_symptom_idx + 6, _excitation_val),
            (_symptom_idx + 7, _frequency_val),
        ]
        if _duration_val != 0:
            pairs.append((_symptom_idx + 4, _duration_val))
        if _onset_val != 0:
            pairs.append((_symptom_idx + 5, _onset_val))

        transformed_symptoms.append(
            ";".join("|".join([str(idx), str(val)]) for idx, val in pairs)
        )

    return ";".join(transformed_symptoms)

In [13]:
# Parse the cleaned test set with the NLICE encoders.
parsed_train = "./data/basic/data/parsed/train_csv_sparse.csv"
symptom_map_file = "./data/basic/symptoms_db.json"
condition_map_file = "./data/basic/conditions_db.json"
dataset_train = "./data_preprossing/output/train_cleaned.csv"
dataset_test = "./data_preprossing/output/test_cleaned.csv"
symptoms_db_file = "./data/basic/symptoms_db.json"
body_parts_file = "./data/basic/body-parts-enc.json"
excitation_enc_file = "./data/basic/excitation_encoding.json"
frequency_enc_file = "./data/basic/frequency_encoding.json"
nature_enc_file = "./data/basic/nature_encoding.json"
vas_enc_file = "./data/basic/vas_encoding.json"
onset_enc_file = "./data/basic/onset_encoding.json"
duration_enc_file = "./data/basic/duration_encoding.json"
parsed_data_dir = "./data/basic/data/parsed"
# The input is dataset_test, so the returned path is bound to parsed_test;
# it was previously stored under the misleading name parsed_train.
parsed_test = parse_data_nlice_adv(
    dataset_test,
    condition_map_file,
    symptoms_db_file,
    parsed_data_dir,
    body_parts_file,
    excitation_enc_file,
    frequency_enc_file,
    nature_enc_file,
    vas_enc_file,
    onset_enc_file,
    duration_enc_file
)

In [12]:
RACE_CODE = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
def _symptom_transform(val, labels, is_nlice=False):
    """
    Val is a string in the form: "symptom_0;symptom_1;...;symptom_n"
    :param val:
    :param labels:
    :return:
    """
    parts = val.split(";")
    if is_nlice:
        indices = []
        for item in parts:
            id, enc = item.split("|")
            label = labels.get(id)
            indices.append("|".join([label, enc]))
        res = ",".join(indices)
    else:
        indices = []
        for item in parts:
            symptom,_,_,_,_,_,_,_,_ = item.split(":")
            id = labels.get(symptom)
            if _ is None:
                raise ValueError("Unknown symptom")
            indices.append(id)
        res = ",".join(indices)
    return res


def parse_data(
        filepath,
        conditions_db_json,
        symptoms_db_json,
        output_path,
        is_nlice=False,
        transform_map=None,
        encode_map=None,
        reduce_map=None):
    """Encode a symptoms CSV into the basic sparse format and save it.

    Condition labels and symptom labels are assigned by enumerating the sorted
    keys of the two JSON databases. Rows with ``NUM_SYMPTOMS`` of 0 or less are
    dropped, PATHOLOGY, RACE and GENDER are mapped to integer codes, AGE_BEGIN
    is renamed to AGE and every SYMPTOMS cell is rewritten with
    ``_symptom_transform``. The columns LABEL, GENDER, RACE, AGE, SYMPTOMS are
    written to ``<output_path>/<file name>_sparse.csv``.

    :param filepath: CSV file to parse.
    :param conditions_db_json: JSON file whose keys are the condition codes.
    :param symptoms_db_json: JSON file whose keys are the symptom codes.
    :param output_path: output directory; created if missing.
    :param is_nlice: forwarded to ``_symptom_transform``.
    :param transform_map: currently unused.
    :param encode_map: currently unused.
    :param reduce_map: currently unused.
    :return: path of the CSV file that was written.
    """
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
    with open(symptoms_db_json) as fp:
        symptoms_db = json.load(fp)

    with open(conditions_db_json) as fp:
        conditions_db = json.load(fp)

    condition_labels = {code: idx for idx, code in enumerate(sorted(conditions_db.keys()))}
    symptom_map = {code: str(idx) for idx, code in enumerate(sorted(symptoms_db.keys()))}

    usecols = ['GENDER', 'RACE', 'AGE_BEGIN', 'PATHOLOGY', 'NUM_SYMPTOMS', 'SYMPTOMS']

    df = pd.read_csv(filepath, usecols=usecols)

    filename = os.path.basename(filepath)

    # drop the guys that have no symptoms
    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of the original (avoids SettingWithCopyWarning).
    df = df[df.NUM_SYMPTOMS > 0].copy()
    df['LABEL'] = df.PATHOLOGY.apply(lambda v: condition_labels.get(v))
    df['RACE'] = df.RACE.apply(lambda v: RACE_CODE.get(v))
    # 'F' becomes 0, every other value becomes 1.
    df['GENDER'] = df.GENDER.apply(lambda gender: 0 if gender == 'F' else 1)
    df = df.rename(columns={'AGE_BEGIN': 'AGE'})
    df['SYMPTOMS'] = df.SYMPTOMS.apply(_symptom_transform, labels=symptom_map, is_nlice=is_nlice)
    ordered_keys = ['LABEL', 'GENDER', 'RACE', 'AGE', 'SYMPTOMS']
    df = df[ordered_keys]
    df.index.name = "Index"

    output_file = os.path.join(output_path, "%s_sparse.csv" % filename)
    df.to_csv(output_file)

    return output_file

In [13]:
# Parse the cleaned training set with the basic encoder.
dataset_train = "./data_preprossing/output/train_cleaned.csv"
dataset_test = "./data_preprossing/output/test_cleaned.csv"
parsed_data_dir = "./data/basic/data/parsed"
# The input is dataset_train, so the returned path is bound to parsed_train;
# it was previously stored under the misleading name parsed_test.
parsed_train = parse_data(
    dataset_train,
    condition_map_file,
    symptom_map_file,
    parsed_data_dir
)

In [3]:
# train with RF and then with NB
# NOTE(review): data_dir is rebound here to the current directory, replacing
# the absolute data_dir defined in an earlier cell.
data_dir = os.curdir
print(data_dir)
# NOTE(review): "symtom_models" looks like a typo for "symptom_models"; it is
# an on-disk path, so confirm before renaming.
op_data_dir = os.path.join(data_dir, "symtom_models")
rf_dir = os.path.join(op_data_dir, "output/rf")
# Random forest hyper-parameters: 200 estimators and max_depth left as None.
rfparams = models.RFParams()
rfparams.n_estimators = 200
rfparams.max_depth = None
# Hard-coded inputs for training; these override values set in earlier cells.
parsed_train = "./data/basic/data/parsed/train_cleaned.csv_sparse.csv"
symptom_map_file = "./data/basic/symptoms_db.json"
condition_map_file = "./data/basic/conditions_db.json"
pathlib.Path(rf_dir).mkdir(parents=True, exist_ok=True)

.


In [14]:
# Train the RF model on the parsed training file with the parameters set in
# the previous cell; the runner's return value is kept in run_ok.
# NOTE(review): the last two string arguments look like a run name and a
# location tag — confirm against runners.train_ai_med_rf.
run_ok = runners.train_ai_med_rf(
    parsed_train,
    symptom_map_file,
    rf_dir,
    rfparams,
    "Basic AI-MED Run",
    "local-pc",
)

In [4]:
# train NB
# nb_dir sits next to the RF output directory under op_data_dir.
nb_dir = os.path.join(op_data_dir, "output/nb")

# Same parsed training file and symptom map as the RF run.
run_ok = runners.train_ai_med_nb(
    parsed_train,
    symptom_map_file,
    nb_dir
)

[24, 29, 49, 38, 5, 3, 43, 48, 41, 37, 12, 31, 9, 42, 35, 28, 8, 14, 36, 4, 40, 19, 20, 21, 39, 53, 10, 25, 0, 44, 2, 17, 45]


In [4]:
# Train the "adv" RF variant.
# NOTE(review): rf_dir and parsed_train have the same values as in the basic RF
# run, so both runs presumably share an output directory and an input file —
# confirm this is intended.
rf_dir = os.path.join(op_data_dir, "output/rf")

run_ok = runners.train_ai_med_adv_rf(
    parsed_train,
    symptom_map_file,
    rf_dir
)

[24, 29, 49, 38, 5, 3, 43, 48, 41, 37, 12, 31, 9, 42, 35, 28, 8, 14, 36, 4, 40, 19, 20, 21, 39, 53, 10, 25, 0, 44, 2, 17, 45]


In [None]:
# Next, evaluate the trained model on unseen data.

In [None]:
import joblib

In [None]:

# Load the saved NB training artefact.
# NOTE(review): nb_dir is built in an earlier cell as ".../output/nb", which
# looks like a directory, while joblib.load expects a file — confirm the path
# of the saved model.
# joblib files are pickle-based: only load files from a trusted source.
nb_data = joblib.load(nb_dir)

In [None]:
# Fetch the entry stored under 'clf' (None if the key is absent); it is used
# as the classifier in the predict_proba call below.
nb_clf = nb_data.get('clf')

In [None]:
# One hand-built input row (a list holding a single list of numbers) used to
# probe the NB classifier below.
# NOTE(review): the meaning of each position is not visible here — confirm the
# feature layout against the training code.
vec=[[1,0,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,126,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

In [None]:
import numpy as np

In [None]:
# Convert the nested list into a NumPy array (rebinds vec).
vec = np.array(vec)

In [None]:
from scipy.sparse import csc_matrix

In [None]:
# Wrap the array in a SciPy CSC sparse matrix (rebinds vec again).
vec = csc_matrix(vec)

In [None]:
# Ask the loaded classifier for class probabilities for the single input row.
res = nb_clf.predict_proba(vec)

In [None]:
# Display the predicted probabilities as the cell output.
res