In [1]:
# Read the patient feature and patient id JSON files

import json
import math
from mlxtend.preprocessing import OnehotTransactions
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
import pandas as pd
import numpy as np

# Load the two JSON inputs used by the rest of the notebook.
# json.load is used rather than a YAML loader; per the original author's note,
# the difference is that json.load yields unicode strings while the YAML
# loader yields plain strings.
PATIENT_FEATURES_PATH = 'Data/Final_Datasets/Total_Patients_Personal_Details_Features_File.txt'
SELECTED_PATIENT_IDS_PATH = 'Data/Final_Datasets/Selected_Patient_Id.txt'

# Per-patient records; later cells index this by patient key.
with open(PATIENT_FEATURES_PATH) as data_file:
    patient_data = json.load(data_file)

# Presumably the subset of patient ids selected for the study -- TODO confirm
# against the producer of this file.
with open(SELECTED_PATIENT_IDS_PATH) as data_file:
    patient_id = json.load(data_file)

# A single parenthesised argument prints identically under Python 2 and 3.
print(len(patient_data))
print("Loading Files Done")

8011
Loading Files Done


In [2]:
# Sanity checks on the loaded dataset.

# Check 1: report every (patient, drug) entry whose "Side Effects" field still
# holds -1, the value this check treats as "no side effect recorded".
# One message is printed per offending entry.
for record in patient_data.values():
    for treatment in record["Treatments List"].values():
        if treatment["Side Effects"] == -1:
            print("You are Screwed")

print("Done with the Test")

# Check 2: count the evaluations. Each drug listed under a patient's
# "Treatments List" is one evaluation, so the total is the summed number of
# drug keys over all patients.
total_eval_count = sum(
    len(record["Treatments List"].keys()) for record in patient_data.values()
)

# Emit label and count on one line, separated by a single space.
print("Total Number of evaluations" + " " + str(total_eval_count))

Done with the Test
Total Number of evaluations 42075


In [3]:
# Build the condition vocabulary and each patient's list of conditions.
#
# train_data (despite its name) is the vocabulary: a dict mapping a prefixed
# condition id to a weight -- 1 when the id was seen as a primary condition
# ("P_" prefix) and 0.5 when it was seen as an other condition ("O_" prefix).
#
# patient_condition_data maps each patient key to the concatenated list of
# that patient's prefixed condition ids (primary first, then the others).
#
# The prefixes are needed because primary and other conditions are numbered
# independently: a primary condition with id 1 and an other condition with
# id 1 must not collide in the vocabulary.
#
# A field value of -1 is treated as "not provided" and contributes nothing.
train_data = {}
patient_condition_data = {}

for patient, record in patient_data.items():
    condition_list = []

    primary = record["Primary Condition"]
    if primary != -1:
        primary_key = "P_" + str(primary)
        condition_list.append(primary_key)
        train_data[primary_key] = 1             # weight 1 for a primary condition

    others = record["Other conditions"]
    if others != -1:
        for item in others:
            other_key = "O_" + str(item)
            condition_list.append(other_key)
            train_data[other_key] = 0.5         # weight 0.5 for an other condition

    patient_condition_data[patient] = condition_list

# Persist both structures. The with-blocks guarantee the handles are closed
# even if serialisation or the write raises part-way through.
with open("Data/Final_Datasets/Condition_Features/Patient_Condition_Data.txt", "w") as f:
    f.write(json.dumps(patient_condition_data, indent=3, sort_keys=True))

with open("Data/Final_Datasets/Condition_Features/Conditions_Vocaulary.txt", "w") as f:
    f.write(json.dumps(train_data, indent=3, sort_keys=True))

# Number of patients, then vocabulary size.
print(len(patient_condition_data))
print(len(train_data))

8011
2109


In [4]:
# Turn each patient's condition list into a fixed-length vector.
#
# The vector has one component per vocabulary entry, i.e. len(train_data).
# A component holds the vocabulary weight (1 for a primary "P_" condition,
# 0.5 for an other "O_" condition) when the patient lists that condition,
# and 0 otherwise.
#
# NOTE(review): component order is the iteration order of train_data, which
# is not necessarily the sorted order used when the vocabulary is written to
# disk with sort_keys=True. Anything that maps vector positions back to
# condition ids must use the same in-memory order.

# Snapshot the vocabulary order once so every patient's vector uses the same
# layout and the key list is not rebuilt for each patient.
condition_vocabulary = list(train_data.keys())

vectorised_data = {}
for patient, conditions in patient_condition_data.items():
    # A set gives constant-time membership tests across the whole vocabulary.
    listed = set(conditions)
    vectorised_data[patient] = [
        train_data[condition] if condition in listed else 0
        for condition in condition_vocabulary
    ]

# The with-block closes the handle even if serialisation or the write fails.
with open("Data/Final_Datasets/Condition_Features/Vectorised_Patient_Condition.txt", "w") as f:
    f.write(json.dumps(vectorised_data, indent=3, sort_keys=True))