# Step 3: Initial Classifiers and Evaluation Results

#### Implement the data cleaning operations and feature selection tasks necessary for developing your classifiers

* Load data tables:

In [1]:
from data_table import *
from data_learn import *
from data_eval import *
from data_util import *

In [2]:
inpatient_charges_table = DataTable(["DRG Definition","Provider Id","Hospital Name","Address","City",
                                    "State","ZIP Code","Hospital Referral Region Description", "Total Discharges" , 
                                    "Average Covered Charges" , "Average Total Payments ","Average Medicare Payments"])

hospital_info_table = DataTable(["Provider ID","Hospital Name","Address","City","State","ZIP Code","County Name",
                                "Phone Number","Hospital Type","Hospital Ownership","Emergency Services",
                                "Meets criteria for meaningful use of EHRs","Hospital overall rating","Hospital overall rating footnote",
                                "Mortality national comparison","Mortality national comparison footnote",
                                "Safety of care national comparison","Safety of care national comparison footnote",
                                "Readmission national comparison","Readmission national comparison footnote",
                                "Patient experience national comparison","Patient experience national comparison footnote",
                                "Effectiveness of care national comparison","Effectiveness of care national comparison footnote",
                                "Timeliness of care national comparison","Timeliness of care national comparison footnote",
                                "Efficient use of medical imaging national comparison","Efficient use of medical imaging national comparison footnote"])                               
inpatient_charges_table.load("inpatientCharges.csv")
hospital_info_table.load("Hospital General Information.csv")

* Combine Tables:

In [3]:
combined_table = DataTable.combine(inpatient_charges_table, hospital_info_table, ["Hospital Name", "Address", "City", "State", "ZIP Code"])
del combined_table[0]
print(combined_table.row_count())

106683


* Drop unnecessary atributes

In [4]:
print(combined_table.columns())

['DRG Definition', 'Provider Id', 'Hospital Name', 'Address', 'City', 'State', 'ZIP Code', 'Hospital Referral Region Description', 'Total Discharges', 'Average Covered Charges', 'Average Total Payments ', 'Average Medicare Payments', 'Provider ID', 'County Name', 'Phone Number', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 'Meets criteria for meaningful use of EHRs', 'Hospital overall rating', 'Hospital overall rating footnote', 'Mortality national comparison', 'Mortality national comparison footnote', 'Safety of care national comparison', 'Safety of care national comparison footnote', 'Readmission national comparison', 'Readmission national comparison footnote', 'Patient experience national comparison', 'Patient experience national comparison footnote', 'Effectiveness of care national comparison', 'Effectiveness of care national comparison footnote', 'Timeliness of care national comparison', 'Timeliness of care national comparison footnote', 'Efficient use of medical i

In [5]:
combined_table.drop(['Hospital Referral Region Description', 'Total Discharges', 'Average Covered Charges','Average Medicare Payments',
                     'Provider Id', 'County Name', 'Phone Number', 'Hospital Type', 'Hospital Ownership', 'Emergency Services', 
                     'Meets criteria for meaningful use of EHRs','Hospital overall rating footnote', 'Mortality national comparison', 
                     'Mortality national comparison footnote', 'Safety of care national comparison', 'Safety of care national comparison footnote', 
                     'Readmission national comparison', 'Readmission national comparison footnote', 'Patient experience national comparison', 
                     'Patient experience national comparison footnote', 'Effectiveness of care national comparison', 
                     'Effectiveness of care national comparison footnote', 'Timeliness of care national comparison', 'Timeliness of care national comparison footnote', 
                     'Efficient use of medical imaging national comparison', 'Efficient use of medical imaging national comparison footnote', 'ZIP Code',
                    "Address", 'City', "Provider ID"])

In [6]:
print(combined_table.columns())

['DRG Definition', 'Hospital Name', 'State', 'Average Total Payments ', 'Hospital overall rating']


* Remove Rows with Missing values in the overall rating section

In [7]:
print(combined_table.row_count())

106683


In [8]:
for row in range(combined_table.row_count()):
    if combined_table[row]['Hospital overall rating'] == 'Not Available':
        combined_table[row]['Hospital overall rating'] = ''
cleaned_table = remove_missing(combined_table, ['Hospital overall rating'])

In [9]:
print(cleaned_table.row_count())

105044


* Replace missing values in the average total payment section

In [10]:
for row in range(cleaned_table.row_count()):
    temp = cleaned_table[row]['Average Total Payments ']
    temp = temp.replace('$', '')
    num = cleaned_table.convert_numeric(temp)
    cleaned_table[row]['Average Total Payments '] = num

In [11]:
avg = lambda xs : None if not len(xs) else sum(xs) / len(xs)
cleaned_table = replace_missing(cleaned_table, 'Average Total Payments ', ['State'], avg)

* Combine Hospital Names and cost to calculate average cost

In [12]:
print(len(distinct_values(cleaned_table, 'Hospital Name')))

1919


In [13]:
# hospital_table = DataTable(cleaned_table.columns())
# # key = name, element = total cost
# dict_names = {}
# # key = name, element = num instances
# count = {}
# # key = name, element = row num
# row_dict = {}
# for row in range(cleaned_table.row_count()):
#     name = cleaned_table[row]["Hospital Name"]
#     if name in dict_names:
#         dict_names[name] += cleaned_table[row]["Average Total Payments "]
#         count[name] += 1
#     else:
#         dict_names[name] = cleaned_table[row]["Average Total Payments "]
#         count[name] = 1
#         row_dict[name] = row
# keys_list = list(dict_names.keys())
# for key in keys_list:
#     avg = dict_names[key] / count[key]
#     temp = cleaned_table[row_dict[key]]
#     temp["Average Total Payments "] = avg
#     hospital_table.append(temp.values())

In [14]:
cleaned_table.drop(['DRG Definition'])

* Normalize Average Total Payments

In [15]:
normalize(cleaned_table, "Average Total Payments ")

* Add row that classifies the state the hospital is into a region

In [16]:
state_to_region_dict = {
    'WA': 'West', 'OR': 'West', 'CA': 'West', 'NV': 'West',
    'ID': 'West', 'MT': 'West', 'WY': 'West', 'UT': 'West',
    'CO': 'West', 'AK': 'West', 'HI': 'West', 'ME': 'Northeast',
    'VT': 'Northeast', 'NY': 'Northeast', 'NH': 'Northeast',
    'MA': 'Northeast', 'RI': 'Northeast', 'CT': 'Northeast',
    'NJ': 'Northeast', 'PA': 'Northeast', 'ND': 'Midwest',
    'SD': 'Midwest', 'NE': 'Midwest', 'KS': 'Midwest',
    'MN': 'Midwest', 'IA': 'Midwest', 'MO': 'Midwest', 'WI': 'Midwest',
    'IL': 'Midwest', 'MI': 'Midwest', 'IN': 'Midwest', 'OH': 'Midwest',
    'WV': 'South', 'DC': 'South', 'MD': 'South',
    'VA': 'South', 'KY': 'South', 'TN': 'South', 'NC': 'South',
    'MS': 'South', 'AR': 'South', 'LA': 'South', 'AL': 'South',
    'GA': 'South', 'SC': 'South', 'FL': 'South', 'DE': 'South',
    'AZ': 'Southwest', 'NM': 'Southwest', 'OK': 'Southwest',
    'TX': 'Southwest'}
temp = cleaned_table.columns()
temp.append('Region')
new_table = DataTable(temp)
for row in range(cleaned_table.row_count()):
    state = cleaned_table[row]['State']
    region = state_to_region_dict[state]
    row = cleaned_table[row].values()
    row.append(region)
    new_table.append(row)

* Change Hospital Name to Numbers

In [17]:
# key = name, element = number
hosp_name_dict = {}
for row in range(new_table.row_count()):
    name = new_table[row]["Hospital Name"]
    if name in hosp_name_dict:
        new_table[row]["Hospital Name"] = hosp_name_dict[name]
    else:
        hosp_name_dict[name] = row
        new_table[row]["Hospital Name"] = hosp_name_dict[name]

* Drop State column

In [18]:
new_table.drop(['State'])

In [19]:
print(new_table)

  Hospital Name    Average Total Payments     Hospital overall rating  Region
---------------  -------------------------  -------------------------  ---------
              0                0.0225611                            3  South
              0                0.0161431                            3  South
              0                0.0551453                            3  South
              0                0.0281214                            3  South
              0                0.0139771                            3  South
              0                0.0106191                            3  South
              0                0.0160123                            3  South
              0                0.0144932                            3  South
              0                0.0243273                            3  South
              0                0.0651321                            3  South
              0                0.0377956                            3  

#### Implement two basic classifiers for your datasets: k-nn and naive bayes.

* KNN

First using holdout to create test and train sets then the 5 nearest neighbors with majority vote:

In [20]:
size = 500
sets = holdout(new_table, size)

In [21]:
# train
train = sets[0]
# test
test = sets[1]

confusion_matrix = knn_eval(train, test, majority_vote, 5, "Region", ['Average Total Payments '], ['Hospital overall rating', "Hospital Name"])

In [22]:
print(confusion_matrix)

actual       South    West    Southwest    Northeast    Midwest
---------  -------  ------  -----------  -----------  ---------
South          134       0            0            0          2
West             2      75            0            3          0
Southwest        0       0           52            0          0
Northeast        0       0            0          125          0
Midwest          0       2            0            1        104


In [23]:
size = (new_table.row_count() * .8)
sets = holdout(new_table, int(size))

KeyboardInterrupt: 

In [None]:
# train
train = sets[0]

# test
test = sets[1]

naive_confusion = naive_bayes_eval(train, test, 'Region', ['Average Total Payments '], ['Hospital overall rating', "Region", "Hospital Name"])

In [None]:
print(naive_confusion)

actual       South    West    Southwest    Northeast    Midwest
---------  -------  ------  -----------  -----------  ---------
South          403      24            0           14          0
West           131     121            0           33          0
Southwest      168      24            0            9          0
Northeast      164      76            0           25          0
Midwest        289      37            0           17          0
