In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pytz
from pytz import common_timezones, all_timezones
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
from datetime import datetime
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
matplotlib.style.use('fivethirtyeight')
matplotlib.style.use('seaborn-talk')
import os
import collections

# Random Forest model for OCM historical episodes

This notebook shows how to train and evaluate the performance of a random forest model for predicting episode prices using OCM CMMI historical claims data.

## <font color="steelblue">Base Episodes</font>

We can illustrate the procedure to train and evaluate an RF model with these 17,005 base episodes:

* site1: 2,588
* site2: 2,325
* site3: 3,055
* site4: 516
* site5: 1,428
* site6: 2,144
* site7: 4,949

## <font color="steelblue">OCM Prediction Model Covariates:</font>

* Age/Sex
* Cancer type
* Chemotherapy drugs taken/administered during the episode (breast cancer only)
* Receipt of cancer-related surgery
* Part D eligibility and dual eligibility for Medicare and Medicaid
* Receipt of radiation therapy
* Receipt of bone marrow transplant
* Clinical trial participation
* Comorbidities
* History of prior chemotherapy use
* Institutional status
* Episode length
* Geographic location/Hospital Referral Region (use lat/lng pairs derived from zipcode)

## Add Episode begin date, Episode end date, and DOB


In [2]:
# Change to the directory where the historical episode files are housed.
# os.chdir() returns None, so its result is not kept. Note that this changes
# the working directory for the whole kernel, so every relative path in later
# cells resolves against this share.
preDouble = "\\\\iobsdc01\\SharedDocs\\Development\\Clients\\2012 - 2015"
os.chdir(preDouble)


def _read_base_episodes(filename, practice, sep="|"):
    """Read one practice's base-episode extract and label its rows.

    Parameters
    ----------
    filename : str
        Name of the delimited text file, relative to the working directory.
    practice : str
        Label stored in the new "Practice" column for every row.
    sep : str, default "|"
        Field delimiter used by the file.

    Returns
    -------
    pandas.DataFrame
        The file contents plus a constant "Practice" column.
    """
    episodes = pd.read_table(filename, sep=sep)
    episodes["Practice"] = practice
    return episodes


# Read in the files. Every extract is pipe-delimited except the IHA one,
# which is tab-delimited.
ucs = _read_base_episodes("OCM_143_50264_episodes_base_20160804.txt", "UCS")
oha = _read_base_episodes("OCM_414_50280_episodes_base_20160812.txt", "OHA")
hfci = _read_base_episodes("OCM_431_50233_episodes_base_20160902.txt", "HFCI")
iha = _read_base_episodes("OCM_057_50173_episodes_base_20160722.txt", "IHA", sep="\t")
nmcc = _read_base_episodes("OCM_108_50279_episodes_base_20160812.txt", "NMCC")
dpn = _read_base_episodes("OCM_024_50116_episodes_base_20160916.txt", "DPN")
somc = _read_base_episodes("OCM_514_50286_episodes_base_20160715.txt", "SOMC")

# Combine all the data into a single dataframe. pd.concat already returns a
# new object, so no extra copy is needed. The per-file row labels are kept
# as-is, which means index values repeat across practices.
df = pd.concat([ucs, oha, hfci, iha, nmcc, dpn, somc])

In [3]:
# Show the column labels of the combined episode dataframe.
print(df.columns)

Index(['BENE_ID', 'BENE_HICN', 'FIRST_NAME', 'LAST_NAME', 'SEX', 'DOB', 'AGE',
       'DOD', 'ZIPCODE', 'EP_ID', 'EP_BEG', 'EP_END', 'EP_LENGTH',
       'CANCER_TYPE', 'RECON_ELIG', 'DUAL_PTD_LIS', 'INST', 'RADIATION',
       'HCC_GRP', 'HRR_REL_COST', 'SURGERY', 'CLINICAL_TRIAL', 'BMT',
       'CLEAN_PD', 'PTD_CHEMO', 'ACTUAL_EXP', 'BASELINE_PRICE',
       'EXPERIENCE_ADJ', 'Practice'],
      dtype='object')


# <font color="steelblue">First turn the zipcodes into (lat/lng/elev) triples</font>

In [4]:
# Geocoding dependencies for this section.
import geocoder
from geopy.distance import great_circle, vincenty
from geopy.geocoders import Nominatim

# Render every zipcode with the "{:05}" format spec (minimum width five,
# zero-filled). NOTE(review): this assumes ZIPCODE was parsed as an integer
# column, e.g. 2134 -> "02134" -- confirm the dtype after the concat.
df['goodzip'] = df['ZIPCODE'].map("{:05}".format)

# Module-level Nominatim client; make_zip_dicts looks it up by this name.
geolocator = Nominatim()
def make_zip_dicts(t):
    """Geocode each zipcode in ``t`` and return three lookup dictionaries.

    For every entry the module-level ``geolocator`` is asked for a location,
    and ``geocoder.elevation`` is then queried with that location's
    (latitude, longitude) pair.

    Parameters
    ----------
    t : iterable
        Zipcodes to look up; each one becomes a key in all three results.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``d_zip_lat``, ``d_zip_lng`` and ``d_zip_elev``, mapping each zipcode
        to its latitude, longitude and elevation. If the lookup finds no
        match or raises an ordinary exception, all three values for that
        zipcode are ``np.nan``.

    Notes
    -----
    Only ``Exception`` subclasses are treated as a failed lookup.
    ``KeyboardInterrupt`` and ``SystemExit`` propagate, so a long-running
    geocoding loop can still be interrupted.
    """
    d_zip_lat = {}
    d_zip_lng = {}
    d_zip_elev = {}
    for z in t:
        # Start from "unknown" so a failure never leaves a partial entry.
        lat = lng = elev = np.nan
        try:
            location = geolocator.geocode(z)
            # geocode() yields None when nothing matches the query.
            if location is not None:
                g = geocoder.elevation((location.latitude, location.longitude))
                lat, lng, elev = location.latitude, location.longitude, g.elevation
        except Exception:
            # Best effort: any lookup error marks this zipcode as unknown.
            lat = lng = elev = np.nan
        d_zip_lat[z] = lat
        d_zip_lng[z] = lng
        d_zip_elev[z] = elev
    return d_zip_lat, d_zip_lng, d_zip_elev

# Geocode each distinct zipcode once. This issues one lookup per unique
# zipcode, so the cell can take a long time to run.
d_zip_lat, d_zip_lng, d_zip_elev = make_zip_dicts(df['goodzip'].unique().tolist())


# Look each row's zipcode up in the dictionaries. Every goodzip value is a key
# (the dictionaries were built from the same column), so Series.map gives a
# direct lookup without the cost of Series.replace on a large mapping.
df['latitude'] = df['goodzip'].map(d_zip_lat)
df['longitude'] = df['goodzip'].map(d_zip_lng)
df['elevation'] = df['goodzip'].map(d_zip_elev)