In [1]:
from IPython.display import HTML

# Emit a small HTML/JavaScript snippet as this cell's output. It defines
# `code_toggle()`, which hides or shows every `div.input` element (the
# notebook's code-input areas) and flips the `code_show` flag, and it renders a
# button that calls `code_toggle()` on click.
# `code_show` starts as true and `code_toggle` is registered to run once when
# the document is ready, so the inputs are hidden as soon as the page loads.
# NOTE(review): the script uses `$`, so it presumably relies on jQuery being
# available on the page that renders the notebook — confirm for the target viewer.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')  

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pytz
from pytz import common_timezones, all_timezones
import matplotlib
# NOTE(review): three style sheets are applied in this cell ('ggplot' here,
# then 'fivethirtyeight' and 'seaborn-talk' below). Presumably each later call
# overrides the overlapping settings of the earlier ones — confirm which final
# look is intended and whether all three are needed.
matplotlib.style.use('ggplot')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
from datetime import datetime
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
matplotlib.style.use('fivethirtyeight')
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-talk'); 'seaborn-talk' may not resolve there — verify
# against the installed matplotlib version.
matplotlib.style.use('seaborn-talk')

This IPython notebook illustrates the matching methodology we will use to match patients in the SEER-Medicare linked database. Comparing patient outcomes between those who receive proton beam therapy and those who receive traditional radiation therapy requires, as a first step, a matching of the two groups. One method to perform this matching is Propensity Score Matching. Here we illustrate a more direct method of matching using KD-trees, a data structure that allows for efficient nearest-neighbor search in high-dimensional space.

We will first identify all patients with left-sided breast cancer, using the following primary site codes:
(C500-C506, C508-C509), and consider only left-breast cases: LATERALITY == "Left: origin of primary" (coded as 2).
Then we will split the above patient population into two groups as follows: the treatment group, defined as the patients receiving any form of radiation, and the control group, defined as the patients who did not receive any form of radiation. Operationally, these groups are defined by looking at the column RADIATN, which has the following values:


| Code	| Description |
|:---------:|:--------------:|
|0	| None; diagnosed at autopsy |
|1	|Beam radiation |
|2	|Radioactive implants|
|3	|Radioisotopes|
|4	|Combination of 1 with 2 or 3|
|5	|Radiation, NOS - method or source not specified|
|6	|Other radiation (1973-1987 cases only)|
|7	|Patient or patient's guardian refused radiation therapy|
|8	|Radiation recommended, unknown if administered|
|9	|Unknown if radiation administered|


Thus a value of 0 defines the control group, and the values (1,2,3,4,5,6) define the treatment group. We will then perform matching between the resulting treatment and control groups.

In [3]:
# Project-local helper module; note it is imported here rather than in the
# import cell at the top of the notebook.
import seerhelp
# Collect the SEER breast-cancer incidence file paths (the recorded output
# below lists four BREAST.TXT paths).
breastfiles = seerhelp.get_breastfiles()
# Bare last expression so the notebook displays the value.
breastfiles

['SEER_1973_2013_TEXTDATA\\incidence\\yr1973_2013.seer9\\BREAST.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr1992_2013.sj_la_rg_ak\\BREAST.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr2000_2013.ca_ky_lo_nj_ga\\BREAST.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr2005.lo_2nd_half\\BREAST.txt']

In [4]:
# Load every SEER breast incidence file and stack the results into a single
# DataFrame.
#
# `seerhelp` and `breastfiles` are defined by the previous cell, so they are
# not re-imported / re-fetched here. Iterating over `breastfiles` (instead of
# indexing entries 0..3 by hand) loads however many files the helper returns,
# and the per-file frames only exist as temporaries inside the list, so there
# are no intermediate names left to `del` afterwards.
dfbreast = pd.concat(
    [seerhelp.make_clean_dataframe(path) for path in breastfiles],
    ignore_index=True,  # drop the per-file indexes; rows are renumbered 0..n-1
)

# Quick sanity check on the combined size (rows, columns).
print(dfbreast.shape)

(1448798, 141)


In [5]:
# Tally the number of records per laterality label.
dfbreast['LATERAL'].value_counts()

Left origin of primary                                                        730569
Right origin of primary                                                       702454
Paired site, but no information concerning laterality; midline tumor           12729
Only one side involved, right or left origin unspecified                        1667
Bilateral involvement, lateral origin unknown; stated to be single primary      1379
Name: LATERAL, dtype: int64

In [7]:
# Boolean Series marking the rows whose LATERAL label contains the phrase
# below (case-insensitive); rows with a missing label are marked False.
leftfilter = dfbreast['LATERAL'].str.contains(
    'left origin of primary',
    case=False,
    na=False,
)
# Count of marked rows (True sums as 1).
leftfilter.sum()

730569

In [8]:
# Cohort selection: keep only the rows that have a usable tumour size, known
# age / birth year, complete survival dates, a diagnosis year of at least
# MIN_YEAR_DX, REC_NO == 1, and that are flagged by `leftfilter`.

# Earliest diagnosis year kept in the cohort.
MIN_YEAR_DX = 2004

# CSTUMSIZ values that do not carry a usable tumour size. The column is compared
# against both text labels and the bare codes 996-998, exactly as before; each
# value is listed once (the original chain tested "Not applicable" three times).
# NOTE(review): 'msas' in the label below looks like a typo for 'mass'. The
# string is kept byte-for-byte because it must match whatever label the data
# actually contains — verify it against the values present in CSTUMSIZ.
UNUSABLE_CSTUMSIZ = [
    "Unknown; size not stated; not stated in patient record",
    "Microscopic focus or foci only; no size of focus is given",
    "989 millimeters or larger",
    "Not applicable",
    "Described as less than 1 cm",
    "Described as less than 2 cm",
    "Described as less than 3 cm",
    "Described as less than 4 cm",
    "Described as less than 5 cm",
    'Indicates no msas or no tumor found; for example, when a tumor of a stated primary site is not found, but the tumor has metastasized',
    996,
    997,
    998,
]

mask = (
    # `~isin` is equivalent to chaining `!=` for every listed value
    # (missing values are kept in both formulations).
    ~dfbreast['CSTUMSIZ'].isin(UNUSABLE_CSTUMSIZ)
    & (dfbreast['AGE_DX'] != "Unknown age")
    & (dfbreast['YR_BRTH'] != 'Unknown year of birth')
    & (dfbreast['srv_time_mon_flag'] == "Complete dates are available and there are more than 0 days of survival")
    & (dfbreast['YEAR_DX'] >= MIN_YEAR_DX)
    & (dfbreast['REC_NO'] == 1)
    # `leftfilter` was built on the unfiltered frame. Re-indexing it to the
    # current rows is a no-op on the first run and keeps this cell re-runnable
    # after `dfbreast` has already been narrowed below.
    & leftfilter.reindex(dfbreast.index, fill_value=False)
)

# Downstream cells read the narrowed frame under the same name.
dfbreast = dfbreast[mask]




In [11]:
# Boolean row filters on the RADIATN label. Each is a case-insensitive
# substring match, and rows with a missing label are marked False.
# 'None': presumably targets the "None; diagnosed at autopsy" category from the
# RADIATN table above — confirm against the actual labels.
non_rad = dfbreast.RADIATN.str.contains('None',case=False,na=False)
# 'refused': presumably targets the "Patient or patient's guardian refused
# radiation therapy" category.
refused_rad = dfbreast.RADIATN.str.contains('refused',case=False,na=False)
# NOTE(review): because the match is case-insensitive, 'Unknown' would match
# both code 8 ("... unknown if administered") and code 9 ("Unknown if radiation
# administered") if the labels follow the table above — confirm that is intended.
unknown_rad = dfbreast.RADIATN.str.contains('Unknown',case=False,na=False)