In [1]:
# Render a button at the top of the notebook that shows/hides the code input cells.
from IPython.display import HTML

# The embedded script defines code_toggle(), which hides every `div.input`
# element while code_show is true and shows them otherwise, then flips the flag.
# It is registered to run once when the document is ready, so with the initial
# code_show=true the inputs start out hidden; the form's submit button calls
# code_toggle() again on each click.
# NOTE(review): the script calls `$`, so it presumably relies on jQuery being
# provided by the notebook front end / exported page — confirm for the viewer in use.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')  

In [2]:
# Imports and plotting configuration used by the rest of the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pytz
from pytz import common_timezones, all_timezones
import matplotlib
# First of three style sheets applied in this cell (two more follow below).
matplotlib.style.use('ggplot')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
from datetime import datetime
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Two further style sheets applied after 'ggplot', in this order.
# NOTE(review): presumably settings from later sheets override overlapping ones
# from earlier sheets, so the order matters — confirm against the matplotlib docs.
matplotlib.style.use('fivethirtyeight')
matplotlib.style.use('seaborn-talk')

This IPython notebook illustrates the matching methodology we will use to match patients in the SEER-Medicare linked database. Comparing patient outcomes between those who receive proton beam therapy and those who receive traditional radiation therapy requires, as a first step, matching the two groups. One way to perform this matching is Propensity Score Matching. Here we illustrate a more direct method of matching using KDTrees, a data structure that allows nearest neighbors to be found efficiently in high-dimensional space.

We will first identify all patients with prostate cancer (using the primary site code C619). Then we will split the above patient population into two groups as follows: The treatment group, defined as the patients receiving any form of radiation; and the control group, defined as the patients who did not receive any form of radiation. Operationally, these groups are defined by looking at the column RADIATN, which has values:

| Code	| Description |
|:---------:|:-------------:|
|0|	None; diagnosed at autopsy |
|1|	Beam radiation |
|2|	Radioactive implants |
|3|	Radioisotopes |
|4|	Combination of 1 with 2 or 3 |
|5|	Radiation, NOS - method or source not specified |
|6|	Other radiation (1973-1987 cases only) |
|7|	Patient or patient's guardian refused radiation therapy |
|8|	Radiation recommended, unknown if administered |
|9|	Unknown if radiation administered |

Thus a value of 0 defines the control group, and the values (1, 2, 3, 4, 5, 6) define the treatment group. We will then perform matching between the resulting treatment and control groups.

In [4]:
# Ask the project helper module for the SEER male-genital (MALEGEN) incidence files.
# NOTE(review): seerhelp is a project-local module not shown here; judging by the
# cell output, get_malegenfiles() returns a list of MALEGEN.TXT paths — confirm.
import seerhelp
malegenfiles = seerhelp.get_malegenfiles()
# Bare last expression so the notebook displays the list of files.
malegenfiles

['SEER_1973_2013_TEXTDATA\\incidence\\yr1973_2013.seer9\\MALEGEN.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr1992_2013.sj_la_rg_ak\\MALEGEN.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr2000_2013.ca_ky_lo_nj_ga\\MALEGEN.TXT',
 'SEER_1973_2013_TEXTDATA\\incidence\\yr2005.lo_2nd_half\\MALEGEN.txt']

In [5]:
import seerhelp

# Fetch the list of MALEGEN incidence files here as well, so this cell can be
# re-run on its own without depending on the listing cell above.
malegenfiles = seerhelp.get_malegenfiles()

# Load every listed file and stack the cleaned frames into a single DataFrame.
# Iterating over `malegenfiles` (instead of indexing entries 0..3 by hand) keeps
# the cell correct if the number of registry files changes, and the per-file
# frames live only inside the list passed to concat, so no manual `del` of
# temporaries is needed afterwards.
dfmalegen = pd.concat(
    [seerhelp.make_clean_dataframe(path) for path in malegenfiles],
    ignore_index=True,  # build a fresh 0..n-1 index across all files
)

# Sanity check: (rows, columns) of the combined male-genital dataset.
print(dfmalegen.shape)

(1214943, 141)


In [6]:
# Tally the records per primary-site label to see which sites the MALEGEN files contain.
dfmalegen['PRIMSITE'].value_counts()

Prostate gland                                  1152617
Testis, NOS                                       32028
Descended testis                                  16107
Penis, NOS                                         6139
Glans penis                                        2526
Scrotum, NOS                                       1756
Prepuce                                            1346
Undescended testis                                  883
Body of penis                                       512
Spermatic cord                                      500
Overlapping lesion of penis                         267
Overlapping lesion of male genital organs            91
Male genital organs, NOS                             83
Epididymis                                           59
Other specified parts of male genital organs         29
Name: PRIMSITE, dtype: int64

In [7]:
# Keep only the prostate cases. The filter matches the decoded PRIMSITE label
# (case-insensitive substring 'prostate') rather than the raw site code C619
# mentioned in the text above.
# na=False treats a missing PRIMSITE label as "not prostate"; without it the
# mask would contain NaN for such rows and boolean indexing would raise.
prostate_mask = dfmalegen['PRIMSITE'].str.contains('prostate', case=False, na=False)
dfprostate = dfmalegen[prostate_mask]

# Confirm that only prostate labels remain after filtering.
dfprostate['PRIMSITE'].value_counts()

Prostate gland    1152617
Name: PRIMSITE, dtype: int64