In [1]:
from IPython.display import HTML

# Markup injected into the rendered notebook page:
#   * a script defining code_toggle(), which hides or shows every `div.input`
#     element depending on the `code_show` flag and then flips the flag;
#   * a document-ready hook that calls code_toggle() once, so the code cells
#     start out hidden;
#   * a one-button form that calls code_toggle() on each click.
toggle_markup = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Last expression of the cell, so the notebook renders it as rich HTML output.
HTML(toggle_markup)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pytz
from pytz import common_timezones, all_timezones
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
from datetime import datetime
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
matplotlib.style.use('fivethirtyeight')
matplotlib.style.use('seaborn-talk')

This IPython notebook illustrates the matching methodology we will use to match patients in the SEER-Medicare linked database. Comparing patient outcomes between those who receive proton beam therapy and those who receive traditional radiation therapy requires, as a first step, matching the two groups. One method of performing this matching is Propensity Score Matching. Here we illustrate a more direct method of matching using KD-trees, a data structure that allows efficient nearest-neighbor search in high-dimensional space.

We will first identify all patients who have been diagnosed with some type of head and neck cancer, as defined by the following primary site codes: Nasopharynx (C110, C111, C112, C113, C118, C119).
Then we will split this patient population into two groups:

- the treatment group, defined as the patients who received any form of radiation;
- the control group, defined as the patients who did not receive any form of radiation.

Operationally, these groups are defined by the column RADIATN, which takes the following values:


| Code	| Description |
|:---------:|:-------------:|
|0	|None; diagnosed at autopsy|
|1	|Beam radiation |
| 2 |	Radioactive implants |
|3	|Radioisotopes|
|4	|Combination of 1 with 2 or 3 |
|5	|Radiation, NOS - method or source not specified |
|6	|Other radiation (1973-1987 cases only)|
|7	|Patient or patient's guardian refused radiation therapy |
|8	|Radiation recommended, unknown if administered |
|9	|Unknown if radiation administered |

Thus, a value of 0 defines the control group, and the values (1,2,3,4,5,6) define the treatment group. We will then perform matching between the resulting treatment and control groups.

In [3]:
import seerhelp

# Paths (or handles) of the "other" SEER files, as reported by the helper module.
otherfiles = seerhelp.get_otherfiles()

# Clean the first four files and stack them into one frame with a fresh
# 0..n-1 index (ignore_index=True). Building the list inline means no per-file
# frame stays bound to a name, so no manual `del` is needed afterwards.
# Indexing otherfiles[i] (rather than slicing) still raises IndexError if
# fewer than four files are available.
dfother = pd.concat(
    [seerhelp.make_clean_dataframe(otherfiles[i]) for i in range(4)],
    ignore_index=True,
)

print(dfother.shape)

(1581838, 141)


In [5]:
# Number of records per primary-site label, most frequent first.
dfother['PRIMSITE'].value_counts()

Unknown primary site                                                     170561
Thyroid gland                                                            167854
Skin of trunk                                                            164671
Skin of upper limb and shoulder                                          131779
Skin of lower limb and hip                                                94148
Skin of other and unspecified parts of face                               84382
Bone marrow                                                               82123
Cerebral meninges                                                         58481
Skin of scalp and neck                                                    44256
Pituitary gland                                                           32340
Frontal lobe                                                              27753
Skin, NOS                                                                 26011
Base of tongue, NOS                     

In [6]:
# Rows whose primary-site label mentions "nasopharynx" (case-insensitive);
# rows with a missing label are treated as non-matching (na=False).
nasopharynx_rows = dfother['PRIMSITE'].str.contains('nasopharynx', case=False, na=False)
dfnasoorig = dfother[nasopharynx_rows]

# Breakdown of the selected records by nasopharynx sub-site.
dfnasoorig['PRIMSITE'].value_counts()


Nasopharynx, NOS                     8670
Posterior wall of nasopharynx        1230
Lateral wall of nasopharynx          1049
Overlapping lesion of nasopharynx     604
Superior wall of nasopharynx          212
Anterior wall of nasopharynx          198
Name: PRIMSITE, dtype: int64

In [7]:
# CSTUMSIZ values that are rejected by the cohort filter: descriptive labels
# and the codes 996-998. Each value is listed once (the original chain tested
# "Not applicable" three times).
# NOTE(review): 'msas' in the "Indicates no ..." label looks like a typo for
# 'mass'. The string is kept byte-identical because it must match whatever
# label the cleaned data actually contains -- confirm against seerhelp.
CSTUMSIZ_EXCLUDED = [
    "Unknown; size not stated; not stated in patient record",
    "Microscopic focus or foci only; no size of focus is given",
    "989 millimeters or larger",
    "Not applicable",
    "Described as less than 1 cm",
    "Described as less than 2 cm",
    "Described as less than 3 cm",
    "Described as less than 4 cm",
    "Described as less than 5 cm",
    'Indicates no msas or no tumor found; for example, when a tumor of a stated primary site is not found, but the tumor has metastasized',
    996,
    997,
    998,
]

# Earliest diagnosis year kept in the cohort.
# NOTE(review): presumably chosen because CSTUMSIZ is only populated from
# 2004 onwards -- confirm against the SEER data dictionary.
MIN_YEAR_DX = 2004

# A row is kept only if every condition below holds.
mask = (
    ~dfnasoorig['CSTUMSIZ'].isin(CSTUMSIZ_EXCLUDED)
    & (dfnasoorig['AGE_DX'] != "Unknown age")
    & (dfnasoorig['YR_BRTH'] != 'Unknown year of birth')
    & (dfnasoorig['srv_time_mon_flag'] == "Complete dates are available and there are more than 0 days of survival")
    & (dfnasoorig['YEAR_DX'] >= MIN_YEAR_DX)
    & (dfnasoorig['REC_NO'] == 1)
)

# Explicit copy so that later column assignments on dfnaso are unambiguous
# and do not trigger pandas' SettingWithCopyWarning.
dfnaso = dfnasoorig[mask].copy()

