In [2]:
#Executive Summary

#The goal of this project is to use classification models to predict the probability of a person testing positive for cancer
#based on 20 or so socioeconomic and health variables. My data was taken from IPUMS which provides census and survey data
#IPUMS is a massive database that collects American and international census records and surveys. Their data includes health,
#population, housing, education, and historical data. My metrics are confusion matrix, sensitivity, and accuracy. 

In [174]:
#Here is my presentation
#https://docs.google.com/presentation/d/1PAKrvbSBUsXfa_DXmC4J_ZrADge-Ve3rQv6icB3FCCM/edit#slide=id.gfff6384ca7_1_22

In [1]:
import pandas as pd

In [2]:
#Originally, I only wanted to include breast cancer as my binary target value. However, I realized my data is extreamely unbalanced.
#Only about 1.5 percent of respondents have cancer while 98.5 percent do not. Therefore, I decided to make my y-value be a binary
#value where those with any kind of cancer would be 1, while those without cancer would be 0.

#I imported my original data nhis_00013.csv

In [3]:
data = pd.read_csv('./Data_Files/nhis_00013.csv.gz')

In [4]:
#I have about 112 columns, I need to delete most of them and find the relevant ones using Tableau visualization as my feature selection.
#Classification values are not suited for correlation
data.columns

Index(['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'HHWEIGHT', 'LIVINGQTR',
       'PERNUM', 'NHISPID', 'HHX',
       ...
       'ALCAMT', 'CIGDAYMO', 'SMOKFREQNOW', 'STRONGFWK', 'HRSLEEP',
       'WORFEELEVL', 'DEPFREQ', 'UNHAPPY', 'MORTELIG', 'MORTWTSA'],
      dtype='object', length=112)

In [5]:
#CNBLAD - Ever had cancer: Bladder

In [6]:
data.shape

(248132, 112)

In [7]:
#For the rest of the cells, those who test positive for cancer will be represented by 1, while those testing negative for cancer will be 0.

In [8]:
data['bladder_cancer'] = data.loc[:, 'CNBLAD']

In [9]:
data['bladder_cancer'] = data['bladder_cancer'].replace([0, 1], 0)

In [10]:
data['bladder_cancer'] = data['bladder_cancer'].replace(2, 1)

In [11]:
data = data[data['bladder_cancer'] != 9]
data = data[data['bladder_cancer'] != 8]
data = data[data['bladder_cancer'] != 7]

In [12]:
data['bladder_cancer'].value_counts()

0    247836
1       256
Name: bladder_cancer, dtype: int64

In [13]:
#CNBLOD - Ever had cancer: Blood

In [14]:
data['blood_cancer'] = data.loc[:, 'CNBLOD']

In [15]:
data['blood_cancer'] = data['blood_cancer'].replace([0, 1], 0)
data['blood_cancer'] = data['blood_cancer'].replace(2, 1)

In [16]:
data = data[data['blood_cancer'] != 9]
data = data[data['blood_cancer'] != 8]
data = data[data['blood_cancer'] != 7]

In [17]:
data['blood_cancer'].value_counts()

0    248035
1        57
Name: blood_cancer, dtype: int64

In [18]:
#CNBONE - Ever had cancer: Bone

In [19]:
data['bone_cancer'] = data.loc[:, 'CNBONE']

In [20]:
data['bone_cancer'] = data['bone_cancer'].replace([0, 1], 0)
data['bone_cancer'] = data['bone_cancer'].replace(2, 1)

In [21]:
data = data[data['bone_cancer'] != 9]
data = data[data['bone_cancer'] != 8]
data = data[data['bone_cancer'] != 7]

In [22]:
data['bone_cancer'].value_counts()

0    248007
1        85
Name: bone_cancer, dtype: int64

In [23]:
#CNBRAN - Ever had cancer: Brain

In [24]:
data['brain_cancer'] = data.loc[:, 'CNBRAN']

In [25]:
data['brain_cancer'] = data['brain_cancer'].replace([0, 1], 0)
data['brain_cancer'] = data['brain_cancer'].replace(2, 1)

In [26]:
data = data[data['brain_cancer'] != 9]
data = data[data['brain_cancer'] != 8]
data = data[data['brain_cancer'] != 7]

In [27]:
data['brain_cancer'].value_counts()

0    248027
1        65
Name: brain_cancer, dtype: int64

In [28]:
#CNBRES - Ever had cancer: Breast

In [29]:
data['breast_cancer'] = data.loc[:, 'CNBRES']

In [30]:
data['breast_cancer'] = data['breast_cancer'].replace([0, 1], 0)
data['breast_cancer'] = data['breast_cancer'].replace(2, 1)

In [31]:
data = data[data['breast_cancer'] != 9]
data = data[data['breast_cancer'] != 8]
data = data[data['breast_cancer'] != 7]

In [32]:
data['breast_cancer'].value_counts()

0    246251
1      1841
Name: breast_cancer, dtype: int64

In [33]:
#CNCERV - Ever had cancer: Cervix

In [34]:
data['cervix_cancer'] = data.loc[:, 'CNCERV']

In [35]:
data['cervix_cancer'] = data['cervix_cancer'].replace([0, 1], 0)
data['cervix_cancer'] = data['cervix_cancer'].replace(2, 1)

In [36]:
data = data[data['cervix_cancer'] != 9]
data = data[data['cervix_cancer'] != 8]
data = data[data['cervix_cancer'] != 7]

In [37]:
data['cervix_cancer'].value_counts()

0    247561
1       531
Name: cervix_cancer, dtype: int64

In [38]:
#CNCOLN - Ever had cancer: Colon

In [39]:
data['colon_cancer'] = data.loc[:, 'CNCOLN']

In [40]:
data['colon_cancer'] = data['colon_cancer'].replace([0, 1], 0)
data['colon_cancer'] = data['colon_cancer'].replace(2, 1)

In [41]:
data = data[data['colon_cancer'] != 9]
data = data[data['colon_cancer'] != 8]
data = data[data['colon_cancer'] != 7]

In [42]:
data['colon_cancer'].value_counts()

0    247530
1       562
Name: colon_cancer, dtype: int64

In [43]:
#CNESOP - Ever had cancer: Esophagus

In [44]:
data['esophagus_cancer'] = data.loc[:, 'CNESOP']

In [45]:
data['esophagus_cancer'] = data['esophagus_cancer'].replace([0, 1], 0)
data['esophagus_cancer'] = data['esophagus_cancer'].replace(2, 1)

In [46]:
data = data[data['esophagus_cancer'] != 9]
data = data[data['esophagus_cancer'] != 8]
data = data[data['esophagus_cancer'] != 7]

In [47]:
data['esophagus_cancer'].value_counts()

0    248032
1        60
Name: esophagus_cancer, dtype: int64

In [48]:
#CNGALL - Ever had cancer: Gall bladder

In [49]:
data['gallbladder_cancer'] = data.loc[:, 'CNGALL']

In [50]:
data['gallbladder_cancer'] = data['gallbladder_cancer'].replace([0, 1], 0)
data['gallbladder_cancer'] = data['gallbladder_cancer'].replace(2, 1)

In [51]:
data = data[data['gallbladder_cancer'] != 9]
data = data[data['gallbladder_cancer'] != 8]
data = data[data['gallbladder_cancer'] != 7]

In [52]:
data['gallbladder_cancer'].value_counts()

0    248082
1        10
Name: gallbladder_cancer, dtype: int64

In [53]:
#CNKIDN - Ever had cancer: Kidney

In [54]:
data['kidney_cancer'] = data.loc[:, 'CNKIDN']

In [55]:
data['kidney_cancer'] = data['kidney_cancer'].replace([0, 1], 0)
data['kidney_cancer'] = data['kidney_cancer'].replace(2, 1)

In [56]:
data = data[data['kidney_cancer'] != 9]
data = data[data['kidney_cancer'] != 8]
data = data[data['kidney_cancer'] != 7]

In [57]:
data['kidney_cancer'].value_counts()

0    247881
1       211
Name: kidney_cancer, dtype: int64

In [58]:
#CNLARX - Ever had cancer: Larynx-windpipe

In [59]:
data['larynx_cancer'] = data.loc[:, 'CNLARX']

In [60]:
data['larynx_cancer'] = data['larynx_cancer'].replace([0, 1], 0)
data['larynx_cancer'] = data['larynx_cancer'].replace(2, 1)

In [61]:
data = data[data['larynx_cancer'] != 9]
data = data[data['larynx_cancer'] != 8]
data = data[data['larynx_cancer'] != 7]

In [62]:
data['larynx_cancer'].value_counts()

0    248064
1        28
Name: larynx_cancer, dtype: int64

In [63]:
#CNLEUK - Ever had cancer: Leukemia

In [64]:
data['leukemia_cancer'] = data.loc[:, 'CNLEUK']

In [65]:
data['leukemia_cancer'] = data['leukemia_cancer'].replace([0, 1], 0)
data['leukemia_cancer'] = data['leukemia_cancer'].replace(2, 1)

In [66]:
data = data[data['leukemia_cancer'] != 9]
data = data[data['leukemia_cancer'] != 8]
data = data[data['leukemia_cancer'] != 7]

In [67]:
data['leukemia_cancer'].value_counts()

0    247953
1       139
Name: leukemia_cancer, dtype: int64

In [68]:
#CNLIVR - Ever had cancer: Liver

In [69]:
data['liver_cancer'] = data.loc[:, 'CNLIVR']

In [70]:
data['liver_cancer'] = data['liver_cancer'].replace([0, 1], 0)
data['liver_cancer'] = data['liver_cancer'].replace(2, 1)

In [71]:
data = data[data['liver_cancer'] != 9]
data = data[data['liver_cancer'] != 8]
data = data[data['liver_cancer'] != 7]

In [72]:
data['liver_cancer'].value_counts()

0    248025
1        67
Name: liver_cancer, dtype: int64

In [73]:
#CNLUNG - Ever had cancer: Lung

In [248]:
data['lung_cancer'] = data.loc[:, 'CNLUNG']

In [249]:
data['lung_cancer'] = data['lung_cancer'].replace([0, 1], 0)
data['lung_cancer'] = data['lung_cancer'].replace(2, 1)

In [250]:
data = data[data['lung_cancer'] != 9]
data = data[data['lung_cancer'] != 8]
data = data[data['lung_cancer'] != 7]

In [251]:
data['lung_cancer'].value_counts()

0    247774
1       318
Name: lung_cancer, dtype: int64

In [252]:
#CNLYMP - Ever had cancer: Lymphoma

In [253]:
data['Lymphoma_cancer'] = data.loc[:, 'CNLYMP']

In [254]:
data['Lymphoma_cancer'] = data['Lymphoma_cancer'].replace([0, 1], 0)
data['Lymphoma_cancer'] = data['Lymphoma_cancer'].replace(2, 1)

In [255]:
data = data[data['Lymphoma_cancer'] != 9]
data = data[data['Lymphoma_cancer'] != 8]
data = data[data['Lymphoma_cancer'] != 7]

In [256]:
data['Lymphoma_cancer'].value_counts()

0    247787
1       305
Name: Lymphoma_cancer, dtype: int64

In [257]:
#CNMELN - Ever had cancer: Melanoma

In [258]:
data['Melanoma_cancer'] = data.loc[:, 'CNMELN']

In [259]:
data['Melanoma_cancer'] = data['Melanoma_cancer'].replace([0, 1], 0)
data['Melanoma_cancer'] = data['Melanoma_cancer'].replace(2, 1)

In [260]:
data = data[data['Melanoma_cancer'] != 9]
data = data[data['Melanoma_cancer'] != 8]
data = data[data['Melanoma_cancer'] != 7]

In [261]:
data['Melanoma_cancer'].value_counts()

0    247356
1       736
Name: Melanoma_cancer, dtype: int64

In [262]:
#CNMOTH - Ever had cancer: Mouth/tongue/lip

In [263]:
data['Mouth_cancer'] = data.loc[:, 'CNMOTH']

In [264]:
data['Mouth_cancer'] = data['Mouth_cancer'].replace([0, 1], 0)
data['Mouth_cancer'] = data['Mouth_cancer'].replace(2, 1)

In [265]:
data = data[data['Mouth_cancer'] != 9]
data = data[data['Mouth_cancer'] != 8]
data = data[data['Mouth_cancer'] != 7]

In [266]:
data['Mouth_cancer'].value_counts()

0    248051
1        41
Name: Mouth_cancer, dtype: int64

In [267]:
#CNOTHR - Ever had cancer: Other kind

In [268]:
data['Other_cancer'] = data.loc[:, 'CNOTHR']

In [269]:
data['Other_cancer'] = data['Other_cancer'].replace([0, 1], 0)
data['Other_cancer'] = data['Other_cancer'].replace(2, 1)

In [270]:
data = data[data['Other_cancer'] != 9]
data = data[data['Other_cancer'] != 8]
data = data[data['Other_cancer'] != 7]

In [271]:
data['Other_cancer'].value_counts()

0    247578
1       514
Name: Other_cancer, dtype: int64

In [272]:
#CNOVAR - Ever had cancer: Ovarian

In [273]:
data['Ovarian_cancer'] = data.loc[:, 'CNOVAR']

In [274]:
data['Ovarian_cancer'] = data['Ovarian_cancer'].replace([0, 1], 0)
data['Ovarian_cancer'] = data['Ovarian_cancer'].replace(2, 1)

In [275]:
data = data[data['Ovarian_cancer'] != 9]
data = data[data['Ovarian_cancer'] != 8]
data = data[data['Ovarian_cancer'] != 7]

In [276]:
data['Ovarian_cancer'].value_counts()

0    247890
1       202
Name: Ovarian_cancer, dtype: int64

In [277]:
#CNPANC - Ever had cancer: Pancreatic

In [278]:
data['Pancreatic_cancer'] = data.loc[:, 'CNPANC']

In [279]:
data['Pancreatic_cancer'] = data['Pancreatic_cancer'].replace([0, 1], 0)
data['Pancreatic_cancer'] = data['Pancreatic_cancer'].replace(2, 1)

In [280]:
data = data[data['Pancreatic_cancer'] != 9]
data = data[data['Pancreatic_cancer'] != 8]
data = data[data['Pancreatic_cancer'] != 7]

In [281]:
data['Pancreatic_cancer'].value_counts()

0    248040
1        52
Name: Pancreatic_cancer, dtype: int64

In [282]:
#CNPROS - Ever had cancer: Prostate

In [283]:
data['Prostate_cancer'] = data.loc[:, 'CNPROS']

In [284]:
data['Prostate_cancer'] = data['Prostate_cancer'].replace([0, 1], 0)
data['Prostate_cancer'] = data['Prostate_cancer'].replace(2, 1)

In [285]:
data = data[data['Prostate_cancer'] != 9]
data = data[data['Prostate_cancer'] != 8]
data = data[data['Prostate_cancer'] != 7]

In [286]:
data['Prostate_cancer'].value_counts()

0    246942
1      1150
Name: Prostate_cancer, dtype: int64

In [287]:
#CNRECT - Ever had cancer: Rectal

In [288]:
data['Rectal_cancer'] = data.loc[:, 'CNRECT']

In [289]:
data['Rectal_cancer'] = data['Rectal_cancer'].replace([0, 1], 0)
data['Rectal_cancer'] = data['Rectal_cancer'].replace(2, 1)

In [290]:
data = data[data['Rectal_cancer'] != 9]
data = data[data['Rectal_cancer'] != 8]
data = data[data['Rectal_cancer'] != 7]

In [291]:
data['Rectal_cancer'].value_counts()

0    248028
1        64
Name: Rectal_cancer, dtype: int64

In [292]:
#CNSKDK - Ever had cancer: Skin (don't know what kind)

In [293]:
data['Skin_cancer'] = data.loc[:, 'CNSKDK']

In [294]:
data['Skin_cancer'] = data['Skin_cancer'].replace([0, 1], 0)
data['Skin_cancer'] = data['Skin_cancer'].replace(2, 1)

In [295]:
data = data[data['Skin_cancer'] != 9]
data = data[data['Skin_cancer'] != 8]
data = data[data['Skin_cancer'] != 7]

In [296]:
data['Skin_cancer'].value_counts()

0    247191
1       901
Name: Skin_cancer, dtype: int64

In [297]:
#CNSKNM - Ever had cancer: Skin (non-melanoma)

In [298]:
data['Skin1_cancer'] = data.loc[:, 'CNSKNM']

In [299]:
data['Skin1_cancer'] = data['Skin1_cancer'].replace([0, 1], 0)
data['Skin1_cancer'] = data['Skin1_cancer'].replace(2, 1)

In [300]:
data = data[data['Skin1_cancer'] != 9]
data = data[data['Skin1_cancer'] != 8]
data = data[data['Skin1_cancer'] != 7]

In [301]:
data['Skin1_cancer'].value_counts()

0    245989
1      2103
Name: Skin1_cancer, dtype: int64

In [302]:
#CNSOFT - Ever had cancer: Soft tissue (muscle)

In [303]:
data['Skin2_cancer'] = data.loc[:, 'CNSOFT']

In [304]:
data['Skin2_cancer'] = data['Skin2_cancer'].replace([0, 1], 0)
data['Skin2_cancer'] = data['Skin2_cancer'].replace(2, 1)

In [305]:
data = data[data['Skin2_cancer'] != 9]
data = data[data['Skin2_cancer'] != 8]
data = data[data['Skin2_cancer'] != 7]

In [306]:
data['Skin2_cancer'].value_counts()

0    248052
1        40
Name: Skin2_cancer, dtype: int64

In [307]:
#CNSTOM - Ever had cancer: Stomach

In [308]:
data['Stomach_cancer'] = data.loc[:, 'CNSTOM']

In [309]:
data['Stomach_cancer'] = data['Stomach_cancer'].replace([0, 1], 0)
data['Stomach_cancer'] = data['Stomach_cancer'].replace(2, 1)

In [310]:
data = data[data['Stomach_cancer'] != 9]
data = data[data['Stomach_cancer'] != 8]
data = data[data['Stomach_cancer'] != 7]

In [311]:
data['Stomach_cancer'].value_counts()

0    248030
1        62
Name: Stomach_cancer, dtype: int64

In [312]:
#CNTEST - Ever had cancer: Testicular

In [313]:
data['Testicular_cancer'] = data.loc[:, 'CNTEST']

In [314]:
data['Testicular_cancer'] = data['Testicular_cancer'].replace([0, 1], 0)
data['Testicular_cancer'] = data['Testicular_cancer'].replace(2, 1)

In [315]:
data = data[data['Testicular_cancer'] != 9]
data = data[data['Testicular_cancer'] != 8]
data = data[data['Testicular_cancer'] != 7]

In [316]:
data['Testicular_cancer'].value_counts()

0    248014
1        78
Name: Testicular_cancer, dtype: int64

In [317]:
#CNTHRO - Ever had cancer: Throat-pharynx

In [318]:
data['Throat_cancer'] = data.loc[:, 'CNTHRO']

In [319]:
data['Throat_cancer'] = data['Throat_cancer'].replace([0, 1], 0)
data['Throat_cancer'] = data['Throat_cancer'].replace(2, 1)

In [320]:
data = data[data['Throat_cancer'] != 9]
data = data[data['Throat_cancer'] != 8]
data = data[data['Throat_cancer'] != 7]

In [321]:
data['Throat_cancer'].value_counts()

0    248012
1        80
Name: Throat_cancer, dtype: int64

In [322]:
#CNTHYR - Ever had cancer: Thyroid

In [323]:
data['Thyroid_cancer'] = data.loc[:, 'CNTHYR']

In [324]:
data['Thyroid_cancer'] = data['Thyroid_cancer'].replace([0, 1], 0)
data['Thyroid_cancer'] = data['Thyroid_cancer'].replace(2, 1)

In [325]:
data = data[data['Thyroid_cancer'] != 9]
data = data[data['Thyroid_cancer'] != 8]
data = data[data['Thyroid_cancer'] != 7]

In [326]:
data['Thyroid_cancer'].value_counts()

0    247827
1       265
Name: Thyroid_cancer, dtype: int64

In [327]:
#CNUTER - Ever had cancer: Uterine

In [328]:
data['Uterine_cancer'] = data.loc[:, 'CNUTER']

In [329]:
data['Uterine_cancer'] = data['Uterine_cancer'].replace([0, 1], 0)
data['Uterine_cancer'] = data['Uterine_cancer'].replace(2, 1)

In [330]:
data = data[data['Uterine_cancer'] != 9]
data = data[data['Uterine_cancer'] != 8]
data = data[data['Uterine_cancer'] != 7]

In [331]:
data['Uterine_cancer'].value_counts()

0    247705
1       387
Name: Uterine_cancer, dtype: int64

In [332]:
#Now I'm finished turning all of the cancer variables into binary numerical ones
print(data.columns.tolist())

['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'HHWEIGHT', 'LIVINGQTR', 'PERNUM', 'NHISPID', 'HHX', 'FMX', 'PX', 'PERWEIGHT', 'SAMPWEIGHT', 'FWEIGHT', 'SUPP1WT', 'ASTATFLG', 'CSTATFLG', 'AGE', 'SEX', 'SEXORIEN', 'MARST', 'RACENEW', 'NOWAF', 'ARMFCC', 'EDUC', 'EMPSTAT', 'OCC', 'OCC1995', 'IND1995', 'HOURSWRK', 'PAIDSICK', 'CLASSWK2', 'OCCUPN204', 'INDSTRN204', 'POORYN', 'INCFAM07ON', 'WELFMO', 'GOTWELF', 'GOTNEWELF', 'GOTSSIWHY', 'GOTWAGE', 'GOTSEMP', 'STAMPMO', 'FSBALANC', 'FSHUNGRY', 'FSSTATDET', 'OWNERSHIP', 'LOWRENT', 'HEALTH', 'BMI', 'NBHDTRUST', 'DELAYCOST', 'YBARMEDS', 'YBARMENTAL', 'WORMEDBILL', 'YDELAYMEDYR', 'HIPCONAFFORD', 'HCSATIS12M', 'HIUNABLEPAY', 'HINOTCOVE', 'HIPRIVATEE', 'HINOTCOV', 'ADDEV', 'ANGIPECEV', 'ARTHGLUPEV', 'ASTHMAEV', 'AUTISMEV', 'CHEARTDIEV', 'CHOLHIGHEV', 'CNBLAD', 'CNBLOD', 'CNBONE', 'CNBRAN', 'CNBRES', 'CNCERV', 'CNCOLN', 'CNESOP', 'CNGALL', 'CNKIDN', 'CNLARX', 'CNLEUK', 'CNLIVR', 'CNLUNG', 'CNLYMP', 'CNMELN', 'CNMOTH', 'CNOTHR', 'CNOVAR', 'CNPANC', 'CN

In [333]:
#all of my cancer data, turned into binary values of 0 for no cancer, 1 for with cancer
cols = data[['bladder_cancer', 'blood_cancer', 'bone_cancer', 'brain_cancer', 'breast_cancer', 
             'cervix_cancer', 'colon_cancer', 'esophagus_cancer', 'gallbladder_cancer', 
             'kidney_cancer', 'larynx_cancer', 'leukemia_cancer', 'liver_cancer', 'lung_cancer', 
             'Lymphoma_cancer', 'Melanoma_cancer', 'Mouth_cancer', 'Other_cancer', 'Ovarian_cancer', 
             'Pancreatic_cancer', 'Prostate_cancer', 'Rectal_cancer', 'Skin_cancer', 'Skin1_cancer', 
             'Skin2_cancer', 'Stomach_cancer', 'Testicular_cancer', 'Throat_cancer', 'Thyroid_cancer', 
             'Uterine_cancer']]

In [334]:
#I want my target column to be cancer_count. Roughly 3.9% of the population have cancer, this makes my data more balanced
#compared to if I had only used breast cancer which has a value of only 1.7%. It's not ideal but this is what I have to work with.
sum([x if x == 0 else 1 for x in cols.sum(axis=1)]) / data.shape[0]
#cols.sum(axis=1)

0.03891298389307193

In [335]:
#This function makes it so that anyone who tests positive for cancer will be represented by the value 1. 
#Those who test negative for cancer will be represented by the value 0.
def my_list(data):
    if ((data['bladder_cancer'] == 1) or (data['blood_cancer'] == 1) or (data['bone_cancer'] == 1) 
    or (data['brain_cancer'] == 1) or (data['breast_cancer'] == 1) or (data['cervix_cancer'] == 1)
    or (data['colon_cancer'] == 1) or (data['esophagus_cancer'] == 1) or (data['gallbladder_cancer'] == 1)
    or (data['kidney_cancer'] == 1) or (data['larynx_cancer'] == 1) or (data['leukemia_cancer'] == 1)
    or (data['liver_cancer'] == 1) or (data['lung_cancer'] == 1) or (data['Lymphoma_cancer'] == 1)
    or (data['Melanoma_cancer'] == 1) or (data['Mouth_cancer'] == 1) or (data['Other_cancer'] == 1)
    or (data['Ovarian_cancer'] == 1) or (data['Pancreatic_cancer'] == 1) or (data['Prostate_cancer'] == 1)
    or (data['Rectal_cancer'] == 1) or (data['Skin_cancer'] == 1) or (data['Skin1_cancer'] == 1)
    or (data['Skin2_cancer'] == 1) or (data['Stomach_cancer'] == 1) or (data['Testicular_cancer'] == 1)
    or (data['Throat_cancer'] == 1) or (data['Thyroid_cancer'] == 1) or (data['Uterine_cancer'] == 1)):
        return 1
    else:
        return 0

In [336]:
data['cancer_count'] = data.apply(my_list, axis=1)

In [337]:
9654 / 247705

0.03897377929391817

In [338]:
data[['cancer_count']].head()

Unnamed: 0,cancer_count
0,0
1,0
2,0
3,0
4,0


In [339]:
#I'm dropping these cancer binary variables because they are no longer needed. Only want the cancer_count variable 
#data.drop(columns=['bladder_cancer', 'blood_cancer', 'bone_cancer', 'brain_cancer', 'breast_cancer', 
#             'cervix_cancer', 'colon_cancer', 'esophagus_cancer', 'gallbladder_cancer', 
#             'kidney_cancer', 'larynx_cancer', 'leukemia_cancer', 'liver_cancer', 'lung_cancer', 
#             'Lymphoma_cancer', 'Melanoma_cancer', 'Mouth_cancer', 'Other_cancer', 'Ovarian_cancer', 
#             'Pancreatic_cancer', 'Prostate_cancer', 'Rectal_cancer', 'Skin_cancer', 'Skin1_cancer', 
#             'Skin2_cancer', 'Stomach_cancer', 'Testicular_cancer', 'Throat_cancer', 'Thyroid_cancer', 
#             'Uterine_cancer'], inplace=True)

In [340]:
#Here is my data with the column cancer_count which is my y-value.
#Cancer_count variable is the last column on the right side
data.head()

Unnamed: 0,YEAR,SERIAL,STRATA,PSU,NHISHID,HHWEIGHT,LIVINGQTR,PERNUM,NHISPID,HHX,...,Rectal_cancer,Skin_cancer,Skin1_cancer,Skin2_cancer,Stomach_cancer,Testicular_cancer,Throat_cancer,Thyroid_cancer,Uterine_cancer,cancer_count
0,2016,1,7131,20,2016000001,5621,21,1,20160000010101,1,...,0,0,0,0,0,0,0,0,0,0
1,2016,1,7131,20,2016000001,5621,21,2,20160000010102,1,...,0,0,0,0,0,0,0,0,0,0
2,2016,1,7131,20,2016000001,5621,21,3,20160000010201,1,...,0,0,0,0,0,0,0,0,0,0
3,2016,2,7149,2,2016000002,1752,21,1,20160000020101,2,...,0,0,0,0,0,0,0,0,0,0
4,2016,3,7129,2,2016000003,4442,21,1,20160000030101,3,...,0,0,0,0,0,0,0,0,0,0


In [341]:
#My data is exported as cancer_data.csv
data.to_csv('cancer_data.csv') 