In [2]:
#Executive Summary

#The goal of this project is to use classification models to predict the probability that a person has ever had cancer,
#based on roughly 20 socioeconomic and health variables. My data was taken from IPUMS, a large database that
#collects American and international census records and surveys. Its data covers health,
#population, housing, education, and historical records. My evaluation metrics are the confusion matrix, sensitivity, and accuracy.

In [3]:
#Here is my presentation
#https://docs.google.com/presentation/d/1PAKrvbSBUsXfa_DXmC4J_ZrADge-Ve3rQv6icB3FCCM/edit#slide=id.gfff6384ca7_1_22

In [4]:
import pandas as pd

In [5]:
#Originally, I only wanted to use breast cancer as my binary target value. However, I realized my data is extremely imbalanced:
#only about 1.5 percent of respondents have breast cancer while 98.5 percent do not. Therefore, I decided to make my y-value a binary
#value where those with any kind of cancer are 1, while those without cancer are 0.

#I imported my original data nhis_00013.csv

In [6]:
data = pd.read_csv('./Data_Files/nhis_00013.csv.gz')

In [6]:
#I have about 112 columns. I need to drop most of them and keep only the relevant ones, using Tableau visualizations for feature selection.
#Categorical (coded) values are not well suited to correlation analysis.
data.columns

Index(['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'HHWEIGHT', 'LIVINGQTR',
       'PERNUM', 'NHISPID', 'HHX',
       ...
       'ALCAMT', 'CIGDAYMO', 'SMOKFREQNOW', 'STRONGFWK', 'HRSLEEP',
       'WORFEELEVL', 'DEPFREQ', 'UNHAPPY', 'MORTELIG', 'MORTWTSA'],
      dtype='object', length=112)

In [7]:
#CNBLAD - Ever had cancer: Bladder

In [8]:
data.shape

(248132, 112)

In [9]:
#In the cells below, respondents who report having had a given cancer are coded as 1, and those who do not are coded as 0.

In [10]:
data['bladder_cancer'] = data.loc[:, 'CNBLAD']

In [11]:
data['bladder_cancer'] = data['bladder_cancer'].replace([0, 1], 0)

In [12]:
data['bladder_cancer'] = data['bladder_cancer'].replace(2, 1)

In [13]:
# Keep only rows whose bladder_cancer value is none of 7, 8 or 9.
data = data[~data['bladder_cancer'].isin([7, 8, 9])]

In [14]:
data['bladder_cancer'].value_counts()

0    247836
1       256
Name: bladder_cancer, dtype: int64

In [15]:
#CNBLOD - Ever had cancer: Blood

In [16]:
data['blood_cancer'] = data.loc[:, 'CNBLOD']

In [17]:
# Recode blood_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['blood_cancer'] = data['blood_cancer'].replace([0, 1], 0).replace(2, 1)

In [18]:
# Keep only rows whose blood_cancer value is none of 7, 8 or 9.
data = data[~data['blood_cancer'].isin([7, 8, 9])]

In [19]:
data['blood_cancer'].value_counts()

0    248035
1        57
Name: blood_cancer, dtype: int64

In [20]:
#CNBONE - Ever had cancer: Bone

In [21]:
data['bone_cancer'] = data.loc[:, 'CNBONE']

In [22]:
# Recode bone_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['bone_cancer'] = data['bone_cancer'].replace([0, 1], 0).replace(2, 1)

In [23]:
# Keep only rows whose bone_cancer value is none of 7, 8 or 9.
data = data[~data['bone_cancer'].isin([7, 8, 9])]

In [24]:
data['bone_cancer'].value_counts()

0    248007
1        85
Name: bone_cancer, dtype: int64

In [25]:
#CNBRAN - Ever had cancer: Brain

In [26]:
data['brain_cancer'] = data.loc[:, 'CNBRAN']

In [27]:
# Recode brain_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['brain_cancer'] = data['brain_cancer'].replace([0, 1], 0).replace(2, 1)

In [28]:
# Keep only rows whose brain_cancer value is none of 7, 8 or 9.
data = data[~data['brain_cancer'].isin([7, 8, 9])]

In [29]:
data['brain_cancer'].value_counts()

0    248027
1        65
Name: brain_cancer, dtype: int64

In [30]:
#CNBRES - Ever had cancer: Breast

In [31]:
data['breast_cancer'] = data.loc[:, 'CNBRES']

In [32]:
# Recode breast_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['breast_cancer'] = data['breast_cancer'].replace([0, 1], 0).replace(2, 1)

In [33]:
# Keep only rows whose breast_cancer value is none of 7, 8 or 9.
data = data[~data['breast_cancer'].isin([7, 8, 9])]

In [34]:
data['breast_cancer'].value_counts()

0    246251
1      1841
Name: breast_cancer, dtype: int64

In [35]:
#CNCERV - Ever had cancer: Cervix

In [36]:
data['cervix_cancer'] = data.loc[:, 'CNCERV']

In [37]:
# Recode cervix_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['cervix_cancer'] = data['cervix_cancer'].replace([0, 1], 0).replace(2, 1)

In [38]:
# Keep only rows whose cervix_cancer value is none of 7, 8 or 9.
data = data[~data['cervix_cancer'].isin([7, 8, 9])]

In [39]:
data['cervix_cancer'].value_counts()

0    247561
1       531
Name: cervix_cancer, dtype: int64

In [40]:
#CNCOLN - Ever had cancer: Colon

In [41]:
data['colon_cancer'] = data.loc[:, 'CNCOLN']

In [42]:
# Recode colon_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['colon_cancer'] = data['colon_cancer'].replace([0, 1], 0).replace(2, 1)

In [43]:
# Keep only rows whose colon_cancer value is none of 7, 8 or 9.
data = data[~data['colon_cancer'].isin([7, 8, 9])]

In [44]:
data['colon_cancer'].value_counts()

0    247530
1       562
Name: colon_cancer, dtype: int64

In [45]:
#CNESOP - Ever had cancer: Esophagus

In [46]:
data['esophagus_cancer'] = data.loc[:, 'CNESOP']

In [47]:
# Recode esophagus_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['esophagus_cancer'] = data['esophagus_cancer'].replace([0, 1], 0).replace(2, 1)

In [48]:
# Keep only rows whose esophagus_cancer value is none of 7, 8 or 9.
data = data[~data['esophagus_cancer'].isin([7, 8, 9])]

In [49]:
data['esophagus_cancer'].value_counts()

0    248032
1        60
Name: esophagus_cancer, dtype: int64

In [50]:
#CNGALL - Ever had cancer: Gall bladder

In [51]:
data['gallbladder_cancer'] = data.loc[:, 'CNGALL']

In [52]:
# Recode gallbladder_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['gallbladder_cancer'] = data['gallbladder_cancer'].replace([0, 1], 0).replace(2, 1)

In [53]:
# Keep only rows whose gallbladder_cancer value is none of 7, 8 or 9.
data = data[~data['gallbladder_cancer'].isin([7, 8, 9])]

In [54]:
data['gallbladder_cancer'].value_counts()

0    248082
1        10
Name: gallbladder_cancer, dtype: int64

In [55]:
#CNKIDN - Ever had cancer: Kidney

In [56]:
data['kidney_cancer'] = data.loc[:, 'CNKIDN']

In [57]:
# Recode kidney_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['kidney_cancer'] = data['kidney_cancer'].replace([0, 1], 0).replace(2, 1)

In [58]:
# Keep only rows whose kidney_cancer value is none of 7, 8 or 9.
data = data[~data['kidney_cancer'].isin([7, 8, 9])]

In [59]:
data['kidney_cancer'].value_counts()

0    247881
1       211
Name: kidney_cancer, dtype: int64

In [60]:
#CNLARX - Ever had cancer: Larynx-windpipe

In [61]:
data['larynx_cancer'] = data.loc[:, 'CNLARX']

In [62]:
# Recode larynx_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['larynx_cancer'] = data['larynx_cancer'].replace([0, 1], 0).replace(2, 1)

In [63]:
# Keep only rows whose larynx_cancer value is none of 7, 8 or 9.
data = data[~data['larynx_cancer'].isin([7, 8, 9])]

In [64]:
data['larynx_cancer'].value_counts()

0    248064
1        28
Name: larynx_cancer, dtype: int64

In [65]:
#CNLEUK - Ever had cancer: Leukemia

In [66]:
data['leukemia_cancer'] = data.loc[:, 'CNLEUK']

In [67]:
# Recode leukemia_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['leukemia_cancer'] = data['leukemia_cancer'].replace([0, 1], 0).replace(2, 1)

In [68]:
# Keep only rows whose leukemia_cancer value is none of 7, 8 or 9.
data = data[~data['leukemia_cancer'].isin([7, 8, 9])]

In [69]:
data['leukemia_cancer'].value_counts()

0    247953
1       139
Name: leukemia_cancer, dtype: int64

In [70]:
#CNLIVR - Ever had cancer: Liver

In [71]:
data['liver_cancer'] = data.loc[:, 'CNLIVR']

In [72]:
# Recode liver_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['liver_cancer'] = data['liver_cancer'].replace([0, 1], 0).replace(2, 1)

In [73]:
# Keep only rows whose liver_cancer value is none of 7, 8 or 9.
data = data[~data['liver_cancer'].isin([7, 8, 9])]

In [74]:
data['liver_cancer'].value_counts()

0    248025
1        67
Name: liver_cancer, dtype: int64

In [75]:
#CNLUNG - Ever had cancer: Lung

In [76]:
data['lung_cancer'] = data.loc[:, 'CNLUNG']

In [77]:
# Recode lung_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['lung_cancer'] = data['lung_cancer'].replace([0, 1], 0).replace(2, 1)

In [78]:
# Keep only rows whose lung_cancer value is none of 7, 8 or 9.
data = data[~data['lung_cancer'].isin([7, 8, 9])]

In [79]:
data['lung_cancer'].value_counts()

0    247774
1       318
Name: lung_cancer, dtype: int64

In [80]:
#CNLYMP - Ever had cancer: Lymphoma

In [81]:
data['Lymphoma_cancer'] = data.loc[:, 'CNLYMP']

In [82]:
# Recode Lymphoma_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Lymphoma_cancer'] = data['Lymphoma_cancer'].replace([0, 1], 0).replace(2, 1)

In [83]:
# Keep only rows whose Lymphoma_cancer value is none of 7, 8 or 9.
data = data[~data['Lymphoma_cancer'].isin([7, 8, 9])]

In [84]:
data['Lymphoma_cancer'].value_counts()

0    247787
1       305
Name: Lymphoma_cancer, dtype: int64

In [85]:
#CNMELN - Ever had cancer: Melanoma

In [86]:
data['Melanoma_cancer'] = data.loc[:, 'CNMELN']

In [87]:
# Recode Melanoma_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Melanoma_cancer'] = data['Melanoma_cancer'].replace([0, 1], 0).replace(2, 1)

In [88]:
# Keep only rows whose Melanoma_cancer value is none of 7, 8 or 9.
data = data[~data['Melanoma_cancer'].isin([7, 8, 9])]

In [89]:
data['Melanoma_cancer'].value_counts()

0    247356
1       736
Name: Melanoma_cancer, dtype: int64

In [90]:
#CNMOTH - Ever had cancer: Mouth/tongue/lip

In [91]:
data['Mouth_cancer'] = data.loc[:, 'CNMOTH']

In [92]:
# Recode Mouth_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Mouth_cancer'] = data['Mouth_cancer'].replace([0, 1], 0).replace(2, 1)

In [93]:
# Keep only rows whose Mouth_cancer value is none of 7, 8 or 9.
data = data[~data['Mouth_cancer'].isin([7, 8, 9])]

In [94]:
data['Mouth_cancer'].value_counts()

0    248051
1        41
Name: Mouth_cancer, dtype: int64

In [95]:
#CNOTHR - Ever had cancer: Other kind

In [96]:
data['Other_cancer'] = data.loc[:, 'CNOTHR']

In [97]:
# Recode Other_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Other_cancer'] = data['Other_cancer'].replace([0, 1], 0).replace(2, 1)

In [98]:
# Keep only rows whose Other_cancer value is none of 7, 8 or 9.
data = data[~data['Other_cancer'].isin([7, 8, 9])]

In [99]:
data['Other_cancer'].value_counts()

0    247578
1       514
Name: Other_cancer, dtype: int64

In [100]:
#CNOVAR - Ever had cancer: Ovarian

In [101]:
data['Ovarian_cancer'] = data.loc[:, 'CNOVAR']

In [102]:
# Recode Ovarian_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Ovarian_cancer'] = data['Ovarian_cancer'].replace([0, 1], 0).replace(2, 1)

In [103]:
# Keep only rows whose Ovarian_cancer value is none of 7, 8 or 9.
data = data[~data['Ovarian_cancer'].isin([7, 8, 9])]

In [104]:
data['Ovarian_cancer'].value_counts()

0    247890
1       202
Name: Ovarian_cancer, dtype: int64

In [105]:
#CNPANC - Ever had cancer: Pancreatic

In [106]:
data['Pancreatic_cancer'] = data.loc[:, 'CNPANC']

In [107]:
# Recode Pancreatic_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Pancreatic_cancer'] = data['Pancreatic_cancer'].replace([0, 1], 0).replace(2, 1)

In [108]:
# Keep only rows whose Pancreatic_cancer value is none of 7, 8 or 9.
data = data[~data['Pancreatic_cancer'].isin([7, 8, 9])]

In [109]:
data['Pancreatic_cancer'].value_counts()

0    248040
1        52
Name: Pancreatic_cancer, dtype: int64

In [110]:
#CNPROS - Ever had cancer: Prostate

In [111]:
data['Prostate_cancer'] = data.loc[:, 'CNPROS']

In [112]:
# Recode Prostate_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Prostate_cancer'] = data['Prostate_cancer'].replace([0, 1], 0).replace(2, 1)

In [113]:
# Keep only rows whose Prostate_cancer value is none of 7, 8 or 9.
data = data[~data['Prostate_cancer'].isin([7, 8, 9])]

In [114]:
data['Prostate_cancer'].value_counts()

0    246942
1      1150
Name: Prostate_cancer, dtype: int64

In [115]:
#CNRECT - Ever had cancer: Rectal

In [116]:
data['Rectal_cancer'] = data.loc[:, 'CNRECT']

In [117]:
# Recode Rectal_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Rectal_cancer'] = data['Rectal_cancer'].replace([0, 1], 0).replace(2, 1)

In [118]:
# Keep only rows whose Rectal_cancer value is none of 7, 8 or 9.
data = data[~data['Rectal_cancer'].isin([7, 8, 9])]

In [119]:
data['Rectal_cancer'].value_counts()

0    248028
1        64
Name: Rectal_cancer, dtype: int64

In [120]:
#CNSKDK - Ever had cancer: Skin (don't know what kind)

In [121]:
data['Skin_cancer'] = data.loc[:, 'CNSKDK']

In [122]:
# Recode Skin_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Skin_cancer'] = data['Skin_cancer'].replace([0, 1], 0).replace(2, 1)

In [123]:
# Keep only rows whose Skin_cancer value is none of 7, 8 or 9.
data = data[~data['Skin_cancer'].isin([7, 8, 9])]

In [124]:
data['Skin_cancer'].value_counts()

0    247191
1       901
Name: Skin_cancer, dtype: int64

In [125]:
#CNSKNM - Ever had cancer: Skin (non-melanoma)

In [126]:
data['Skin1_cancer'] = data.loc[:, 'CNSKNM']

In [127]:
# Recode Skin1_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Skin1_cancer'] = data['Skin1_cancer'].replace([0, 1], 0).replace(2, 1)

In [128]:
# Keep only rows whose Skin1_cancer value is none of 7, 8 or 9.
data = data[~data['Skin1_cancer'].isin([7, 8, 9])]

In [129]:
data['Skin1_cancer'].value_counts()

0    245989
1      2103
Name: Skin1_cancer, dtype: int64

In [130]:
#CNSOFT - Ever had cancer: Soft tissue (muscle). NOTE: this is stored below in a column named 'Skin2_cancer', although it is not a skin cancer.

In [131]:
data['Skin2_cancer'] = data.loc[:, 'CNSOFT']

In [132]:
# Recode Skin2_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Skin2_cancer'] = data['Skin2_cancer'].replace([0, 1], 0).replace(2, 1)

In [133]:
# Keep only rows whose Skin2_cancer value is none of 7, 8 or 9.
data = data[~data['Skin2_cancer'].isin([7, 8, 9])]

In [134]:
data['Skin2_cancer'].value_counts()

0    248052
1        40
Name: Skin2_cancer, dtype: int64

In [135]:
#CNSTOM - Ever had cancer: Stomach

In [136]:
data['Stomach_cancer'] = data.loc[:, 'CNSTOM']

In [137]:
# Recode Stomach_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Stomach_cancer'] = data['Stomach_cancer'].replace([0, 1], 0).replace(2, 1)

In [138]:
# Keep only rows whose Stomach_cancer value is none of 7, 8 or 9.
data = data[~data['Stomach_cancer'].isin([7, 8, 9])]

In [139]:
data['Stomach_cancer'].value_counts()

0    248030
1        62
Name: Stomach_cancer, dtype: int64

In [140]:
#CNTEST - Ever had cancer: Testicular

In [141]:
data['Testicular_cancer'] = data.loc[:, 'CNTEST']

In [142]:
# Recode Testicular_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Testicular_cancer'] = data['Testicular_cancer'].replace([0, 1], 0).replace(2, 1)

In [143]:
# Keep only rows whose Testicular_cancer value is none of 7, 8 or 9.
data = data[~data['Testicular_cancer'].isin([7, 8, 9])]

In [144]:
data['Testicular_cancer'].value_counts()

0    248014
1        78
Name: Testicular_cancer, dtype: int64

In [145]:
#CNTHRO - Ever had cancer: Throat-pharynx

In [146]:
data['Throat_cancer'] = data.loc[:, 'CNTHRO']

In [147]:
# Recode Throat_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Throat_cancer'] = data['Throat_cancer'].replace([0, 1], 0).replace(2, 1)

In [148]:
# Keep only rows whose Throat_cancer value is none of 7, 8 or 9.
data = data[~data['Throat_cancer'].isin([7, 8, 9])]

In [149]:
data['Throat_cancer'].value_counts()

0    248012
1        80
Name: Throat_cancer, dtype: int64

In [150]:
#CNTHYR - Ever had cancer: Thyroid

In [151]:
data['Thyroid_cancer'] = data.loc[:, 'CNTHYR']

In [152]:
# Recode Thyroid_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Thyroid_cancer'] = data['Thyroid_cancer'].replace([0, 1], 0).replace(2, 1)

In [153]:
# Keep only rows whose Thyroid_cancer value is none of 7, 8 or 9.
data = data[~data['Thyroid_cancer'].isin([7, 8, 9])]

In [154]:
data['Thyroid_cancer'].value_counts()

0    247827
1       265
Name: Thyroid_cancer, dtype: int64

In [155]:
#CNUTER - Ever had cancer: Uterine

In [156]:
data['Uterine_cancer'] = data.loc[:, 'CNUTER']

In [157]:
# Recode Uterine_cancer in one chained pass: 0 and 1 -> 0, then 2 -> 1.
data['Uterine_cancer'] = data['Uterine_cancer'].replace([0, 1], 0).replace(2, 1)

In [None]:
# Keep only rows whose Uterine_cancer value is none of 7, 8 or 9.
data = data[~data['Uterine_cancer'].isin([7, 8, 9])]

In [None]:
data['Uterine_cancer'].value_counts()

0    247705
1       387
Name: Uterine_cancer, dtype: int64

In [None]:
#Now I'm finished turning all of the cancer variables into binary numerical ones
print(data.columns.tolist())

['YEAR', 'SERIAL', 'STRATA', 'PSU', 'NHISHID', 'HHWEIGHT', 'LIVINGQTR', 'PERNUM', 'NHISPID', 'HHX', 'FMX', 'PX', 'PERWEIGHT', 'SAMPWEIGHT', 'FWEIGHT', 'SUPP1WT', 'ASTATFLG', 'CSTATFLG', 'AGE', 'SEX', 'SEXORIEN', 'MARST', 'RACENEW', 'NOWAF', 'ARMFCC', 'EDUC', 'EMPSTAT', 'OCC', 'OCC1995', 'IND1995', 'HOURSWRK', 'PAIDSICK', 'CLASSWK2', 'OCCUPN204', 'INDSTRN204', 'POORYN', 'INCFAM07ON', 'WELFMO', 'GOTWELF', 'GOTNEWELF', 'GOTSSIWHY', 'GOTWAGE', 'GOTSEMP', 'STAMPMO', 'FSBALANC', 'FSHUNGRY', 'FSSTATDET', 'OWNERSHIP', 'LOWRENT', 'HEALTH', 'BMI', 'NBHDTRUST', 'DELAYCOST', 'YBARMEDS', 'YBARMENTAL', 'WORMEDBILL', 'YDELAYMEDYR', 'HIPCONAFFORD', 'HCSATIS12M', 'HIUNABLEPAY', 'HINOTCOVE', 'HIPRIVATEE', 'HINOTCOV', 'ADDEV', 'ANGIPECEV', 'ARTHGLUPEV', 'ASTHMAEV', 'AUTISMEV', 'CHEARTDIEV', 'CHOLHIGHEV', 'CNBLAD', 'CNBLOD', 'CNBONE', 'CNBRAN', 'CNBRES', 'CNCERV', 'CNCOLN', 'CNESOP', 'CNGALL', 'CNKIDN', 'CNLARX', 'CNLEUK', 'CNLIVR', 'CNLUNG', 'CNLYMP', 'CNMELN', 'CNMOTH', 'CNOTHR', 'CNOVAR', 'CNPANC', 'CN

In [None]:
# Sub-frame holding only the 30 per-cancer indicator columns
# (each recoded above to 0 = no cancer, 1 = cancer).
cols = data.loc[:, [
    'bladder_cancer',
    'blood_cancer',
    'bone_cancer',
    'brain_cancer',
    'breast_cancer',
    'cervix_cancer',
    'colon_cancer',
    'esophagus_cancer',
    'gallbladder_cancer',
    'kidney_cancer',
    'larynx_cancer',
    'leukemia_cancer',
    'liver_cancer',
    'lung_cancer',
    'Lymphoma_cancer',
    'Melanoma_cancer',
    'Mouth_cancer',
    'Other_cancer',
    'Ovarian_cancer',
    'Pancreatic_cancer',
    'Prostate_cancer',
    'Rectal_cancer',
    'Skin_cancer',
    'Skin1_cancer',
    'Skin2_cancer',
    'Stomach_cancer',
    'Testicular_cancer',
    'Throat_cancer',
    'Thyroid_cancer',
    'Uterine_cancer',
]]

In [None]:
# My target column will be cancer_count. This cell shows the share of rows with at least one
# per-cancer indicator set (about 3.9% in the recorded run). Using "any cancer" as the target
# gives a less imbalanced positive class than a single cancer type such as breast cancer would.
# A row counts as positive when the sum of its indicator columns is non-zero.
int((cols.sum(axis=1) != 0).sum()) / data.shape[0]

0.03891298389307193

In [170]:
#This function makes it so that anyone who tests positive for cancer will be represented by the value 1. 
#Those who test negative for cancer will be represented by the value 0.
def my_list(data):
    """Collapse the per-cancer indicator fields of one record into a single flag.

    Parameters
    ----------
    data : mapping-like (for example one DataFrame row passed by ``apply(axis=1)``)
        Must be indexable by the ``*_cancer`` field names listed below.

    Returns
    -------
    int
        1 if at least one of the listed fields equals 1, otherwise 0.
    """
    indicator_fields = (
        'bladder_cancer', 'blood_cancer', 'bone_cancer',
        'brain_cancer', 'breast_cancer', 'cervix_cancer',
        'colon_cancer', 'esophagus_cancer', 'gallbladder_cancer',
        'kidney_cancer', 'larynx_cancer', 'leukemia_cancer',
        'liver_cancer', 'lung_cancer', 'Lymphoma_cancer',
        'Melanoma_cancer', 'Mouth_cancer', 'Other_cancer',
        'Ovarian_cancer', 'Pancreatic_cancer', 'Prostate_cancer',
        'Rectal_cancer', 'Skin_cancer', 'Skin1_cancer',
        'Skin2_cancer', 'Stomach_cancer', 'Testicular_cancer',
        'Throat_cancer', 'Thyroid_cancer', 'Uterine_cancer',
    )
    # Fields are checked in the order above; the scan stops at the first one equal to 1.
    return 1 if any(data[field] == 1 for field in indicator_fields) else 0

In [171]:
data['cancer_count'] = data.apply(my_list, axis=1)

KeyError: 'bladder_cancer'

In [None]:
# Share of rows whose cancer_count flag is 1, computed from the frame itself rather than from
# hard-coded counts, so the denominator is always the current number of rows.
data['cancer_count'].mean()

In [None]:
data[['cancer_count']].head()

In [None]:
# The per-cancer indicator columns are no longer needed once cancer_count exists, so remove them.
# `data` is rebound to the result (no inplace=True) and errors='ignore' skips columns that are
# already gone, so re-executing this cell does not raise.
data = data.drop(
    columns=[
        'bladder_cancer', 'blood_cancer', 'bone_cancer', 'brain_cancer', 'breast_cancer',
        'cervix_cancer', 'colon_cancer', 'esophagus_cancer', 'gallbladder_cancer',
        'kidney_cancer', 'larynx_cancer', 'leukemia_cancer', 'liver_cancer', 'lung_cancer',
        'Lymphoma_cancer', 'Melanoma_cancer', 'Mouth_cancer', 'Other_cancer', 'Ovarian_cancer',
        'Pancreatic_cancer', 'Prostate_cancer', 'Rectal_cancer', 'Skin_cancer', 'Skin1_cancer',
        'Skin2_cancer', 'Stomach_cancer', 'Testicular_cancer', 'Throat_cancer', 'Thyroid_cancer',
        'Uterine_cancer',
    ],
    errors='ignore',
)

In [None]:
#Here is my data with the column cancer_count which is my y-value.
#Cancer_count variable is the last column on the right side
data.head()

In [None]:
#My data is exported as cancer_data.csv
# NOTE(review): with the default index=True, to_csv also writes the row index as an unnamed first
# column; pass index=False if downstream readers do not expect that column.
data.to_csv('cancer_data.csv') 