In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.io as pio
import plotly.express as px
from IPython.display import Image
import seaborn as sns
import matplotlib.pyplot as plt

## Some general information on our Data

In [None]:
x_train = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/x_train.csv"))
x_test = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/x_test.csv"))
y_train = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/y_train.csv"))
print('The shape of x_train is', x_train.shape)
print('The shape of y_train is', y_train.shape)
print('The shape of x_test is', x_test.shape)

The shape of x_train is (328135, 321)
The shape of y_train is (328135, 1)
The shape of x_test is (109379, 321)


In [None]:
labels = y_train["_MICHD"]
print("The only values that appear in the labels are", np.unique(labels), ".") # only 0 or 1 appear in the labels, as expected
print("y_train has", labels.isna().sum(), "missing values.") # y_train doesn't have missing values
print("The percentage of positive labels is", sum(labels) / len(labels) * 100, "%.")

The only values that appear in the labels are [0 1] .
y_train has 0 missing values.
The percentage of positive labels is 8.830207079403294 %.


## Analysis of some "interesting" columns

Some interesting columns could be the following (taken from Kaggle): 'DIABETE3', 'BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3', 'CVDINFR4', 'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK', '_AGE_G', 'EDUCA', 'INCOME2'.


In [121]:
df = x_train.assign(label = y_train) # adds labels as last column of x_train to have complete training data
df.shape # 322 columns as expected
l = df.columns
print(l)

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM',
       ...
       '_PASTAE1', '_LMTACT1', '_LMTWRK1', '_LMTSCL1', '_RFSEAT2', '_RFSEAT3',
       '_FLSHOT6', '_PNEUMO2', '_AIDTST3', 'label'],
      dtype='object', length=322)


In [129]:
df_subset = df[['DIABETE3','BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3',
                'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK',
                '_AGE_G', 'EDUCA', 'INCOME2', 'label']]
df_subset.shape
df_subset.isna().sum()

DIABETE3        5
BPHIGH4         1
BLOODCHO        0
_BMI5       27073
CVDSTRK3        0
EXERANY2    26205
GENHLTH         2
PHYSHLTH        1
DIFFWALK     9149
_AGE_G          0
EDUCA           0
INCOME2      2471
label           0
dtype: int64

In [None]:
df_subset = df_subset.dropna() # drop rows with NaN values
print(df_subset.shape) # we still have a lot of rows

(282617, 13)


### DIABETE3
Ever told you had diabetes?
For this column, we see that among those who had diabetes, 22% had a heart attack but among those who didn't, only 6% had a heart attack, suggesting diabetes could be a useful predictor.

In [None]:
print("The values in the DIABETE3 column are:", np.unique(df_subset["DIABETE3"]), ".") # we should just keep 1 = YES and 3 = NO
diabete = df_subset["DIABETE3"]                                                        # set everything else to 9 and drop in the future
diabete[diabete > 3] = 9
diabete[diabete == 3] = 0
diabete[diabete == 2] = 9
pd.crosstab(diabete, df_subset["label"], normalize = "index")

The values in the DIABETE3 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 2] = 9


label,0,1
DIABETE3,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.930487,0.069513
1.0,0.773996,0.226004
9.0,0.88366,0.11634


### BPHIGH4
Ever told by a doctor you had high blood pressure? For this column as well, we see that among those who had diabetes, 17% had a heart attack but among those who didn't, only 4% had a heart attack, suggesting high blood pressure could be a useful predictor.

In [None]:
print("The values in the HIGHBP4 column are:", np.unique(df_subset["BPHIGH4"]), ".") # we should just keep 1 = YES and 3 = NO
highbp = df_subset["BPHIGH4"]                                                        # set everything else to 9 and drop in the future
highbp[highbp > 3] = 9
highbp[highbp == 3] = 0
highbp[highbp == 2] = 9
pd.crosstab(highbp, df_subset["label"], normalize = "index")

The values in the HIGHBP4 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 2] = 9


label,0,1
BPHIGH4,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.961325,0.038675
1.0,0.833328,0.166672
9.0,0.941741,0.058259


### _BMI5
Body Mass Index. This column takes values in the positive reals. Is there a correlation between BMI (high fat percentage) and heart attacks?
It seems the correlation is somewhat weak as 8% of non obese people had a heart attack and 11% of obese people had one.

In [130]:
bmi = df_subset["_BMI5"]
print(bmi.describe()) # 75% percentile at 31, max at 98
bmi[bmi <= 30] = 0 # bmi 30 is the threshold for obesity.
bmi[bmi > 30] = 1
pd.crosstab(bmi, df_subset["label"], normalize = "index")

count    301062.000000
mean         28.033007
std           6.670068
min          12.020000
25%          23.730000
50%          26.930000
75%          30.900000
max          97.650000
Name: _BMI5, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi[bmi <= 30] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi[bmi > 30] = 1


label,0,1
_BMI5,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.919001,0.080999
1.0,0.885863,0.114137


### CVDSTRK3
Ever told you had a stroke? Seems correlation is quite high (makes sense), table below.

In [131]:
print("The values in the CVDSTRK3 column are:", np.unique(df_subset["CVDSTRK3"]), ".") # we should just keep 1 = YES and 2 = NO
stroke = df_subset["CVDSTRK3"]                                                        # set everything else to 9 and drop in the future
stroke[stroke > 2] = 9
stroke[stroke == 2] = 0
pd.crosstab(stroke, df_subset["label"], normalize = "index")

The values in the CVDSTRK3 column are: [1 2 7 9] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke[stroke > 2] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke[stroke == 2] = 0


label,0,1
CVDSTRK3,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.924282,0.075718
1,0.628535,0.371465
9,0.68521,0.31479
