In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import plotly
import plotly.io as pio
import plotly.express as px
from IPython.display import Image
import seaborn as sns
import matplotlib.pyplot as plt

## Some general information on our Data

In [3]:
x_train = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/x_train.csv"))
x_test = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/x_test.csv"))
y_train = pd.DataFrame(pd.read_csv("drive/MyDrive/MLProject1/y_train.csv"))
print('The shape of x_train is', x_train.shape)
print('The shape of y_train is', y_train.shape)
print('The shape of x_test is', x_test.shape)

The shape of x_train is (328135, 321)
The shape of y_train is (328135, 1)
The shape of x_test is (109379, 321)


In [4]:
labels = y_train["_MICHD"]
print("The only values that appear in the labels are", np.unique(labels), ".") # only 0 or 1 appear in the labels, as expected
print("y_train has", labels.isna().sum(), "missing values.") # y_train doesn't have missing values
print("The percentage of positive labels is", sum(labels) / len(labels) * 100, "%.")

The only values that appear in the labels are [0 1] .
y_train has 0 missing values.
The percentage of positive labels is 8.830207079403294 %.


## Analysis of some "interesting" columns

Some interesting columns could be the following (taken from Kaggle): 'DIABETE3', 'BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3', 'CVDINFR4', 'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK', '_AGE_G', 'EDUCA', 'INCOME2'.


In [57]:
df = x_train.assign(label = y_train) # adds labels as last column of x_train to have complete training data
df.shape # 322 columns as expected
l = df.columns
print(l)

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM',
       ...
       '_PASTAE1', '_LMTACT1', '_LMTWRK1', '_LMTSCL1', '_RFSEAT2', '_RFSEAT3',
       '_FLSHOT6', '_PNEUMO2', '_AIDTST3', 'label'],
      dtype='object', length=322)


In [58]:
df_subset = df[['DIABETE3','BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3',
                'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK',
                '_AGE_G', 'EDUCA', 'INCOME2', 'label']]
df_subset.shape
df_subset.isna().sum()

DIABETE3        5
BPHIGH4         1
BLOODCHO        0
_BMI5       27073
CVDSTRK3        0
EXERANY2    26205
GENHLTH         2
PHYSHLTH        1
DIFFWALK     9149
_AGE_G          0
EDUCA           0
INCOME2      2471
label           0
dtype: int64

In [59]:
df_subset = df_subset.dropna() # drop rows with NaN values
print("The shape of the dataset is now:", df_subset.shape, ".") # we still have a lot of rows

The shape of the dataset is now: (282617, 13) .


### DIABETE3
Ever told you had diabetes?
For this column, we see that among those who had diabetes, 22% had a heart attack but among those who didn't, only 6% had a heart attack, suggesting diabetes could be a useful predictor.

In [60]:
print("The values in the DIABETE3 column are:", np.unique(df_subset["DIABETE3"]), ".") # we should just keep 1 = YES and 3 = NO
diabete = df_subset["DIABETE3"]                                                        # set everything else to 9 and drop in the future
diabete[diabete > 3] = 9
diabete[diabete == 3] = 0
diabete[diabete == 2] = 9
pd.crosstab(diabete, df_subset["label"], normalize = "index")

The values in the DIABETE3 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame


label,0,1
DIABETE3,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.930487,0.069513
1.0,0.773996,0.226004
9.0,0.88366,0.11634


### BPHIGH4
Ever told by a doctor you had high blood pressure? For this column as well, we see that among those who had diabetes, 17% had a heart attack but among those who didn't, only 4% had a heart attack, suggesting high blood pressure could be a useful predictor.

In [61]:
print("The values in the HIGHBP4 column are:", np.unique(df_subset["BPHIGH4"]), ".") # we should just keep 1 = YES and 3 = NO
highbp = df_subset["BPHIGH4"]                                                        # set everything else to 9 and drop in the future
highbp[highbp > 3] = 9
highbp[highbp == 3] = 0
highbp[highbp == 2] = 9
pd.crosstab(highbp, df_subset["label"], normalize = "index")

The values in the HIGHBP4 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 2] = 9


label,0,1
BPHIGH4,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.961325,0.038675
1.0,0.833328,0.166672
9.0,0.941741,0.058259


### _BMI5
Body Mass Index. This column takes values in the positive reals. Is there a correlation between BMI (high fat percentage) and heart attacks?
It seems the correlation is somewhat weak as 8% of non obese people had a heart attack and 11% of obese people had one.

In [62]:
bmi = df_subset["_BMI5"]
print(bmi.describe()) # 75% percentile at 31, max at 98
bmi[bmi <= 15] = 1
bmi[np.logical_and(bmi > 15, bmi <= 35)] = 0 # healthy range
bmi[bmi > 35] = 1
pd.crosstab(bmi, df_subset["label"], normalize = "index")

count    282617.000000
mean         28.057042
std           6.668281
min          12.020000
25%          23.740000
50%          26.950000
75%          30.910000
max          97.650000
Name: _BMI5, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi[np.logical_and(bmi > 15, bmi <= 35)] = 0 # healthy range
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi[bmi > 35] = 1


label,0,1
_BMI5,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.912893,0.087107
1.0,0.878939,0.121061


### CVDSTRK3
Ever told you had a stroke? Seems correlation is quite high (makes sense), table below.

In [63]:
print("The values in the CVDSTRK3 column are:", np.unique(df_subset["CVDSTRK3"]), ".") # we should just keep 1 = YES and 2 = NO
stroke = df_subset["CVDSTRK3"]                                                        # set everything else to 9 and drop in the future
stroke[stroke > 2] = 9
stroke[stroke == 2] = 0
pd.crosstab(stroke, df_subset["label"], normalize = "index")

The values in the CVDSTRK3 column are: [1 2 7 9] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke[stroke == 2] = 0


label,0,1
CVDSTRK3,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.921714,0.078286
1,0.626434,0.373566
9,0.673016,0.326984


In [104]:
d = {"diabetes" : diabete, "highbp" : highbp, "unhealthybmi" : bmi, "stroke" : stroke, "labels" : df_subset["label"]}
df_4 = pd.DataFrame(data = d)
df_4 = df_4.loc[~(df_4 > 1).any(axis=1)]
print(df_subset.shape)
df_clean = df_4.drop(df_4[df_4['labels'] == 0].sample(frac=0.90).index)
print("The percentage of positive labels is now: ", sum(np.array(df_clean[["labels"]]))/len(np.array(df_clean[["labels"]])), ".")
df_clean.shape # we still have many rows especially considering the simplicity of our data, but a much more balanced dataset

(282617, 13)
The percentage of positive labels is now:  [0.49869024] .


(48864, 5)

Ok, now we have a dataset with only 0 and 1's and still around 50k rows which are more than enough for 4 predictors.

In [113]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
model = LinearRegression()
x = df_clean[["diabetes", "highbp", "unhealthybmi", "stroke"]]
y = df_clean[["labels"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
model.fit(x_train,y_train)
raw_preds = model.predict(x_test)
preds = np.rint(raw_preds)
print("Accuracy of lin reg:", 1-sum(abs(np.array(y_test)-preds))/len(y_test), ".") # not too bad for a simple model and only 4 predictors

Accuracy of lin reg: [0.69812725] .


In [114]:
# just some stats, it's good that all predictions more or less are between 0 and 1
print(max(raw_preds))
print(min(raw_preds))
print(np.mean(raw_preds))
print(np.median(raw_preds))

[1.04169249]
[0.24276673]
0.49640608833940186
0.5684905072577577
