In [None]:
# Mount Google Drive inside the Colab VM; the data cells below read their CSV
# files from paths under "drive/MyDrive/".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.io as pio
import plotly.express as px
from IPython.display import Image
import seaborn as sns
import matplotlib.pyplot as plt

## Some general information on our Data

In [None]:
# Load the training features and labels. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
x_train = pd.read_csv("drive/MyDrive/MLProject1/x_train.csv")
y_train = pd.read_csv("drive/MyDrive/MLProject1/y_train.csv")
# Features and labels are combined row-by-row later on, so they must line up.
assert len(x_train) == len(y_train), "x_train and y_train have different row counts"
print('The shape of x_train is', x_train.shape)
print('The shape of y_train is', y_train.shape)

The shape of x_train is (328135, 321)
The shape of y_train is (328135, 1)


In [None]:
# Basic sanity checks on the target column.
labels = y_train["_MICHD"]
print("The only values that appear in the labels are", np.unique(labels), ".") # only 0 or 1 appear in the labels, as expected
print("y_train has", labels.isna().sum(), "missing values.") # y_train doesn't have missing values
# Series.sum() is vectorized; the builtin sum() would loop over every row in Python.
print("The percentage of positive labels is", labels.sum() / len(labels) * 100, "%.")

The only values that appear in the labels are [0 1] .
y_train has 0 missing values.
The percentage of positive labels is 8.830207079403294 %.


## Analysis of some "interesting" columns

Some interesting columns could be the following (taken from Kaggle): 'DIABETE3', 'BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3', 'CVDINFR4', 'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK', '_AGE_G', 'EDUCA', 'INCOME2'.


In [None]:
# Add the labels as the last column of x_train to get one complete training frame.
# Pass the label *column* (a Series) rather than the whole one-column DataFrame,
# so the assignment does not depend on how pandas handles a frame-valued column.
df = x_train.assign(label = y_train["_MICHD"])
assert df.shape[1] == x_train.shape[1] + 1 # 322 columns as expected
l = df.columns
print(l)

# Position of GENHLTH among the columns (displayed as the cell output).
df.columns.get_loc("GENHLTH")

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM',
       ...
       '_PASTAE1', '_LMTACT1', '_LMTWRK1', '_LMTSCL1', '_RFSEAT2', '_RFSEAT3',
       '_FLSHOT6', '_PNEUMO2', '_AIDTST3', 'label'],
      dtype='object', length=322)


26

In [None]:
# Restrict the analysis to the candidate predictors plus the label.
# .copy() makes df_subset an independent frame rather than a slice of df.
df_subset = df[['DIABETE3','BPHIGH4', 'BLOODCHO', '_BMI5', 'CVDSTRK3',
                'EXERANY2', 'GENHLTH', 'PHYSHLTH', 'DIFFWALK',
                '_AGE_G', 'EDUCA', 'INCOME2', 'label']].copy()
# Number of missing values per selected column (displayed as the cell output).
df_subset.isna().sum()

DIABETE3        5
BPHIGH4         1
BLOODCHO        0
_BMI5       27073
CVDSTRK3        0
EXERANY2    26205
GENHLTH         2
PHYSHLTH        1
DIFFWALK     9149
_AGE_G          0
EDUCA           0
INCOME2      2471
label           0
dtype: int64

In [None]:
# Discard every row that has a NaN in at least one of the selected columns.
df_subset = df_subset.dropna(axis=0, how="any")
# Plenty of rows survive the filter.
print(
    "The shape of the dataset is now:",
    df_subset.shape,
    ".",
)

The shape of the dataset is now: (282617, 13) .


### DIABETE3
Ever told you had diabetes?
For this column, we see that among those who had diabetes, about 23% had a heart attack, but among those who didn't, only about 7% did, suggesting diabetes could be a useful predictor.

In [None]:
print("The values in the DIABETE3 column are:", np.unique(df_subset["DIABETE3"]), ".")
# We just keep 1 = YES and 3 = NO (recoded to 0); everything else is set to 9
# and dropped later on.
# Recode an explicit copy: assigning into df_subset["DIABETE3"] directly raises
# SettingWithCopyWarning and changes df_subset behind the reader's back.
diabete = df_subset["DIABETE3"].copy()
diabete.loc[diabete > 3] = 9
diabete.loc[diabete == 3] = 0
diabete.loc[diabete == 2] = 9
# Row-normalised table: share of each label within every DIABETE3 group.
pd.crosstab(diabete, df_subset["label"], normalize = "index")

The values in the DIABETE3 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabete[diabete == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame


label,0,1
DIABETE3,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.930487,0.069513
1.0,0.773996,0.226004
9.0,0.88366,0.11634


### BPHIGH4
Ever told by a doctor you had high blood pressure? For this column as well, we see that among those who had high blood pressure, 17% had a heart attack, but among those who didn't, only 4% did, suggesting high blood pressure could be a useful predictor.

In [None]:
# The printed name now matches the actual column (BPHIGH4).
print("The values in the BPHIGH4 column are:", np.unique(df_subset["BPHIGH4"]), ".")
# We just keep 1 = YES and 3 = NO (recoded to 0); everything else is set to 9
# and dropped later on.
# Recode an explicit copy: assigning into df_subset["BPHIGH4"] directly raises
# SettingWithCopyWarning and changes df_subset behind the reader's back.
highbp = df_subset["BPHIGH4"].copy()
highbp.loc[highbp > 3] = 9
highbp.loc[highbp == 3] = 0
highbp.loc[highbp == 2] = 9
# Row-normalised table: share of each label within every BPHIGH4 group.
pd.crosstab(highbp, df_subset["label"], normalize = "index")

The values in the HIGHBP4 column are: [1. 2. 3. 4. 7. 9.] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp > 3] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highbp[highbp == 3] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the

label,0,1
BPHIGH4,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.961325,0.038675
1.0,0.833328,0.166672
9.0,0.941741,0.058259


### _BMI5
Body Mass Index. This column takes values in the positive reals. Is there a correlation between BMI (high fat percentage) and heart attacks?
The association seems somewhat weak: about 7% of people with a BMI of at most 25 had a heart attack, versus about 10% of people with a BMI above 25 (25 is the cutoff used in the cell below).

In [None]:
bmi = df_subset["_BMI5"]
print(bmi.describe()) # 75% percentile at 31, max at 98
# Binary indicator: 0 for BMI <= 25, 1 for BMI > 25.
# The comparison builds a new Series, so df_subset["_BMI5"] keeps its raw values
# (no SettingWithCopyWarning, and the cell can be re-run safely).
# df_subset has no NaN left at this point, so every row lands in one of the two groups.
bmi = (bmi > 25).astype(float)
# Row-normalised table: share of each label within every BMI group.
pd.crosstab(bmi, df_subset["label"], normalize = "index")

count    282617.000000
mean         28.057042
std           6.668281
min          12.020000
25%          23.740000
50%          26.950000
75%          30.910000
max          97.650000
Name: _BMI5, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi[bmi > 25] = 1


label,0,1
_BMI5,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.931455,0.068545
1.0,0.897295,0.102705


### CVDSTRK3
Ever told you had a stroke? Seems correlation is quite high (makes sense), table below.

In [None]:
print("The values in the CVDSTRK3 column are:", np.unique(df_subset["CVDSTRK3"]), ".")
# We just keep 1 = YES and 2 = NO (recoded to 0); everything else is set to 9
# and dropped later on.
# Recode an explicit copy: assigning into df_subset["CVDSTRK3"] directly raises
# SettingWithCopyWarning and changes df_subset behind the reader's back.
stroke = df_subset["CVDSTRK3"].copy()
stroke.loc[stroke > 2] = 9
stroke.loc[stroke == 2] = 0
# Row-normalised table: share of each label within every CVDSTRK3 group.
pd.crosstab(stroke, df_subset["label"], normalize = "index")

The values in the CVDSTRK3 column are: [1 2 7 9] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke[stroke == 2] = 0


label,0,1
CVDSTRK3,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.921714,0.078286
1,0.626434,0.373566
9,0.673016,0.326984


### GENHLTH
Would you say that in general your health is... (vague). \
1 = Excellent \
2 = Very good \
3 = Good \
4 = Fair \
5 = Poor \

As self-reported health gets worse, the percentage of heart attacks increases, as expected.


In [None]:
print("The values in the GENHLTH column are:", np.unique(df_subset["GENHLTH"]), ".")
# We keep the ratings 1..5 as they are; every larger code is set to 9 and
# dropped later on.
# Recode an explicit copy so that df_subset["GENHLTH"] keeps its raw values
# and the cell can be re-run safely.
health = df_subset["GENHLTH"].copy()
health.loc[health > 5] = 9
# Row-normalised table: share of each label within every GENHLTH group.
pd.crosstab(health, df_subset["label"], normalize = "index")

The values in the GENHLTH column are: [1. 2. 3. 4. 5. 7. 9.] .


label,0,1
GENHLTH,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.978743,0.021257
2.0,0.955925,0.044075
3.0,0.904246,0.095754
4.0,0.801031,0.198969
5.0,0.675818,0.324182
9.0,0.835277,0.164723


### INCOME2
Is your annual household income from all sources...
1 = below 10k \
2 = 10k - 15k \
... \
8 = more than 75k

In [None]:
print("The values in the INCOME2 column are:", np.unique(df_subset["INCOME2"]), ".")
# We keep the income brackets 1..8 as they are; every larger code (77 and 99
# appear in the data) is collapsed to 9 so it can be dropped later on.
# Recode an explicit copy so that df_subset["INCOME2"] keeps its raw values
# and the cell can be re-run safely.
income = df_subset["INCOME2"].copy()
income.loc[income > 8] = 9
# Row-normalised table: share of each label within every INCOME2 group.
pd.crosstab(income, df_subset["label"], normalize = "index")

The values in the INCOME2 column are: [ 1.  2.  3.  4.  5.  6.  7.  8. 77. 99.] .


label,0,1
INCOME2,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.861977,0.138023
2.0,0.830853,0.169147
3.0,0.861586,0.138414
4.0,0.87522,0.12478
5.0,0.888957,0.111043
6.0,0.909104,0.090896
7.0,0.927632,0.072368
8.0,0.950325,0.049675
9.0,0.898962,0.101038


### _AGE_G
Age groups. Group them as old (> 65) and not old for now and see what happens.

In [None]:
print("The values in the _AGE_G column are:", np.unique(df_subset["_AGE_G"]), ".")
# Binary "old" indicator: age group 6 -> 1, every younger group -> 0.
# Recode an explicit copy: assigning into df_subset["_AGE_G"] directly raises
# SettingWithCopyWarning and changes df_subset behind the reader's back.
age = df_subset["_AGE_G"].copy()
age.loc[age < 6] = 0
age.loc[age == 6] = 1
# Row-normalised table: share of each label within the two age groups.
pd.crosstab(age, df_subset["label"], normalize = "index")

The values in the _AGE_G column are: [1 2 3 4 5 6] .


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age[age == 6] = 1


label,0,1
_AGE_G,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.95212,0.04788
1,0.829866,0.170134


### PHYSHLTH
Now thinking about your physical health, which includes physical illness and injury, for how many days during the past
30 days was your physical health not good?
1-30: number of days
88: none (which means always healthy)
Drop the rest.

In [None]:
# 88 encodes "none", i.e. zero days of bad physical health.
# Recode an explicit copy so that df_subset["PHYSHLTH"] keeps its raw values
# and the cell can be re-run safely.
days = df_subset["PHYSHLTH"].copy()
days.loc[days == 88] = 0
# Row-normalised table: share of each label for every number of days.
pd.crosstab(days, df_subset["label"], normalize = "index")

label,0,1
PHYSHLTH,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.938332,0.061668
1.0,0.949021,0.050979
2.0,0.926123,0.073877
3.0,0.90628,0.09372
4.0,0.890046,0.109954
5.0,0.888792,0.111208
6.0,0.844183,0.155817
7.0,0.893957,0.106043
8.0,0.857303,0.142697
9.0,0.87037,0.12963


In [None]:
# Gather the recoded predictors and the label into one frame.
d = {"diabetes" : diabete, "highbp" : highbp, "stroke" : stroke,
     "health": health, "old" : age, "unhealthybmi": bmi, "labels" : df_subset["label"]}
df_temp = pd.DataFrame(data = d)
# Every column so far is coded in 0..5, except for the placeholder 9: drop rows containing it.
df_temp = df_temp.loc[~(df_temp > 5).any(axis=1)]
# days is added only now because its valid range is 0..30 rather than 0..5.
df_temp = df_temp.assign(days = days)
df_temp = df_temp.loc[~(df_temp > 30).any(axis=1)]
print(df_temp.shape)
# Undersample the negative class to rebalance the data. The seed is fixed so that
# the same rows are dropped (and the same model is trained) on every run.
negatives = df_temp[df_temp['labels'] == 0]
df_clean = df_temp.drop(negatives.sample(frac=0.92, random_state=42).index) # drop some negative labels
# Report an actual percentage (a scalar), not a fraction wrapped in an array.
print("The percentage of positive labels is now: ", df_clean["labels"].mean() * 100, "%.")
df_clean.shape # we still have many rows especially considering the simplicity of our data, but a much more balanced dataset

(263699, 8)
The percentage of positive labels is now:  [0.5493529] .


(42652, 8)

Ok, now we have a roughly balanced dataset of small integer-coded features with about 43k rows, which is more than enough for the 6 predictors used below.

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
linreg = LinearRegression()
x = df_clean[["diabetes", "highbp", "days", "stroke", "health", "old"]]
# Use a 1-D target (Series): scikit-learn expects y of shape (n_samples,), and a
# one-column DataFrame triggers a DataConversionWarning in classifiers.
y = df_clean["labels"]
# NOTE: from here on x_train / y_train / x_test / y_test refer to this split of
# df_clean, no longer to the raw frames loaded at the top of the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
linreg.fit(x_train, y_train)
linreg_raw_preds = linreg.predict(x_test)
# Round to the nearest class and clip, because a linear model can predict outside [0, 1].
linreg_preds = np.clip(np.rint(linreg_raw_preds), 0, 1)
# Accuracy = share of exact matches (vectorized, returns a scalar).
print("Accuracy of lin reg:", np.mean(linreg_preds == np.asarray(y_test)), ".") # not too bad for a simple model and only 6 predictors

Accuracy of lin reg: [0.75745951] .


In [None]:
# L2-regularised logistic regression on the same train/test split.
logreg = LogisticRegression(penalty = "l2", verbose = 1)
# fit() expects a 1-D target; ravel() flattens a one-column frame (and leaves a
# Series unchanged), which avoids the column-vector DataConversionWarning.
logreg.fit(x_train, np.ravel(y_train))
# Mean accuracy on the held-out part of the split.
score = logreg.score(x_test, y_test)
print(score)

  y = column_or_1d(y, warn=True)


0.7588803637396988


# Analyze test data

In [None]:
# NOTE: x_test now refers to the competition test set (it no longer holds the
# held-out part of the train/test split used to score the models above).
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
x_test = pd.read_csv("drive/MyDrive/MLProject1/x_test.csv")
print("Shape of test data:", x_test.shape, ".")
print("Missing values in x_test:", x_test.isna().sum().sum(), ".")
# .copy() gives an independent frame, so recoding it later neither touches
# x_test nor raises SettingWithCopyWarning.
test_sub = x_test[['DIABETE3','BPHIGH4', 'CVDSTRK3',
                'GENHLTH','_AGE_G', "PHYSHLTH"]].copy()
# Drop bmi from the predictions? That way there are only 2 missing values --> impute them.
print("Missing values in subset of x_test:\n", test_sub.isna().sum())

Shape of test data: (109379, 321) .
Missing values in x_test: 15724379 .
Missing values in subset of x_test:
 DIABETE3    2
BPHIGH4     0
CVDSTRK3    0
GENHLTH     0
_AGE_G      0
PHYSHLTH    0
dtype: int64


In [None]:
# Apply to the test features the same recoding that was used for training.
# Test rows cannot be dropped (a prediction is needed for each one), so missing,
# "don't know" and "refused" answers are replaced by the placeholder -1.
# NOTE(review): the model never saw -1 during training; imputing a typical value
# instead may work better - to be evaluated.
test_sub = (
    test_sub
    .rename(columns={"BPHIGH4": "highbp", "CVDSTRK3": "stroke",
                     "DIABETE3": "diabetes", "GENHLTH": "health",
                     "PHYSHLTH": "days", "_AGE_G": "old"})
    # Same column order as the one used to fit the model.
    [["diabetes", "highbp", "days", "stroke", "health", "old"]]
    .fillna(-1)
)

# All assignments go through .loc on the frame itself: chained indexing such as
# test_sub["col"][mask] = v raises SettingWithCopyWarning and is a silent no-op
# under pandas Copy-on-Write.

# days: 88 means "none" -> 0, exactly as in training (it was previously turned
# into -1 by the blanket "> 30" rule); remaining codes above 30 are not day counts.
test_sub.loc[test_sub["days"] == 88, "days"] = 0
test_sub.loc[test_sub["days"] > 30, "days"] = -1

# diabetes: 1 = yes, 3 = no -> 0, anything else -> placeholder.
test_sub.loc[test_sub["diabetes"] > 3, "diabetes"] = -1
test_sub.loc[test_sub["diabetes"] == 3, "diabetes"] = 0
test_sub.loc[test_sub["diabetes"] == 2, "diabetes"] = -1

# highbp: 1 = yes, 3 = no -> 0, anything else -> placeholder.
# (The two targets used to be swapped, so "no" became -1 and the invalid codes
# became 0, contradicting the training recoding.)
test_sub.loc[test_sub["highbp"] > 3, "highbp"] = -1
test_sub.loc[test_sub["highbp"] == 3, "highbp"] = 0
test_sub.loc[test_sub["highbp"] == 2, "highbp"] = -1

# stroke: 1 = yes, 2 = no -> 0, anything else -> placeholder.
test_sub.loc[test_sub["stroke"] > 2, "stroke"] = -1
test_sub.loc[test_sub["stroke"] == 2, "stroke"] = 0

# health: keep the ratings 1..5, anything else -> placeholder.
test_sub.loc[test_sub["health"] > 5, "health"] = -1

# old: age group 6 -> 1, every younger group -> 0.
test_sub.loc[test_sub["old"] < 6, "old"] = 0
test_sub.loc[test_sub["old"] == 6, "old"] = 1

# Predict with the logistic regression and keep a raw copy of the predictions.
y_test_preds = logreg.predict(test_sub)
np.savetxt("y_test_preds.csv", y_test_preds, delimiter=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_sub[test_sub.isna()] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_sub[test_sub.isna()] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_sub[test_sub > 30] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [None]:
# Build the submission table: ids are the 1-based row positions of the test
# set, paired with the predicted labels.
submission = pd.DataFrame(
    {
        "Id": x_test.index + 1,
        "Prediction": y_test_preds,
    }
)
# Write it next to the input data on Drive, without the DataFrame index.
submission.to_csv(
    'drive/MyDrive/MLProject1/submission.csv',
    index=False,
)

In [None]:
# Reload the full training labels (y_train was rebound by the train/test split).
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
y_train = pd.read_csv("drive/MyDrive/MLProject1/y_train.csv")
# Sanity check: total number of rows across the training and test sets.
len(y_train)+len(y_test_preds)

437514