In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [6]:
dataframe = pd.read_csv('./dataset/Hypertension-risk-model-main.csv')
dataframe.head()


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
dataframe.describe()
# Attributes:
# male (gender)
# age (age of the individual)
# currentSmoker (smoking status)
# cigsPerDay (number of cigarettes smoked per day)
# BPMeds (blood pressure medication usage)
# diabetes (diabetes status)
# totChol (total cholesterol level)
# sysBP (systolic blood pressure)
# diaBP (diastolic blood pressure)
# BMI (body mass index)
# heartRate (heart rate)
# glucose (glucose level)
# Risk (hypertension risk status)


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
count,4240.0,4240.0,4240.0,4211.0,4187.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,0.494104,9.005937,0.029615,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.310613
std,0.495027,8.572942,0.500024,11.922462,0.169544,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.462799
min,0.0,32.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,1.0
max,1.0,70.0,1.0,70.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [20]:
X = dataframe.drop(columns=['Risk'], axis=1)
y = dataframe['Risk']

dataframe['BPMeds'] = dataframe['BPMeds'].fillna(0)



In [23]:
currentSigsUnique = dataframe['currentSmoker'].unique()
currentSigsUnique


array([0, 1], dtype=int64)

In [24]:
cigsPerDay = dataframe['cigsPerDay'].unique()
cigsPerDay


array([ 0., 20., 30., 23., 15.,  9., 10.,  5., 35., 43.,  1., 40.,  3.,
        2., nan, 12.,  4., 18., 25., 60., 14., 45.,  8., 50., 13., 11.,
        7.,  6., 38., 29., 17., 16., 19., 70.])

In [26]:
cigsValueCounts = dataframe['cigsPerDay'].value_counts(dropna=False)
cigsValueCounts


cigsPerDay
0.0     2145
20.0     734
30.0     218
15.0     210
10.0     143
9.0      130
5.0      121
3.0      100
40.0      80
1.0       67
43.0      56
25.0      55
NaN       29
35.0      22
2.0       18
6.0       18
7.0       12
8.0       11
60.0      11
4.0        9
18.0       8
17.0       7
50.0       6
23.0       6
11.0       5
45.0       3
13.0       3
12.0       3
16.0       3
14.0       2
19.0       2
38.0       1
29.0       1
70.0       1
Name: count, dtype: int64

In [27]:
# CLEANED UP CIGS PER DAY COLUMN
dataframe['cigsPerDay'] = dataframe['cigsPerDay'].fillna(0.0)
cigsValueCounts = dataframe['cigsPerDay'].value_counts()
cigsValueCounts


cigsPerDay
0.0     2174
20.0     734
30.0     218
15.0     210
10.0     143
9.0      130
5.0      121
3.0      100
40.0      80
1.0       67
43.0      56
25.0      55
35.0      22
2.0       18
6.0       18
7.0       12
8.0       11
60.0      11
4.0        9
18.0       8
17.0       7
50.0       6
23.0       6
11.0       5
45.0       3
13.0       3
12.0       3
16.0       3
14.0       2
19.0       2
38.0       1
29.0       1
70.0       1
Name: count, dtype: int64

In [7]:
uniqueValuesBPMeds = dataframe['BPMeds'].unique()
uniqueValuesBPMeds


array([ 0.,  1., nan])

In [8]:
BPMedsValueCounts = dataframe['BPMeds'].value_counts(dropna=False)
BPMedsValueCounts


BPMeds
0.0    4063
1.0     124
NaN      53
Name: count, dtype: int64

In [9]:
dataframe['BPMeds'] = dataframe['BPMeds'].fillna(0.0)


In [12]:
BPMedsValueCounts = dataframe['BPMeds'].value_counts()
BPMedsValueCounts


BPMeds
0.0    4116
1.0     124
Name: count, dtype: int64

In [14]:
uniqueValuesDiabetes = dataframe['diabetes'].value_counts(dropna=False)
uniqueValuesDiabetes


diabetes
0    4131
1     109
Name: count, dtype: int64

In [16]:
uniqueValuesTotChol = dataframe['totChol'].unique()
uniqueValuesTotChol


array([195., 250., 245., 225., 285., 228., 205., 313., 260., 254., 247.,
       294., 332., 226., 221., 232., 291., 190., 185., 234., 215., 270.,
       272., 295., 209., 175., 214., 257., 178., 233., 180., 243., 237.,
        nan, 311., 208., 252., 261., 179., 194., 267., 216., 240., 266.,
       255., 220., 235., 212., 223., 300., 302., 248., 200., 189., 258.,
       202., 213., 183., 274., 170., 210., 197., 326., 188., 256., 244.,
       193., 239., 296., 269., 275., 268., 265., 173., 273., 290., 278.,
       264., 282., 241., 288., 222., 303., 246., 150., 187., 286., 154.,
       279., 293., 259., 219., 230., 320., 312., 165., 159., 174., 242.,
       301., 167., 308., 325., 229., 236., 224., 253., 464., 171., 186.,
       227., 249., 176., 163., 191., 263., 196., 310., 164., 135., 238.,
       207., 342., 287., 182., 352., 284., 217., 203., 262., 129., 155.,
       323., 206., 283., 319., 304., 340., 328., 280., 368., 218., 276.,
       339., 231., 198., 177., 201., 277., 184., 19

In [17]:
nanValuesTotChol = dataframe['totChol'].value_counts(dropna=False)
nanValuesTotChol


totChol
240.0    85
220.0    70
260.0    62
210.0    61
232.0    59
         ..
126.0     1
365.0     1
362.0     1
405.0     1
119.0     1
Name: count, Length: 249, dtype: int64

In [18]:
# putting values to NaN totCholColumn
dataframe['totChol'] = dataframe['totChol'].fillna(240.0)
nanValuesTotChol = dataframe['totChol'].value_counts()
nanValuesTotChol


totChol
240.0    135
220.0     70
260.0     62
210.0     61
232.0     59
        ... 
392.0      1
405.0      1
359.0      1
398.0      1
119.0      1
Name: count, Length: 248, dtype: int64

In [19]:
uniqueValuesTotChol = dataframe['totChol'].unique()
uniqueValuesTotChol


array([195., 250., 245., 225., 285., 228., 205., 313., 260., 254., 247.,
       294., 332., 226., 221., 232., 291., 190., 185., 234., 215., 270.,
       272., 295., 209., 175., 214., 257., 178., 233., 180., 243., 237.,
       240., 311., 208., 252., 261., 179., 194., 267., 216., 266., 255.,
       220., 235., 212., 223., 300., 302., 248., 200., 189., 258., 202.,
       213., 183., 274., 170., 210., 197., 326., 188., 256., 244., 193.,
       239., 296., 269., 275., 268., 265., 173., 273., 290., 278., 264.,
       282., 241., 288., 222., 303., 246., 150., 187., 286., 154., 279.,
       293., 259., 219., 230., 320., 312., 165., 159., 174., 242., 301.,
       167., 308., 325., 229., 236., 224., 253., 464., 171., 186., 227.,
       249., 176., 163., 191., 263., 196., 310., 164., 135., 238., 207.,
       342., 287., 182., 352., 284., 217., 203., 262., 129., 155., 323.,
       206., 283., 319., 304., 340., 328., 280., 368., 218., 276., 339.,
       231., 198., 177., 201., 277., 184., 199., 16

In [20]:
uniqueValuesSysBP = dataframe['sysBP'].unique()
uniqueValuesSysBP


array([106. , 121. , 127.5, 150. , 130. , 180. , 138. , 100. , 141.5,
       162. , 133. , 131. , 142. , 124. , 114. , 140. , 112. , 122. ,
       139. , 108. , 123.5, 148. , 132. , 137.5, 102. , 110. , 182. ,
       115. , 134. , 147. , 124.5, 153.5, 160. , 153. , 111. , 116.5,
       206. ,  96. , 179.5, 119. , 116. , 156.5, 145. , 143.5, 158. ,
       157. , 126.5, 136. , 154. , 190. , 107. , 112.5, 164.5, 138.5,
       155. , 151. , 152. , 179. , 113. , 200. , 132.5, 126. , 123. ,
       141. , 135. , 187. , 127. , 160.5, 105. , 109. , 128. , 118. ,
       109.5, 117.5, 149. , 180.5, 136.5, 212. , 125. , 191. , 121.5,
       173. , 144. , 129.5, 117. , 144.5, 170. , 137. ,  94. , 119.5,
       143. , 166. , 139.5, 177.5, 129. , 159. , 130.5, 107.5, 189. ,
       168. , 197.5, 146. , 174. , 122.5,  98. , 131.5, 195. , 101. ,
       158.5,  97. , 151.5,  97.5, 120. , 204. , 157.5, 140.5, 171. ,
       215. ,  95. , 156. , 165. , 178. , 146.5, 113.5, 188. , 197. ,
        90. , 152.5,

In [None]:
unique


In [21]:
one_occurrence = 0
zero_occurrence = 0
na_occurrence = 0

for row in X.BPMeds:
    if row == 1:
      one_occurrence += 1
    elif row == 0:
      zero_occurrence += 1
    else:
      na_occurrence += 1


print("Number of 1s: ", one_occurrence)
print("Number of 0s: ", zero_occurrence)
print("Number of NAs: ", na_occurrence)


Number of 1s:  124
Number of 0s:  4116
Number of NAs:  0
