In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [66]:
dataframe = pd.read_csv('./dataset/Hypertension-risk-model-main.csv')
dataframe.head()


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Attributes:
# male (gender)
# age (age of the individual)
# currentSmoker (smoking status)
# cigsPerDay (number of cigarettes smoked per day)
# BPMeds (blood pressure medication usage)
# diabetes (diabetes status)
# totChol (total cholesterol level)
# sysBP (systolic blood pressure)
# diaBP (diastolic blood pressure)
# BMI (body mass index)
# heartRate (heart rate)
# glucose (glucose level)
# Risk (hypertension risk status)
dataframe.describe()


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,Risk
count,4240.0,4240.0,4240.0,4211.0,4187.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,0.494104,9.005937,0.029615,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.310613
std,0.495027,8.572942,0.500024,11.922462,0.169544,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.462799
min,0.0,32.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,1.0
max,1.0,70.0,1.0,70.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [8]:
# unique values for the gender column
unique_values_gender = dataframe.male.value_counts(dropna=False)
unique_values_gender


male
0    0.570755
1    0.429245
Name: proportion, dtype: float64

In [10]:
# list of all unique values for gender column
unique_values_gender_list = dataframe['male'].unique()
unique_values_gender_list


array([1, 0], dtype=int64)

In [11]:
# unique values for the age column
unique_values_age = dataframe.age.value_counts(dropna=False)
unique_values_age


age
40    192
46    182
42    180
41    174
48    173
39    170
44    166
45    162
43    159
52    149
51    146
55    145
38    144
47    141
50    140
53    139
54    132
49    132
57    123
56    123
59    119
58    117
60    111
63    110
61    110
62     99
64     93
37     92
36     84
65     57
67     45
35     42
66     38
34     18
68     18
69      7
33      5
70      2
32      1
Name: count, dtype: int64

In [12]:
unique_values_age_list = dataframe['age'].unique()
unique_values_age_list


array([39, 46, 48, 61, 43, 63, 45, 52, 50, 41, 38, 42, 44, 47, 60, 35, 36,
       59, 54, 37, 56, 53, 49, 65, 51, 62, 40, 67, 57, 66, 64, 55, 58, 34,
       68, 33, 70, 32, 69], dtype=int64)

In [14]:
unique_values_current_smoker = dataframe.currentSmoker.value_counts(dropna=False)
unique_values_current_smoker


currentSmoker
0    2145
1    2095
Name: count, dtype: int64

In [15]:
unique_values_current_smoker_list = dataframe['currentSmoker'].unique()
unique_values_current_smoker_list


array([0, 1], dtype=int64)

In [16]:
unique_values_ciagrette_per_day = dataframe.cigsPerDay.value_counts(dropna=False)
unique_values_ciagrette_per_day


cigsPerDay
0.0     2145
20.0     734
30.0     218
15.0     210
10.0     143
9.0      130
5.0      121
3.0      100
40.0      80
1.0       67
43.0      56
25.0      55
NaN       29
35.0      22
2.0       18
6.0       18
7.0       12
8.0       11
60.0      11
4.0        9
18.0       8
17.0       7
50.0       6
23.0       6
11.0       5
45.0       3
13.0       3
12.0       3
16.0       3
14.0       2
19.0       2
38.0       1
29.0       1
70.0       1
Name: count, dtype: int64

In [17]:
# getting the average of the cigsPerDay column
average_cigsPerDay = dataframe.cigsPerDay.mean()
average_cigsPerDay


9.005936832106388

In [18]:
dataframe['cigsPerDay'].fillna(average_cigsPerDay, inplace=True)
dataframe['cigsPerDay'].isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['cigsPerDay'].fillna(average_cigsPerDay, inplace=True)


0

In [19]:
unique_values_cigarette_per_day_list = dataframe.cigsPerDay.value_counts(dropna=False)
unique_values_cigarette_per_day_list


cigsPerDay
0.000000     2145
20.000000     734
30.000000     218
15.000000     210
10.000000     143
9.000000      130
5.000000      121
3.000000      100
40.000000      80
1.000000       67
43.000000      56
25.000000      55
9.005937       29
35.000000      22
2.000000       18
6.000000       18
7.000000       12
8.000000       11
60.000000      11
4.000000        9
18.000000       8
17.000000       7
50.000000       6
23.000000       6
11.000000       5
45.000000       3
13.000000       3
12.000000       3
16.000000       3
14.000000       2
19.000000       2
38.000000       1
29.000000       1
70.000000       1
Name: count, dtype: int64

In [20]:
unique_values_BP_meds = dataframe.BPMeds.value_counts(dropna=False)
unique_values_BP_meds


BPMeds
0.0    4063
1.0     124
NaN      53
Name: count, dtype: int64

In [21]:
unique_values_BP_meds_list = dataframe['BPMeds'].unique()
unique_values_BP_meds_list


array([ 0.,  1., nan])

In [22]:
dataframe['BPMeds'].fillna(0.0, inplace=True)
dataframe['BPMeds'].isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['BPMeds'].fillna(0.0, inplace=True)


0

In [23]:
unique_values_diabetes = dataframe.diabetes.value_counts(dropna=False)
unique_values_diabetes


diabetes
0    4131
1     109
Name: count, dtype: int64

In [24]:
unique_values_diabetes_list = dataframe['diabetes'].unique()
unique_values_diabetes_list


array([0, 1], dtype=int64)

In [25]:
unique_values_total_cholesterol = dataframe.totChol.value_counts(dropna=False)
unique_values_total_cholesterol


totChol
240.0    85
220.0    70
260.0    62
210.0    61
232.0    59
250.0    57
200.0    56
225.0    54
230.0    54
205.0    53
NaN      50
195.0    48
246.0    48
235.0    47
270.0    47
215.0    47
245.0    45
212.0    44
190.0    43
214.0    43
229.0    42
239.0    42
254.0    42
226.0    40
234.0    40
238.0    39
252.0    38
219.0    38
258.0    37
193.0    36
185.0    36
242.0    35
222.0    35
241.0    35
216.0    34
253.0    34
248.0    34
275.0    33
273.0    33
266.0    33
206.0    33
265.0    33
285.0    33
233.0    33
237.0    32
262.0    32
243.0    32
213.0    32
231.0    32
202.0    31
180.0    31
197.0    30
280.0    30
261.0    30
207.0    29
217.0    29
218.0    29
255.0    29
208.0    29
227.0    28
228.0    27
199.0    27
203.0    27
175.0    27
201.0    25
196.0    25
186.0    25
165.0    25
224.0    25
259.0    25
272.0    25
211.0    25
221.0    25
223.0    25
256.0    24
300.0    24
271.0    24
268.0    23
209.0    23
274.0    23
290.0    23
188.0    23
244.0   

In [28]:
average_total_cholesterol = dataframe.totChol.mean()
truncated_average_totChol = average_total_cholesterol.round(2)
truncated_average_totChol


236.7

In [29]:
dataframe['totChol'].fillna(truncated_average_totChol, inplace=True)
dataframe['totChol'].isnull().sum()


0

In [30]:
unique_values_total_cholesterol_list = dataframe.totChol.value_counts(dropna=False)
unique_values_total_cholesterol_list


totChol
240.0    85
220.0    70
260.0    62
210.0    61
232.0    59
250.0    57
200.0    56
225.0    54
230.0    54
205.0    53
236.7    50
195.0    48
246.0    48
235.0    47
270.0    47
215.0    47
245.0    45
212.0    44
190.0    43
214.0    43
229.0    42
239.0    42
254.0    42
226.0    40
234.0    40
238.0    39
252.0    38
219.0    38
258.0    37
193.0    36
185.0    36
242.0    35
222.0    35
241.0    35
216.0    34
253.0    34
248.0    34
275.0    33
273.0    33
266.0    33
206.0    33
265.0    33
285.0    33
233.0    33
237.0    32
262.0    32
243.0    32
213.0    32
231.0    32
202.0    31
180.0    31
197.0    30
280.0    30
261.0    30
207.0    29
217.0    29
218.0    29
255.0    29
208.0    29
227.0    28
228.0    27
199.0    27
203.0    27
175.0    27
201.0    25
196.0    25
186.0    25
165.0    25
224.0    25
259.0    25
272.0    25
211.0    25
221.0    25
223.0    25
256.0    24
300.0    24
271.0    24
268.0    23
209.0    23
274.0    23
290.0    23
188.0    23
244.0   

In [31]:
unique_values_systolic_BP = dataframe.sysBP.value_counts(dropna=False)
unique_values_systolic_BP


sysBP
120.0    107
130.0    102
110.0     96
115.0     89
125.0     88
124.0     84
122.0     80
126.0     73
128.0     73
123.0     72
116.0     71
135.0     69
132.0     69
119.0     68
114.0     65
129.0     63
131.0     62
118.0     62
127.0     61
121.0     60
134.0     58
111.0     55
112.0     55
133.0     54
140.0     53
113.0     52
127.5     50
141.0     50
138.0     50
108.0     49
145.0     49
139.0     45
150.0     45
142.0     45
137.0     44
112.5     44
105.0     43
144.0     43
148.0     42
146.0     40
117.5     39
107.0     38
117.0     38
109.0     37
155.0     37
136.0     36
122.5     32
102.0     31
107.5     30
132.5     28
106.0     28
158.0     28
160.0     27
165.0     26
100.0     26
143.0     26
136.5     25
147.0     25
154.0     24
159.0     24
149.0     23
126.5     21
101.0     21
152.0     21
104.0     21
151.0     20
170.0     20
137.5     20
103.0     19
153.0     19
118.5     19
131.5     19
147.5     18
142.5     18
157.0     17
164.0     17
121.5 

In [32]:
unique_values_systolic_BP_list = dataframe['sysBP'].unique()
unique_values_systolic_BP_list


array([106. , 121. , 127.5, 150. , 130. , 180. , 138. , 100. , 141.5,
       162. , 133. , 131. , 142. , 124. , 114. , 140. , 112. , 122. ,
       139. , 108. , 123.5, 148. , 132. , 137.5, 102. , 110. , 182. ,
       115. , 134. , 147. , 124.5, 153.5, 160. , 153. , 111. , 116.5,
       206. ,  96. , 179.5, 119. , 116. , 156.5, 145. , 143.5, 158. ,
       157. , 126.5, 136. , 154. , 190. , 107. , 112.5, 164.5, 138.5,
       155. , 151. , 152. , 179. , 113. , 200. , 132.5, 126. , 123. ,
       141. , 135. , 187. , 127. , 160.5, 105. , 109. , 128. , 118. ,
       109.5, 117.5, 149. , 180.5, 136.5, 212. , 125. , 191. , 121.5,
       173. , 144. , 129.5, 117. , 144.5, 170. , 137. ,  94. , 119.5,
       143. , 166. , 139.5, 177.5, 129. , 159. , 130.5, 107.5, 189. ,
       168. , 197.5, 146. , 174. , 122.5,  98. , 131.5, 195. , 101. ,
       158.5,  97. , 151.5,  97.5, 120. , 204. , 157.5, 140.5, 171. ,
       215. ,  95. , 156. , 165. , 178. , 146.5, 113.5, 188. , 197. ,
        90. , 152.5,

In [33]:
unqiue_values_diastolic_BP = dataframe.diaBP.value_counts(dropna=False)
unqiue_values_diastolic_BP


diaBP
80.0     262
82.0     152
85.0     137
70.0     135
81.0     131
84.0     122
90.0     119
78.0     116
87.0     113
86.0     108
75.0     108
88.0     106
79.0     106
74.0     102
83.0      94
76.0      93
72.0      91
73.0      91
92.0      77
89.0      76
71.0      73
77.0      72
94.0      65
69.0      62
72.5      61
91.0      59
95.0      57
93.0      56
77.5      51
98.0      50
100.0     46
68.0      46
82.5      46
96.0      45
67.0      43
65.0      37
97.0      36
85.5      35
66.0      34
102.0     33
99.0      32
92.5      31
105.0     30
60.0      28
67.5      27
86.5      26
87.5      26
64.0      25
110.0     24
101.0     23
62.0      21
75.5      20
66.5      20
76.5      20
83.5      19
73.5      18
61.0      17
108.0     16
97.5      16
78.5      15
74.5      15
109.0     14
103.0     14
84.5      14
81.5      13
106.0     13
107.0     13
70.5      13
104.0     13
89.5      12
96.5      11
80.5      11
79.5      11
63.0      11
59.0      11
88.5      10
95.5  

In [34]:
unique_values_diastolic_BP_list = dataframe['diaBP'].unique()
unique_values_diastolic_BP_list


array([ 70. ,  81. ,  80. ,  95. ,  84. , 110. ,  71. ,  89. , 107. ,
        76. ,  88. ,  94. ,  64. ,  90. ,  78. ,  84.5,  70.5,  77.5,
        82. ,  68. ,  72.5,  91. , 121. ,  85.5,  85. ,  82.5,  74. ,
        92.5, 102. ,  98. , 101. ,  73. ,  92. ,  83.5,  63. , 114. ,
        69. ,  93. ,  66. ,  75. ,  79. ,  87. ,  99. ,  60. ,  67.5,
       106. ,  86.5, 104. ,  86. ,  61.5,  71.5,  76.5,  77. ,  88.5,
       105. ,  96. ,  97. , 100. ,  81.5, 106.5,  80.5, 124.5,  61. ,
        83. ,  67. ,  74.5,  66.5,  65. ,  72. ,  99.5, 122.5,  57. ,
        57.5, 111. ,  78.5, 104.5,  89.5, 112. ,  55. , 123. , 120. ,
        75.5, 118. ,  97.5,  59. , 133. ,  69.5,  95.5,  96.5, 135. ,
        64.5,  68.5,  98.5,  62. , 117. ,  59.5, 103. , 108.5,  73.5,
        87.5, 108. ,  93.5,  90.5, 114.5,  62.5,  94.5, 140. , 124. ,
        79.5, 109. ,  91.5, 115. , 102.5,  65.5, 105.5, 103.5,  63.5,
       107.5, 142.5, 109.5,  58. , 117.5, 116.5, 100.5, 116. , 119. ,
        54. , 132. ,

In [35]:
unique_values_BMI = dataframe.BMI.value_counts(dropna=False)
unique_values_BMI


BMI
NaN      19
22.91    18
22.54    18
23.48    18
22.19    18
23.09    16
25.09    16
23.10    13
22.73    13
25.23    13
27.78    12
22.90    12
25.94    12
22.01    12
23.68    12
27.73    12
21.51    12
24.56    11
22.72    11
26.09    11
24.35    11
23.29    11
24.10    11
25.38    11
28.30    11
21.35    10
23.08    10
25.74    10
21.99    10
26.36    10
25.82    10
22.17    10
29.35    10
26.25    10
26.84    10
20.12     9
26.79     9
22.89     9
23.88     9
22.36     9
23.72     9
26.27     9
28.04     9
24.38     9
22.02     9
24.22     9
28.09     9
26.98     9
22.18     9
27.51     9
27.49     9
24.67     9
25.62     9
27.94     9
26.89     9
23.06     8
24.16     8
28.35     8
25.14     8
23.22     8
29.29     8
26.91     8
24.24     8
24.39     8
28.57     8
27.38     8
24.01     8
26.73     8
25.48     8
25.13     8
21.19     8
24.87     8
26.58     8
26.51     8
27.27     8
25.71     8
23.95     8
27.22     8
26.77     8
23.65     8
24.50     8
27.01     8
22.53     8


In [38]:
average_BMI = dataframe.BMI.mean()
truncated_average_BMI = average_BMI.round(2)
truncated_average_BMI


25.8

In [39]:
dataframe['BMI'].fillna(truncated_average_BMI, inplace=True)
dataframe['BMI'].isnull().sum()


0

In [41]:
unique_values_BMI_list = dataframe.BMI.unique()
unique_values_BMI_list


array([26.97, 28.73, 25.34, ..., 26.7 , 43.67, 20.91])

In [42]:
unique_values_heart_rate = dataframe.heartRate.value_counts(dropna=False)
unique_values_heart_rate


heartRate
75.0     563
80.0     385
70.0     305
60.0     231
85.0     228
72.0     222
65.0     197
90.0     172
68.0     151
100.0     98
67.0      94
63.0      92
95.0      90
78.0      88
66.0      82
82.0      77
88.0      70
73.0      68
62.0      63
76.0      57
64.0      56
58.0      52
83.0      51
77.0      51
69.0      50
92.0      48
86.0      45
96.0      39
79.0      39
71.0      36
55.0      36
110.0     36
74.0      31
87.0      31
84.0      26
94.0      25
56.0      22
50.0      22
57.0      18
52.0      17
98.0      17
81.0      15
105.0     14
54.0      12
53.0      11
93.0      10
89.0       9
102.0      9
108.0      8
61.0       7
120.0      7
91.0       7
115.0      5
103.0      5
48.0       5
59.0       5
107.0      4
104.0      3
112.0      3
125.0      3
122.0      2
45.0       2
106.0      2
44.0       1
47.0       1
51.0       1
97.0       1
NaN        1
130.0      1
99.0       1
140.0      1
143.0      1
101.0      1
46.0       1
Name: count, dtype: int64

In [45]:
average_heart_rate = dataframe.heartRate.mean()
truncated_average_heart_rate = average_heart_rate.round()
truncated_average_heart_rate


76.0

In [46]:
dataframe['heartRate'].fillna(truncated_average_heart_rate, inplace=True)
dataframe['heartRate'].isnull().sum()


0

In [47]:
unique_values_heart_rate_list = dataframe.heartRate.unique()
unique_values_heart_rate_list


array([ 80.,  95.,  75.,  65.,  85.,  77.,  60.,  79.,  76.,  93.,  72.,
        98.,  64.,  70.,  71.,  62.,  73.,  90.,  96.,  68.,  63.,  88.,
        78.,  83., 100.,  67.,  84.,  57.,  50.,  74.,  86.,  55.,  92.,
        66.,  87., 110.,  81.,  56.,  89.,  82.,  48., 105.,  61.,  54.,
        69.,  52.,  94., 140., 130.,  58., 108., 104.,  91.,  53., 106.,
        59.,  51., 102., 107., 112., 125., 103.,  44.,  47.,  45.,  97.,
       122., 120.,  99., 115., 143., 101.,  46.])

In [48]:
unique_values_glucose = dataframe.glucose.value_counts(dropna=False)
unique_values_glucose


glucose
NaN      388
75.0     193
77.0     167
73.0     156
80.0     153
70.0     152
83.0     151
78.0     148
74.0     141
85.0     127
76.0     127
87.0     114
72.0     108
67.0     107
84.0     107
82.0     100
79.0      96
68.0      93
65.0      85
90.0      81
71.0      77
88.0      74
69.0      65
60.0      63
63.0      63
86.0      62
66.0      61
93.0      60
81.0      59
100.0     48
95.0      48
64.0      46
62.0      43
92.0      38
94.0      38
89.0      34
97.0      33
103.0     32
91.0      29
96.0      27
98.0      24
61.0      22
57.0      21
99.0      21
58.0      20
102.0     19
115.0     15
107.0     15
108.0     14
104.0     14
55.0      13
113.0     13
105.0     12
112.0     11
59.0      11
106.0     10
117.0     10
118.0     10
110.0      9
120.0      9
114.0      5
116.0      5
53.0       5
123.0      5
54.0       5
56.0       5
137.0      4
101.0      4
126.0      4
140.0      4
45.0       4
127.0      4
132.0      3
109.0      3
50.0       3
122.0      3
47.0

In [51]:
average_glucose = dataframe.glucose.mean()
truncated_average_glucose = average_glucose.round()
truncated_average_glucose


82.0

In [52]:
dataframe['glucose'].fillna(truncated_average_glucose, inplace=True)
dataframe['glucose'].isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['glucose'].fillna(truncated_average_glucose, inplace=True)


0

In [56]:
unique_values_glucose_list = dataframe.glucose.unique()
unique_values_glucose_list


array([ 77.,  76.,  70., 103.,  85.,  99.,  78.,  79.,  88.,  61.,  64.,
        84.,  82.,  72.,  89.,  65., 113.,  75.,  83.,  66.,  74.,  63.,
        87., 225.,  90.,  80., 100., 215.,  98.,  62.,  95.,  94.,  55.,
        93.,  73.,  45., 202.,  68.,  97., 104.,  96., 126., 120., 105.,
        71.,  56.,  60., 117., 102.,  58.,  92., 109.,  86., 107.,  54.,
        67.,  69.,  57.,  91., 132., 150.,  59.,  81., 115., 140., 112.,
       118., 143., 114., 160., 110., 123., 108., 145., 122., 137., 106.,
       127., 205., 130., 101.,  47.,  53., 216., 163., 144., 116., 121.,
       172., 124., 111.,  40., 186., 223., 325.,  44., 156., 268.,  50.,
       274., 292., 255., 136., 206., 131., 148., 297.,  43., 173.,  48.,
       386., 155., 147., 170.,  52., 320., 254., 394., 270., 244., 183.,
       142., 119., 135., 167., 207., 129., 177., 250., 294., 166., 125.,
       332., 368., 348., 248., 370., 193., 191., 256., 235., 210., 260.])

In [57]:
unique_values_risk = dataframe.Risk.value_counts(dropna=False)
unique_values_risk


Risk
0    2923
1    1317
Name: count, dtype: int64

In [58]:
unique_values_risk_list = dataframe['Risk'].unique()
unique_values_risk_list


array([0, 1], dtype=int64)

In [68]:
X = dataframe.drop('Risk', axis=1)
# Reset display options to their default values
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

X


Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,39,0,0.0,0.0,0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,0,0.0,0.0,0,250.0,121.0,81.0,28.73,95.0,76.0
2,1,48,1,20.0,0.0,0,245.0,127.5,80.0,25.34,75.0,70.0
3,0,61,1,30.0,0.0,0,225.0,150.0,95.0,28.58,65.0,103.0
4,0,46,1,23.0,0.0,0,285.0,130.0,84.0,23.10,85.0,85.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4235,0,48,1,20.0,,0,248.0,131.0,72.0,22.00,84.0,86.0
4236,0,44,1,15.0,0.0,0,210.0,126.5,87.0,19.16,86.0,
4237,0,52,0,0.0,0.0,0,269.0,133.5,83.0,21.47,80.0,107.0
4238,1,40,0,0.0,0.0,0,185.0,141.0,98.0,25.60,67.0,72.0


In [69]:
y = dataframe['Risk']
y


0       0
1       0
2       0
3       1
4       0
       ..
4235    0
4236    0
4237    0
4238    1
4239    0
Name: Risk, Length: 4240, dtype: int64

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [84]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)



In [85]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Decision Tree Classifier
y_pred_dt = dt.predict(X_test)
print("Decision Tree Classifier:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("Accuracy Score:", accuracy_score(y_test, y_pred_dt))
print("Precision Score:", precision_score(y_test, y_pred_dt, average='weighted'))
print("Recall Score:", recall_score(y_test, y_pred_dt, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_dt, average='weighted'))
print("\n")

# Random Forest Classifier
y_pred_rf = rf.predict(X_test)
print("Random Forest Classifier:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print("Precision Score:", precision_score(y_test, y_pred_rf, average='weighted'))
print("Recall Score:", recall_score(y_test, y_pred_rf, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred_rf, average='weighted'))
print("\n")


Decision Tree Classifier:
Confusion Matrix:
[[777 110]
 [ 96 289]]
Accuracy Score: 0.8380503144654088
Precision Score: 0.8398743162126284
Recall Score: 0.8380503144654088
F1 Score: 0.8388521757330719


Random Forest Classifier:
Confusion Matrix:
[[811  76]
 [ 47 338]]
Accuracy Score: 0.9033018867924528
Precision Score: 0.9062382809819233
Recall Score: 0.9033018867924528
F1 Score: 0.9042532249758324


