In [164]:
import pandas as pd

In [165]:
df=pd.read_csv("heart_attack_risk_dataset.csv")

In [166]:
columns_to_drop = [
    "Chest_Pain_Type", "Thalassemia", "Fasting_Blood_Sugar", 
    "ECG_Results", "Exercise_Induced_Angina", "Max_Heart_Rate_Achieved"
]

# Drop the columns
new_data_cleaned = df.drop(columns=columns_to_drop, errors='ignore')


In [167]:
new_data_cleaned.columns

Index(['Age', 'Gender', 'Smoking', 'Alcohol_Consumption',
       'Physical_Activity_Level', 'BMI', 'Diabetes', 'Hypertension',
       'Cholesterol_Level', 'Resting_BP', 'Heart_Rate', 'Family_History',
       'Stress_Level', 'Heart_Attack_Risk'],
      dtype='object')

In [168]:
new_data_cleaned.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Heart_Attack_Risk
0,69,Female,1,0,Moderate,34.61,1,0,152.1,171,85,0,Moderate,Low
1,32,Male,0,0,Moderate,22.75,0,0,166.8,126,103,0,Low,Moderate
2,89,Male,0,1,Moderate,35.32,0,0,272.3,123,127,0,Low,Low
3,78,Male,0,1,Moderate,18.23,1,0,237.7,144,125,0,Low,Low
4,38,Female,1,0,Moderate,19.82,0,0,207.7,123,107,0,High,Moderate


In [169]:
new_data_cleaned['Systolic_BP'] = new_data_cleaned['Resting_BP']
new_data_cleaned['Diastolic_BP'] = (new_data_cleaned['Systolic_BP'] * 2/3).astype(int)
new_data_cleaned.drop(columns=['Resting_BP'], inplace=True)

In [170]:
new_data_cleaned.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Heart_Rate,Family_History,Stress_Level,Heart_Attack_Risk,Systolic_BP,Diastolic_BP
0,69,Female,1,0,Moderate,34.61,1,0,152.1,85,0,Moderate,Low,171,114
1,32,Male,0,0,Moderate,22.75,0,0,166.8,103,0,Low,Moderate,126,84
2,89,Male,0,1,Moderate,35.32,0,0,272.3,127,0,Low,Low,123,82
3,78,Male,0,1,Moderate,18.23,1,0,237.7,125,0,Low,Low,144,96
4,38,Female,1,0,Moderate,19.82,0,0,207.7,107,0,High,Moderate,123,82


In [171]:
new_data_cleaned.columns

Index(['Age', 'Gender', 'Smoking', 'Alcohol_Consumption',
       'Physical_Activity_Level', 'BMI', 'Diabetes', 'Hypertension',
       'Cholesterol_Level', 'Heart_Rate', 'Family_History', 'Stress_Level',
       'Heart_Attack_Risk', 'Systolic_BP', 'Diastolic_BP'],
      dtype='object')

In [172]:
new_data_cleaned.rename(columns={
    'Smoking': 'Smoking_History',
    'Diabetes': 'Diabetes_History',
    'Hypertension': 'Hypertension_History',
    'Physical_Activity_Level': 'Physical_Activity',
    'Stress_Level': 'Stress_Levels',
    'Heart_Attack_Risk': 'Heart_Attack_Occurrence'
}, inplace=True)

In [173]:
new_data_cleaned.columns

Index(['Age', 'Gender', 'Smoking_History', 'Alcohol_Consumption',
       'Physical_Activity', 'BMI', 'Diabetes_History', 'Hypertension_History',
       'Cholesterol_Level', 'Heart_Rate', 'Family_History', 'Stress_Levels',
       'Heart_Attack_Occurrence', 'Systolic_BP', 'Diastolic_BP'],
      dtype='object')

In [174]:
print(new_data_cleaned["Gender"].value_counts())
print(new_data_cleaned["Smoking_History"].value_counts())
print(new_data_cleaned["Diabetes_History"].value_counts())
print(new_data_cleaned["Hypertension_History"].value_counts())
print(new_data_cleaned["Alcohol_Consumption"].value_counts())
print(new_data_cleaned["Family_History"].value_counts())
print(new_data_cleaned["Heart_Attack_Occurrence"].value_counts())
print(new_data_cleaned["Physical_Activity"].value_counts())

Gender
Female    25086
Male      24914
Name: count, dtype: int64
Smoking_History
0    35079
1    14921
Name: count, dtype: int64
Diabetes_History
0    40058
1     9942
Name: count, dtype: int64
Hypertension_History
0    35131
1    14869
Name: count, dtype: int64
Alcohol_Consumption
0    29901
1    20099
Name: count, dtype: int64
Family_History
0    34983
1    15017
Name: count, dtype: int64
Heart_Attack_Occurrence
Low         25024
Moderate    14904
High        10072
Name: count, dtype: int64
Physical_Activity
Low         20061
Moderate    19994
High         9945
Name: count, dtype: int64


In [175]:
# Mapping rules
binary_mapping = {0: 'No', 1: 'Yes'}
alcohol_mapping = {0: 'Low', 1: 'High'}
heart_attack_mapping = {"Low": 'No', "Moderate": 'No', "High": 'Yes'}

# Apply mappings to new data
new_data_cleaned['Smoking_History'] = new_data_cleaned['Smoking_History'].replace(binary_mapping)
new_data_cleaned['Diabetes_History'] = new_data_cleaned['Diabetes_History'].replace(binary_mapping)
new_data_cleaned['Hypertension_History'] = new_data_cleaned['Hypertension_History'].replace(binary_mapping)
new_data_cleaned['Family_History'] = new_data_cleaned['Family_History'].replace(binary_mapping)

new_data_cleaned['Alcohol_Consumption'] = new_data_cleaned['Alcohol_Consumption'].replace(alcohol_mapping)
new_data_cleaned['Heart_Attack_Occurrence'] = new_data_cleaned['Heart_Attack_Occurrence'].replace(heart_attack_mapping)

In [176]:
print(new_data_cleaned["Gender"].value_counts())
print(new_data_cleaned["Smoking_History"].value_counts())
print(new_data_cleaned["Diabetes_History"].value_counts())
print(new_data_cleaned["Hypertension_History"].value_counts())
print(new_data_cleaned["Alcohol_Consumption"].value_counts())
print(new_data_cleaned["Family_History"].value_counts())
print(new_data_cleaned["Heart_Attack_Occurrence"].value_counts())
print(new_data_cleaned["Physical_Activity"].value_counts())

Gender
Female    25086
Male      24914
Name: count, dtype: int64
Smoking_History
No     35079
Yes    14921
Name: count, dtype: int64
Diabetes_History
No     40058
Yes     9942
Name: count, dtype: int64
Hypertension_History
No     35131
Yes    14869
Name: count, dtype: int64
Alcohol_Consumption
Low     29901
High    20099
Name: count, dtype: int64
Family_History
No     34983
Yes    15017
Name: count, dtype: int64
Heart_Attack_Occurrence
No     39928
Yes    10072
Name: count, dtype: int64
Physical_Activity
Low         20061
Moderate    19994
High         9945
Name: count, dtype: int64


In [177]:
final_data = new_data_cleaned[new_data_cleaned['Heart_Attack_Occurrence'] == 'Yes']
final_data.head()

Unnamed: 0,Age,Gender,Smoking_History,Alcohol_Consumption,Physical_Activity,BMI,Diabetes_History,Hypertension_History,Cholesterol_Level,Heart_Rate,Family_History,Stress_Levels,Heart_Attack_Occurrence,Systolic_BP,Diastolic_BP
5,41,Male,No,High,Moderate,36.11,No,No,271.2,119,No,Low,Yes,141,94
11,55,Female,No,Low,Moderate,31.32,No,No,280.2,105,No,Low,Yes,113,75
22,77,Female,Yes,High,High,32.45,No,No,175.8,113,No,Moderate,Yes,127,84
23,32,Male,No,Low,Moderate,31.26,No,No,284.4,87,Yes,High,Yes,114,76
24,79,Female,No,High,Moderate,19.82,No,No,202.8,84,No,Low,Yes,165,110


In [178]:
final_data.shape

(10072, 15)

In [179]:
df2=pd.read_csv("combined_dataset.csv")

In [180]:
df2.head()

Unnamed: 0,Age,Gender,Smoking_History,Diabetes_History,Hypertension_History,Cholesterol_Level,Physical_Activity,Diet_Quality,Alcohol_Consumption,Stress_Levels,BMI,Heart_Rate,Systolic_BP,Diastolic_BP,Family_History,Heart_Attack_Occurrence
0,56,Male,Yes,No,No,186.400209,Moderate,Poor,Low,3.644786,33.961349,72.301534,123.90209,85.682809,No,No
1,69,Male,No,No,No,185.136747,Low,Good,Low,3.384056,28.242873,57.45764,129.893306,73.524262,Yes,No
2,46,Male,Yes,No,No,210.696611,Low,Average,Moderate,3.810911,27.60121,64.658697,145.654901,71.994812,No,No
3,32,Female,No,No,No,211.165478,Moderate,Good,High,6.014878,23.717291,55.131469,131.78522,68.211333,No,No
4,60,Female,No,No,No,223.814253,High,Good,High,6.806883,19.771578,76.667917,100.694559,92.902489,No,No


In [181]:
df2.drop(columns=["Diet_Quality"],inplace=True)

In [182]:
df2["Alcohol_Consumption"] = df2["Alcohol_Consumption"].replace("Moderate", "High")

In [183]:
df2["Alcohol_Consumption"].value_counts()

Alcohol_Consumption
High    21362
Low      9098
Name: count, dtype: int64

In [184]:
df2.shape

(30460, 15)

In [185]:
combine_csv=pd.concat([df2,final_data])

In [186]:
combine_csv.head()

Unnamed: 0,Age,Gender,Smoking_History,Diabetes_History,Hypertension_History,Cholesterol_Level,Physical_Activity,Alcohol_Consumption,Stress_Levels,BMI,Heart_Rate,Systolic_BP,Diastolic_BP,Family_History,Heart_Attack_Occurrence
0,56,Male,Yes,No,No,186.400209,Moderate,Low,3.644786,33.961349,72.301534,123.90209,85.682809,No,No
1,69,Male,No,No,No,185.136747,Low,Low,3.384056,28.242873,57.45764,129.893306,73.524262,Yes,No
2,46,Male,Yes,No,No,210.696611,Low,High,3.810911,27.60121,64.658697,145.654901,71.994812,No,No
3,32,Female,No,No,No,211.165478,Moderate,High,6.014878,23.717291,55.131469,131.78522,68.211333,No,No
4,60,Female,No,No,No,223.814253,High,High,6.806883,19.771578,76.667917,100.694559,92.902489,No,No


In [187]:
combine_csv.shape

(40532, 15)

In [188]:
print(combine_csv["Gender"].value_counts())
print(combine_csv["Smoking_History"].value_counts())
print(combine_csv["Diabetes_History"].value_counts())
print(combine_csv["Hypertension_History"].value_counts())
print(combine_csv["Alcohol_Consumption"].value_counts())
print(combine_csv["Family_History"].value_counts())
print(combine_csv["Heart_Attack_Occurrence"].value_counts())
print(combine_csv["Physical_Activity"].value_counts())

Gender
Male      20408
Female    20124
Name: count, dtype: int64
Smoking_History
No     28072
Yes    12460
Name: count, dtype: int64
Diabetes_History
No     32114
Yes     8418
Name: count, dtype: int64
Hypertension_History
No     30185
Yes    10347
Name: count, dtype: int64
Alcohol_Consumption
High    25446
Low     15086
Name: count, dtype: int64
Family_History
No     28521
Yes    12011
Name: count, dtype: int64
Heart_Attack_Occurrence
No     27036
Yes    13496
Name: count, dtype: int64
Physical_Activity
Moderate    15975
Low         13104
High        11453
Name: count, dtype: int64


In [189]:
combine_csv.to_csv("final_dataset.csv")

In [190]:
combine_csv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40532 entries, 0 to 49995
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      40532 non-null  int64  
 1   Gender                   40532 non-null  object 
 2   Smoking_History          40532 non-null  object 
 3   Diabetes_History         40532 non-null  object 
 4   Hypertension_History     40532 non-null  object 
 5   Cholesterol_Level        40532 non-null  float64
 6   Physical_Activity        40532 non-null  object 
 7   Alcohol_Consumption      40532 non-null  object 
 8   Stress_Levels            40532 non-null  object 
 9   BMI                      40532 non-null  float64
 10  Heart_Rate               40532 non-null  float64
 11  Systolic_BP              40532 non-null  float64
 12  Diastolic_BP             40532 non-null  float64
 13  Family_History           40532 non-null  object 
 14  Heart_Attack_Occurrence  40