# Heart Patient Prognosis

Data Scientist:   __Gail Wittich__\
Email:      gwittich@optusnet.com.au \
Website:    www.linkedin.com/in/gail-wittich \
Copyright:  Copyright 2020, Gail Wittich 

### Preprocessing

### **Load Packages**

In [None]:
from google.colab import drive                         # for accessing files
import numpy as np                                     # for numeric computations
import pandas as pd                                    # for data analysis
import pickle                                          # for file reading and saving
from sklearn.preprocessing import LabelEncoder         # for converting categorical to numerical data
from sklearn.preprocessing import MinMaxScaler         # for normalising the data
from sklearn.preprocessing import StandardScaler       # for standardising the data

import warnings                                        # to ignore warnings
warnings.filterwarnings('ignore')

### **Load Data**

In [None]:
# Mount Google Drive at /content/drive so the pickled data files below can be
# read from (and later written to) the Drive folder
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Unpickle Training data
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
PP_train_df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/train_data_4_PP.pkl')

# Unpickle Testing data
PP_test_df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/new_test_data_4_PP.pkl')

### Preprocessing of Training Data

#### Feature Generation - Training Data

'Treated_with_drugs' column is a categorical column. In addition to single drug values, it has values representing combinations of drugs. It would be of value to know the impact of each drug alone. 

Split the combined drug values into individual drugs and create dummy variables.

In [None]:
# One-hot encode the drugs: each space-separated token in 'Treated_with_drugs'
# becomes its own indicator column, so a combined value sets several columns to 1
drugs_df = PP_train_df.Treated_with_drugs.str.get_dummies(sep=' ')
drugs_df.head(5)

Unnamed: 0,0,DX1,DX2,DX3,DX4,DX5,DX6
0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0


In [None]:
# Append the drug indicator columns to the training data and remove the
# original 'Treated_with_drugs' column, whose values are now represented by
# the features 0, DX1, DX2, DX3, DX4, DX5 and DX6.
PP_train_df = (
    pd.concat([PP_train_df, drugs_df], axis='columns')
      .drop(columns='Treated_with_drugs')
)

PP_train_df.head(5)

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year,0,DX1,DX2,DX3,DX4,DX5,DX6
0,16201,47,8433,60.0,21.655523,NO,URBAN,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,0,1,0,0,0,0
1,9421,3,2972,2.0,28.852743,NO,RURAL,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0,0,0,0,0,0,0,1
2,16205,7,8608,20.0,26.179725,NO,RURAL,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,0,0,0,0,0,1
3,5582,31,10074,8.0,22.638945,NO,RURAL,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,0,0,1
4,20880,43,7462,53.0,21.326131,NO,RURAL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0


In [None]:
# Frequency of each smoker category in the training data
PP_train_df['Patient_Smoker'].value_counts()

NO         13214
YES         9875
UNKNOWN        8
Name: Patient_Smoker, dtype: int64

'Patient_Smoker' is also a categorical column. Before creating dummies for it, the 'UNKNOWN' category needs to be addressed. One option is to fill it with the mode ('NO').

There are several ways to deal with the category 'UNKNOWN'. In this situation the safest approach is to treat it as missing data and replace those values with '0' rather than with the mode of the column.

In [None]:
# List the Patient_ID of every record whose smoker status is 'UNKNOWN'
print(PP_train_df.loc[PP_train_df['Patient_Smoker'] == 'UNKNOWN', 'Patient_ID'])

1239     12508
2137     12509
3346     12511
6257     12512
12863    12510
13192    12514
14574    12504
19669    12505
Name: Patient_ID, dtype: int64


In [None]:
# Check for other records for those patients whose smoker status is unknown.
# The patient IDs are looked up from the data instead of being typed in by
# hand, so the check stays correct if the underlying data changes.
# (Must run before the 'UNKNOWN' values are replaced further down.)
unknown_smoker_ids = PP_train_df.loc[PP_train_df['Patient_Smoker'] == 'UNKNOWN', 'Patient_ID']
PP_train_df[PP_train_df['Patient_ID'].isin(unknown_smoker_ids)]

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year,0,DX1,DX2,DX3,DX4,DX5,DX6
1239,33007,0,12508,33.235831,1.4606,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
2137,33008,0,12509,33.235831,1.2925,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
3346,33010,0,12511,33.235831,1.5701,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
6257,33011,0,12512,33.235831,1.9896,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
12863,33009,0,12510,33.235831,1.145,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
13192,33013,0,12514,33.235831,1.0893,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
14574,33003,0,12504,33.235831,1.8282,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0
19669,33004,0,12505,33.235831,1.929,UNKNOWN,RURAL,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0,0,0,0,0,0


There are no other entries to assist with the information for those patients.

In [None]:
# Smoker category counts immediately before the 'UNKNOWN' values are replaced
PP_train_df['Patient_Smoker'].value_counts()

NO         13214
YES         9875
UNKNOWN        8
Name: Patient_Smoker, dtype: int64

In [None]:
# Change the values 'UNKNOWN' to '0' (treated as missing data).
# A single .loc assignment is used instead of chained indexing
# (df.col[mask] = value): a chained assignment may write to a temporary copy
# and leave PP_train_df unchanged, and the warning pandas raises for it would
# be hidden because warnings are suppressed in this notebook.
# Alternative considered: replace the unknown values with the mode 'NO'.
PP_train_df.loc[PP_train_df['Patient_Smoker'] == 'UNKNOWN', 'Patient_Smoker'] = '0'

In [None]:
# Confirm the 'UNKNOWN' category has been replaced by '0'
PP_train_df['Patient_Smoker'].value_counts()

NO     13214
YES     9875
0          8
Name: Patient_Smoker, dtype: int64

#### Data Encoding - Training Data

Convert the remaining categorical column to numerical using get_dummies() function of pandas (i.e. one hot encoding).

In [None]:
# One-hot encode the two remaining categorical features of the training data
PP_train_df = pd.get_dummies(
    data=PP_train_df,
    columns=[
        'Patient_Smoker',
        'Patient_Rural_Urban',
    ],
)

In [None]:
# Preview the training data after one-hot encoding
PP_train_df.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year,0,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_0,Patient_Smoker_NO,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,16201,47,8433,60.0,21.655523,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,0,1,0,0,0,0,0,1,0,0,1
1,9421,3,2972,2.0,28.852743,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0,0,0,0,0,0,0,1,0,1,0,1,0
2,16205,7,8608,20.0,26.179725,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,0,0,0,0,0,1,0,1,0,1,0
3,5582,31,10074,8.0,22.638945,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,0,0,1,0,1,0,1,0
4,20880,43,7462,53.0,21.326131,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,1,0,0,0,0,0,0,1,0,1,0


In [None]:
# Summarise column dtypes and non-null counts of the encoded training data
PP_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23097 entries, 0 to 23096
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Patient_Age                23097 non-null  float64
 4   Patient_Body_Mass_Index    23097 non-null  float64
 5   A                          23097 non-null  float64
 6   B                          23097 non-null  float64
 7   C                          23097 non-null  float64
 8   D                          23097 non-null  float64
 9   E                          23097 non-null  float64
 10  F                          23097 non-null  float64
 11  Z                          23097 non-null  float64
 12  Number_of_prev_cond        23097 non-null  float64
 13  Survived_1_year            23097 non-null  int

There are now no missing values and all the data is of numerical type.

There are two ID columns - 'ID_Patient_Care_Situation' and 'Patient_ID'. Review them with a view to removing these columns if they do not provide any benefit. Check whether any ID is **repeated** in these two columns.

In [None]:
# Count the distinct values in each of the two ID columns
n_care_situations = PP_train_df['ID_Patient_Care_Situation'].nunique()
n_patients = PP_train_df['Patient_ID'].nunique()
print('ID_Patient_Care_Situation unique values: ', n_care_situations)
print('Patient_ID unique values: ', n_patients)

ID_Patient_Care_Situation unique values:  23097
Patient_ID unique values:  10599


There are 23097 unique 'ID_Patient_Care_Situation' values, the same as the total number of records in the Training data.

There are only 10599 unique values in the feature 'Patient_ID'. This means some patients presented to the hospital for treatment two or more times (which is likely), and the same patient will have a different care situation for each presentation (visit to the hospital).

The combination of 'ID_Patient_Care_Situation' and 'Patient_ID' represent who and how many repeat patients there were. Therefore:
- There is useful information in the feature 'ID_Patient_Care_Situation' (i.e. it uniquely identifies each record). This feature will be kept.
- Dropping 'Patient_ID' feature means losing information relating to a repeat patient. This feature will be kept.

In [None]:
# Display the current column order (a first step towards moving the Target to
# the end of the dataframe; this cell itself does not reorder anything)
PP_train_df.columns

Index(['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
       'Patient_Age', 'Patient_Body_Mass_Index', 'A', 'B', 'C', 'D', 'E', 'F',
       'Z', 'Number_of_prev_cond', 'Survived_1_year', '0', 'DX1', 'DX2', 'DX3',
       'DX4', 'DX5', 'DX6', 'Patient_Smoker_0', 'Patient_Smoker_NO',
       'Patient_Smoker_YES', 'Patient_Rural_Urban_RURAL',
       'Patient_Rural_Urban_URBAN'],
      dtype='object')

In [None]:
# NOTE(review): the reorder below is commented out, so it does not run and
# PP_train_df keeps the column order displayed above.
# PP_train_df = PP_train_df[['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
#        'Patient_Age', 'Patient_Body_Mass_Index', 'A', 'B', 'C', 'D', 'E', 'F',
#        'Z', 'Number_of_prev_cond', '0', 'DX1', 'DX2', 'DX3',
#        'DX4', 'DX5', 'DX6', 'Patient_Smoker_NO', 'Patient_Smoker_0',
#        'Patient_Smoker_YES', 'Patient_Rural_Urban_RURAL',
#        'Patient_Rural_Urban_URBAN', 'Survived_1_year']]
# 
# PP_train_df.columns

In [None]:
# Final check of the training data: column dtypes and non-null counts
PP_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23097 entries, 0 to 23096
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Patient_Age                23097 non-null  float64
 4   Patient_Body_Mass_Index    23097 non-null  float64
 5   A                          23097 non-null  float64
 6   B                          23097 non-null  float64
 7   C                          23097 non-null  float64
 8   D                          23097 non-null  float64
 9   E                          23097 non-null  float64
 10  F                          23097 non-null  float64
 11  Z                          23097 non-null  float64
 12  Number_of_prev_cond        23097 non-null  float64
 13  Survived_1_year            23097 non-null  int

### Preprocessing of Test Data

In [None]:
# Take a look at what the new test data looks like
PP_test_df.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,24206,35,4640,DX5,65,20.710365,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,32827,30,3214,DX1,2,24.250219,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
2,3694,46,3564,DX6,1,27.139276,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7164,44,5176,DX1,29,29.191759,NO,RURAL,Stable,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0
4,1259,30,1101,DX5,51,20.844146,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


#### Feature Generation - Test Data (as for Training Data)

In [None]:
# One-hot encode the drugs of the test data: each space-separated token in
# 'Treated_with_drugs' becomes its own indicator column
drugs = PP_test_df.Treated_with_drugs.str.get_dummies(sep=' ')
drugs.head(5)

Unnamed: 0,DX1,DX2,DX3,DX4,DX5,DX6
0,0,0,0,0,1,0
1,1,0,0,0,0,0
2,0,0,0,0,0,1
3,1,0,0,0,0,0
4,0,0,0,0,1,0


In [None]:
# Append the drug indicator columns to the test data and remove the original
# 'Treated_with_drugs' column, which they replace
PP_test_df = (
    pd.concat([PP_test_df, drugs], axis='columns')
      .drop(columns='Treated_with_drugs')
)

PP_test_df.head(5)

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,DX1,DX2,DX3,DX4,DX5,DX6
0,24206,35,4640,65,20.710365,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,1,0
1,32827,30,3214,2,24.250219,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1,0,0,0,0,0
2,3694,46,3564,1,27.139276,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,1
3,7164,44,5176,29,29.191759,NO,RURAL,Stable,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,1,0,0,0,0,0
4,1259,30,1101,51,20.844146,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,1,0


In [None]:
# Frequency of each smoker category in the test data
PP_test_df['Patient_Smoker'].value_counts()

NO     5373
YES    3957
Name: Patient_Smoker, dtype: int64

This data does not have the value 'UNKNOWN' in the 'Patient_Smoker' column.

#### Data Encoding - Test Data (as for Training Data)

Convert the categorical features to numerical data using get_dummies() function of pandas (i.e. one hot encoding).

In [None]:
# One-hot encode the same two categorical features as for the training data
PP_test_df = pd.get_dummies(
    data=PP_test_df,
    columns=[
        'Patient_Smoker',
        'Patient_Rural_Urban',
    ],
)

In [None]:
# Summarise column dtypes and non-null counts of the encoded test data
PP_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9330 entries, 0 to 9329
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9330 non-null   int64  
 1   Diagnosed_Condition        9330 non-null   int64  
 2   Patient_ID                 9330 non-null   int64  
 3   Patient_Age                9330 non-null   int64  
 4   Patient_Body_Mass_Index    9330 non-null   float64
 5   Patient_mental_condition   9330 non-null   object 
 6   A                          9330 non-null   float64
 7   B                          9330 non-null   float64
 8   C                          9330 non-null   float64
 9   D                          9330 non-null   float64
 10  E                          9330 non-null   float64
 11  F                          9330 non-null   float64
 12  Z                          9330 non-null   float64
 13  Number_of_prev_cond        9330 non-null   float

In [None]:
# Remove 'Patient_mental_condition' from the Test data (the Training data
# shown earlier has no such feature), then summarise the remaining columns
PP_test_df = PP_test_df.drop(columns='Patient_mental_condition')
PP_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9330 entries, 0 to 9329
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9330 non-null   int64  
 1   Diagnosed_Condition        9330 non-null   int64  
 2   Patient_ID                 9330 non-null   int64  
 3   Patient_Age                9330 non-null   int64  
 4   Patient_Body_Mass_Index    9330 non-null   float64
 5   A                          9330 non-null   float64
 6   B                          9330 non-null   float64
 7   C                          9330 non-null   float64
 8   D                          9330 non-null   float64
 9   E                          9330 non-null   float64
 10  F                          9330 non-null   float64
 11  Z                          9330 non-null   float64
 12  Number_of_prev_cond        9330 non-null   float64
 13  DX1                        9330 non-null   int64

There are no missing values now and all the data are of numerical types.

In [None]:
# Re-check column dtypes and non-null counts of the test data
PP_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9330 entries, 0 to 9329
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9330 non-null   int64  
 1   Diagnosed_Condition        9330 non-null   int64  
 2   Patient_ID                 9330 non-null   int64  
 3   Patient_Age                9330 non-null   int64  
 4   Patient_Body_Mass_Index    9330 non-null   float64
 5   A                          9330 non-null   float64
 6   B                          9330 non-null   float64
 7   C                          9330 non-null   float64
 8   D                          9330 non-null   float64
 9   E                          9330 non-null   float64
 10  F                          9330 non-null   float64
 11  Z                          9330 non-null   float64
 12  Number_of_prev_cond        9330 non-null   float64
 13  DX1                        9330 non-null   int64

In [None]:
# Add the dummy columns that were created during Training Data encoding but
# are absent from the Test data. The Test data contains none of the values
# that produce them, so every row gets 0.
# The membership check makes this cell safe to re-run (DataFrame.insert raises
# if the column already exists). The position of the new columns does not
# matter because the following reorder step selects the columns by name.
for missing_feature in ('Patient_Smoker_0', '0'):
    if missing_feature not in PP_test_df.columns:
        PP_test_df[missing_feature] = 0

In [None]:
# Reorder Test data columns to match Training data.
# The order is taken from the Training dataframe itself (minus the target,
# which the Test data does not have) rather than from a hand-typed list, so
# the two dataframes cannot drift apart - e.g. in the relative position of
# 'Patient_Smoker_0' and 'Patient_Smoker_NO'.
train_feature_order = [col for col in PP_train_df.columns if col != 'Survived_1_year']
PP_test_df = PP_test_df[train_feature_order]

PP_test_df.columns

Index(['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
       'Patient_Age', 'Patient_Body_Mass_Index', 'A', 'B', 'C', 'D', 'E', 'F',
       'Z', 'Number_of_prev_cond', '0', 'DX1', 'DX2', 'DX3', 'DX4', 'DX5',
       'DX6', 'Patient_Smoker_NO', 'Patient_Smoker_0', 'Patient_Smoker_YES',
       'Patient_Rural_Urban_RURAL', 'Patient_Rural_Urban_URBAN'],
      dtype='object')

Compare Dataframe format for Training and Test dataframes

In [None]:
# Training data summary, for comparison with the Test data summary below
PP_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23097 entries, 0 to 23096
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  23097 non-null  int64  
 1   Diagnosed_Condition        23097 non-null  int64  
 2   Patient_ID                 23097 non-null  int64  
 3   Patient_Age                23097 non-null  float64
 4   Patient_Body_Mass_Index    23097 non-null  float64
 5   A                          23097 non-null  float64
 6   B                          23097 non-null  float64
 7   C                          23097 non-null  float64
 8   D                          23097 non-null  float64
 9   E                          23097 non-null  float64
 10  F                          23097 non-null  float64
 11  Z                          23097 non-null  float64
 12  Number_of_prev_cond        23097 non-null  float64
 13  Survived_1_year            23097 non-null  int

In [None]:
# Test data summary, for comparison with the Training data summary above
PP_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9330 entries, 0 to 9329
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9330 non-null   int64  
 1   Diagnosed_Condition        9330 non-null   int64  
 2   Patient_ID                 9330 non-null   int64  
 3   Patient_Age                9330 non-null   int64  
 4   Patient_Body_Mass_Index    9330 non-null   float64
 5   A                          9330 non-null   float64
 6   B                          9330 non-null   float64
 7   C                          9330 non-null   float64
 8   D                          9330 non-null   float64
 9   E                          9330 non-null   float64
 10  F                          9330 non-null   float64
 11  Z                          9330 non-null   float64
 12  Number_of_prev_cond        9330 non-null   float64
 13  0                          9330 non-null   int64

In [None]:
# Cast Test data: 'Patient_Age' to float64 (as in the Training data) and
# 'Patient_Smoker_0' to uint8
test_dtype_fixes = {'Patient_Age': 'float64', 'Patient_Smoker_0': 'uint8'}
PP_test_df = PP_test_df.astype(test_dtype_fixes)

# show the dtype of every column after the cast
PP_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9330 entries, 0 to 9329
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID_Patient_Care_Situation  9330 non-null   int64  
 1   Diagnosed_Condition        9330 non-null   int64  
 2   Patient_ID                 9330 non-null   int64  
 3   Patient_Age                9330 non-null   float64
 4   Patient_Body_Mass_Index    9330 non-null   float64
 5   A                          9330 non-null   float64
 6   B                          9330 non-null   float64
 7   C                          9330 non-null   float64
 8   D                          9330 non-null   float64
 9   E                          9330 non-null   float64
 10  F                          9330 non-null   float64
 11  Z                          9330 non-null   float64
 12  Number_of_prev_cond        9330 non-null   float64
 13  0                          9330 non-null   int64

Compare summary of Training data to Test Data

In [None]:
# Number of missing values per column in the Training data
PP_train_df.isnull().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Patient_Age                  0
Patient_Body_Mass_Index      0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
Survived_1_year              0
0                            0
DX1                          0
DX2                          0
DX3                          0
DX4                          0
DX5                          0
DX6                          0
Patient_Smoker_0             0
Patient_Smoker_NO            0
Patient_Smoker_YES           0
Patient_Rural_Urban_RURAL    0
Patient_Rural_Urban_URBAN    0
dtype: int64

In [None]:
# Number of missing values per column in the Test data
PP_test_df.isnull().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Patient_Age                  0
Patient_Body_Mass_Index      0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
0                            0
DX1                          0
DX2                          0
DX3                          0
DX4                          0
DX5                          0
DX6                          0
Patient_Smoker_NO            0
Patient_Smoker_0             0
Patient_Smoker_YES           0
Patient_Rural_Urban_RURAL    0
Patient_Rural_Urban_URBAN    0
dtype: int64

In [None]:
# Number of distinct values per column in the Training data
PP_train_df.nunique()

ID_Patient_Care_Situation    23097
Diagnosed_Condition             53
Patient_ID                   10599
Patient_Age                     68
Patient_Body_Mass_Index      10599
A                                2
B                                2
C                                2
D                                2
E                                2
F                                2
Z                                2
Number_of_prev_cond              6
Survived_1_year                  2
0                                2
DX1                              2
DX2                              2
DX3                              2
DX4                              2
DX5                              2
DX6                              2
Patient_Smoker_0                 2
Patient_Smoker_NO                2
Patient_Smoker_YES               2
Patient_Rural_Urban_RURAL        2
Patient_Rural_Urban_URBAN        2
dtype: int64

In [None]:
# Number of distinct values per column in the Test data
PP_test_df.nunique()

ID_Patient_Care_Situation    9330
Diagnosed_Condition            52
Patient_ID                   6486
Patient_Age                    67
Patient_Body_Mass_Index      6486
A                               2
B                               2
C                               2
D                               2
E                               2
F                               2
Z                               1
Number_of_prev_cond             5
0                               1
DX1                             2
DX2                             2
DX3                             2
DX4                             2
DX5                             2
DX6                             2
Patient_Smoker_NO               2
Patient_Smoker_0                1
Patient_Smoker_YES              2
Patient_Rural_Urban_RURAL       2
Patient_Rural_Urban_URBAN       2
dtype: int64

Data Normalisation - Test Data

In [None]:

# Min-max normalisation of the Test data.
# The scaler is fitted on the Training features only and then applied to the
# Test data, so both sets share the same scale. Fitting on the Test data would
# take the min/max from the very data being evaluated. Test values outside the
# Training range will consequently fall outside [0, 1].
feature_columns = list(PP_test_df.columns)
scaler = MinMaxScaler().fit(PP_train_df[feature_columns])
# transform data
PP_test_df_norm = pd.DataFrame(scaler.transform(PP_test_df),
                               columns=feature_columns,
                               index=PP_test_df.index)

PP_test_df_norm


Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,0,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_NO,Patient_Smoker_0,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,0.733883,0.666667,0.371079,0.984848,0.285403,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.995300,0.568627,0.256962,0.030303,0.557716,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.111893,0.882353,0.284971,0.015152,0.779964,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.217114,0.843137,0.413972,0.439394,0.937856,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.038056,0.568627,0.087868,0.772727,0.295695,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,0.040512,0.568627,0.695102,0.666667,0.240706,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
9326,0.074262,0.607843,0.857234,0.227273,0.242747,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
9327,0.691067,0.803922,0.300816,0.181818,0.966954,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
9328,0.075475,0.666667,0.368358,0.333333,0.053958,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


Data Standardisation - Test Data

In [None]:

# Standardisation of the Test data.
# The scaler learns mean and standard deviation from the Training features
# only and is then applied to the Test data, so both sets are standardised
# with the same parameters instead of parameters taken from the Test data.
scaler = StandardScaler().fit(PP_train_df[list(PP_test_df.columns)])
# transform data (result is a NumPy array, as before)
PP_test_df_stand = scaler.transform(PP_test_df)

PP_test_df_stand


array([[ 0.81735299,  0.55390846, -0.44953511, ..., -0.85817252,
         0.65198174, -0.65198174],
       [ 1.72522519,  0.22059022, -0.84442585, ..., -0.85817252,
        -1.53378529,  1.53378529],
       [-1.34275311,  1.28720859, -0.74750315, ..., -0.85817252,
        -1.53378529,  1.53378529],
       ...,
       [ 0.66865614,  1.020554  , -0.6926726 , ..., -0.85817252,
         0.65198174, -0.65198174],
       [-1.46922968,  0.55390846, -0.45895045, ...,  1.16526686,
         0.65198174, -0.65198174],
       [ 1.11548386,  1.08721764,  0.07329359, ..., -0.85817252,
        -1.53378529,  1.53378529]])

Data Normalisation - Test Data

In [None]:
# Display the unscaled Test data
PP_test_df

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,0,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_NO,Patient_Smoker_0,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,24206,35,4640,65.0,20.710365,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,1,0,1,0,0,1,0
1,32827,30,3214,2.0,24.250219,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,1,0,0,0,0,0,1,0,0,0,1
2,3694,46,3564,1.0,27.139276,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,0,1
3,7164,44,5176,29.0,29.191759,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0,1,0,0,0,0,0,1,0,0,1,0
4,1259,30,1101,51.0,20.844146,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,1340,30,8689,44.0,20.129337,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,1,0
9326,2453,32,10715,15.0,20.155865,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,0,0,1,1,0
9327,22794,42,3762,12.0,29.570005,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,1,0
9328,2493,35,4606,22.0,17.701751,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,1,0,0,1,1,0


In [None]:

# Min-max normalisation of the Test data.
# The scaler is fitted on the Training features only and then applied to the
# Test data, so both sets share the same scale. Fitting on the Test data would
# take the min/max from the very data being evaluated. Test values outside the
# Training range will consequently fall outside [0, 1].
feature_columns = list(PP_test_df.columns)
scaler = MinMaxScaler().fit(PP_train_df[feature_columns])
# transform data
PP_test_df_norm = pd.DataFrame(scaler.transform(PP_test_df),
                               columns=feature_columns,
                               index=PP_test_df.index)

PP_test_df_norm


Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,0,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_NO,Patient_Smoker_0,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,0.733883,0.666667,0.371079,0.984848,0.285403,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.995300,0.568627,0.256962,0.030303,0.557716,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.111893,0.882353,0.284971,0.015152,0.779964,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.217114,0.843137,0.413972,0.439394,0.937856,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.038056,0.568627,0.087868,0.772727,0.295695,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,0.040512,0.568627,0.695102,0.666667,0.240706,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
9326,0.074262,0.607843,0.857234,0.227273,0.242747,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
9327,0.691067,0.803922,0.300816,0.181818,0.966954,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
9328,0.075475,0.666667,0.368358,0.333333,0.053958,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


Data Standardisation - Test Data

In [None]:

# Standardisation of the Test data.
# The scaler learns mean and standard deviation from the Training features
# only and is then applied to the Test data, so both sets are standardised
# with the same parameters instead of parameters taken from the Test data.
scaler = StandardScaler().fit(PP_train_df[list(PP_test_df.columns)])
# transform data (result is a NumPy array, as before)
PP_test_df_stand = scaler.transform(PP_test_df)

PP_test_df_stand


array([[ 0.81735299,  0.55390846, -0.44953511, ..., -0.85817252,
         0.65198174, -0.65198174],
       [ 1.72522519,  0.22059022, -0.84442585, ..., -0.85817252,
        -1.53378529,  1.53378529],
       [-1.34275311,  1.28720859, -0.74750315, ..., -0.85817252,
        -1.53378529,  1.53378529],
       ...,
       [ 0.66865614,  1.020554  , -0.6926726 , ..., -0.85817252,
         0.65198174, -0.65198174],
       [-1.46922968,  0.55390846, -0.45895045, ...,  1.16526686,
         0.65198174, -0.65198174],
       [ 1.11548386,  1.08721764,  0.07329359, ..., -0.85817252,
        -1.53378529,  1.53378529]])

In [None]:
# Display the unscaled Test data (the scaling cells above do not modify it)
PP_test_df


Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Patient_Age,Patient_Body_Mass_Index,A,B,C,D,E,F,Z,Number_of_prev_cond,0,DX1,DX2,DX3,DX4,DX5,DX6,Patient_Smoker_NO,Patient_Smoker_0,Patient_Smoker_YES,Patient_Rural_Urban_RURAL,Patient_Rural_Urban_URBAN
0,24206,35,4640,65.0,20.710365,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,0,0,0,0,1,0,1,0,0,1,0
1,32827,30,3214,2.0,24.250219,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0,1,0,0,0,0,0,1,0,0,0,1
2,3694,46,3564,1.0,27.139276,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,0,1
3,7164,44,5176,29.0,29.191759,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0,1,0,0,0,0,0,1,0,0,1,0
4,1259,30,1101,51.0,20.844146,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,1340,30,8689,44.0,20.129337,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,1,0
9326,2453,32,10715,15.0,20.155865,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,0,0,1,1,0
9327,22794,42,3762,12.0,29.570005,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,1,1,0,0,1,0
9328,2493,35,4606,22.0,17.701751,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,1,0,0,1,1,0


In [None]:
# Column names and order of the Test data
PP_test_df.columns

Index(['ID_Patient_Care_Situation', 'Diagnosed_Condition', 'Patient_ID',
       'Patient_Age', 'Patient_Body_Mass_Index', 'A', 'B', 'C', 'D', 'E', 'F',
       'Z', 'Number_of_prev_cond', '0', 'DX1', 'DX2', 'DX3', 'DX4', 'DX5',
       'DX6', 'Patient_Smoker_NO', 'Patient_Smoker_0', 'Patient_Smoker_YES',
       'Patient_Rural_Urban_RURAL', 'Patient_Rural_Urban_URBAN'],
      dtype='object')

In [None]:
# Number of distinct values in the 'Patient_Smoker_0' dummy of the Training data
PP_train_df['Patient_Smoker_0'].nunique()

2

In [None]:
# Number of Training records for each value of the 'Patient_Smoker_0' dummy
PP_train_df.groupby('Patient_Smoker_0').size()

Patient_Smoker_0
0    23089
1        8
dtype: int64

### **Save Data**

In [None]:
# Pickle the preprocessed Training data to Google Drive
PP_train_df.to_pickle('/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/train_data_4_model.pkl')

# Pickle the preprocessed Testing data (the unscaled PP_test_df, not the
# normalised/standardised versions computed above)
PP_test_df.to_pickle('/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/new_test_data_4_model.pkl')