# Heart Patient Prognosis

Data Scientist:   __Gail Wittich__\
Email:      gwittich@optusnet.com.au \
Website:    www.linkedin.com/in/gail-wittich \
Copyright:  Copyright 2020, Gail Wittich 

### **Load Packages**

In [None]:
from google.colab import drive                         # for accessing files
import numpy as np                                     # for numeric computations
import pandas as pd                                    # for data analysis
import pickle                                          # for file reading and saving

import warnings                                        # to ignore warnings
warnings.filterwarnings('ignore')

### **Load Data**

In [None]:
# Mount Google Drive at /content/drive; the CSV files loaded below and the
# pickle files saved at the end of the notebook all live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read the training set from the mounted Google Drive folder
training_df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/Training_set_advc.csv'
)

# Alternative source on GitHub:
# training_df = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Training_set_advc.csv')

In [None]:
# Read the new test set from the mounted Google Drive folder
new_test_df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/Testing_set_advc.csv'
)

# Alternative source on GitHub:
# new_test_df = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/pharma_data/Testing_set_advc.csv")

### **Data Preparation - Training Data**

In [None]:
# Summarise the training data: column names, non-null counts and dtypes.
# info must be *called*; the bare attribute only displays the bound method.
training_df.info()

<bound method DataFrame.info of        ID_Patient_Care_Situation  ...  Survived_1_year
0                          16201  ...                1
1                           9421  ...                0
2                          16205  ...                1
3                           5582  ...                0
4                          20880  ...                1
...                          ...  ...              ...
25074                       3233  ...                1
25075                      31394  ...                0
25076                      28315  ...                1
25077                       1338  ...                1
25078                      16072  ...                1

[25079 rows x 18 columns]>

In [None]:
# Look at the first 5 rows of the training data: feature names, data types and values
training_df.head(5)

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,16201,47,8433,DX2,60,21.655523,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
1,9421,3,2972,DX6,2,28.852743,NO,RURAL,Stable,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0
2,16205,7,8608,Dx6,20,26.179725,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
3,5582,31,10074,dx6,8,22.638945,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
4,20880,43,7462,dx1,53,21.326131,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


The feature 'Treated_with_drugs' has multiple values representing the same drug, e.g. DX6, Dx6 and dx6.

**Check for duplicate data**

In [None]:
# Count rows that repeat an earlier row in every column
training_df.duplicated(subset=None, keep='first').sum()

1982

In [None]:
# Rows whose (ID_Patient_Care_Situation, Patient_ID) pair already appeared earlier
duplicate = training_df.loc[
    training_df.duplicated(subset=['ID_Patient_Care_Situation', 'Patient_ID'])
]
print("Duplicate Rows :")
duplicate

Duplicate Rows :


Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
23097,30896,18,10023,DX6,16,29.917028,NO,URBAN,Stable,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1
23098,10604,29,7392,Dx6,1,27.302076,NO,URBAN,Stable,,,,,,,,,0
23099,25016,49,8356,DX5,13,22.847150,NO,RURAL,Stable,1.0,1.0,0.0,1.0,0.0,0.0,0.0,3.0,1
23100,8412,2,5650,Dx6,18,28.955220,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
23101,754,13,7831,DX3 DX5,0,28.214876,NO,RURAL,Stable,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25074,3233,2,2491,Dx3 dx4,33,20.825297,NO,RURAL,Stable,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,1
25075,31394,21,2806,DX2,60,22.679810,YES,RURAL,Stable,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0
25076,28315,18,11733,DX4 DX5,51,19.952023,YES,RURAL,Stable,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
25077,1338,4,8577,dx5,21,20.681303,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats
training_df = training_df.drop_duplicates(keep='first')

#### Check, correct and confirm Treated_with_drugs values

In [None]:
# Frequency of each raw 'Treated_with_drugs' value
training_df['Treated_with_drugs'].value_counts()

DX6                 4446
DX6                 2199
DX5                 1038
DX2                 1008
DX4                  968
                    ... 
dx1 dx3 dx4 dx5        1
dx1 dx2 dx3 dx5        1
Dx1 dx2 dx5            1
Dx1 dx2 dx3 dx4        1
Dx3 dx4 dx5            1
Name: Treated_with_drugs, Length: 182, dtype: int64

Convert all the values in the feature 'Treated_with_drugs' to upper case.

In [None]:
# Upper-case every drug label so case variants such as 'DX6', 'Dx6' and 'dx6' become one value
training_df['Treated_with_drugs'] = training_df['Treated_with_drugs'].str.upper()

In [None]:
# Frequency of each 'Treated_with_drugs' value after upper-casing
training_df['Treated_with_drugs'].value_counts()

DX6                     5807
DX6                     2830
DX5                     1318
DX2                     1282
DX4                     1260
                        ... 
DX2 DX3 DX4 DX5            6
DX1 DX2 DX4 DX5            5
DX1 DX2 DX3 DX4 DX5        2
DX1 DX2 DX3 DX4 DX5        1
DX1 DX2 DX3 DX5            1
Name: Treated_with_drugs, Length: 64, dtype: int64

Remove surrounding whitespace from drug types to eliminate multiple entries of the same drug, e.g.
- 'DX6', count: 5807
- 'DX6__', count: 2830 (i.e. with trailing spaces)

In [None]:
# Strip leading/trailing whitespace so entries like 'DX6 ' collapse into 'DX6'
training_df['Treated_with_drugs'] = training_df['Treated_with_drugs'].str.strip()

In [None]:
# check that DX6 now has count 8637 (= 5807 + 2830) and there are no other 'duplicate' entries
training_df['Treated_with_drugs'].value_counts()

DX6                    8637
DX5                    1921
DX2                    1901
DX4                    1857
DX1                    1828
DX3                    1808
DX1 DX2                 424
DX2 DX4                 420
DX2 DX3                 420
DX3 DX4                 415
DX3 DX5                 414
DX4 DX5                 409
DX2 DX5                 406
DX1 DX5                 406
DX1 DX4                 398
DX1 DX3                 397
DX1 DX2 DX4             100
DX1 DX3 DX4             100
DX1 DX2 DX5              98
DX2 DX3 DX5              96
DX3 DX4 DX5              96
DX2 DX3 DX4              95
DX1 DX3 DX5              91
DX1 DX4 DX5              87
DX1 DX2 DX3              86
DX2 DX4 DX5              83
DX1 DX2 DX3 DX4          23
DX1 DX3 DX4 DX5          21
DX1 DX2 DX4 DX5          19
DX2 DX3 DX4 DX5          18
DX1 DX2 DX3 DX5          12
DX1 DX2 DX3 DX4 DX5       3
Name: Treated_with_drugs, dtype: int64

#### Check, correct and confirm Patient_Smoker values

In [None]:
# Frequency of each raw 'Patient_Smoker' value
training_df['Patient_Smoker'].value_counts()

NO             9653
YES            7249
NO             3561
YES            2509
YESS             92
YESS             25
Cannot say        4
Cannot say        2
CANNOT SAY        1
CANNOT SAY        1
Name: Patient_Smoker, dtype: int64

There should only be 3 different values - NO, YES, and UNKNOWN

In [None]:
# clean up 'Patient_Smoker' values
def smoker(r):
  """Normalise one raw 'Patient_Smoker' entry to 'NO', 'YES' or 'UNKNOWN'.

  The comparison ignores letter case and surrounding whitespace, so
  variants such as 'NO ', 'no' or ' Yes ' are recognised as well as the
  exact spellings. 'YESS' is treated as a misspelling of 'YES'.

  Parameters
  ----------
  r : object
      Raw cell value; normally a string.

  Returns
  -------
  str
      'NO' or 'YES' for recognised answers. 'UNKNOWN' for anything else,
      including non-string values such as NaN or None.
  """
  # Non-string cells (e.g. NaN for a missing entry) have no answer to parse
  if not isinstance(r, str):
    return 'UNKNOWN'

  answer = r.strip().upper()
  if answer == 'NO':
    return 'NO'
  if answer in ('YES', 'YESS'):
    return 'YES'
  return 'UNKNOWN'

# Run smoker() over every entry of the 'Patient_Smoker' feature
training_df['Patient_Smoker'] = training_df['Patient_Smoker'].apply(smoker)

In [None]:
# Frequency of each 'Patient_Smoker' value after clean-up
training_df['Patient_Smoker'].value_counts()

NO         13214
YES         9875
UNKNOWN        8
Name: Patient_Smoker, dtype: int64

#### Check, correct and confirm Patient_Rural_Urban values

In [None]:
# Frequency of each 'Patient_Rural_Urban' value
training_df['Patient_Rural_Urban'].value_counts()

RURAL    16116
URBAN     6981
Name: Patient_Rural_Urban, dtype: int64

#### Check Patient_mental_condition values

In [None]:
# Frequency of each 'Patient_mental_condition' value
training_df['Patient_mental_condition'].value_counts()

Stable    23097
Name: Patient_mental_condition, dtype: int64

As there is only one value in the 'Patient_mental_condition' feature, i.e. all patients are stable, this feature could be removed from the data set because it does not provide any insight and won't affect the predictions. However, this limits the value of the model for future data that may have patients that are not 'Stable'.

In [None]:
# Remove the 'Patient_mental_condition' column in place, then show the first rows
training_df.drop(columns=['Patient_mental_condition'], inplace=True)
training_df.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,A,B,C,D,E,F,Z,Number_of_prev_cond,Survived_1_year
0,16201,47,8433,DX2,60,21.655523,NO,URBAN,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
1,9421,3,2972,DX6,2,28.852743,NO,RURAL,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0
2,16205,7,8608,DX6,20,26.179725,NO,RURAL,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1
3,5582,31,10074,DX6,8,22.638945,NO,RURAL,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0
4,20880,43,7462,DX1,53,21.326131,NO,RURAL,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


### **Data Preparation - New Test Data (as for Training Data)**

In [None]:
# Look at the first 5 rows of the new test data: feature names, data types and values
new_test_df.head(5)

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,24206,35,4640,DX5,65,20.710365,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,32827,30,3214,dx1,2,24.250219,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
2,3694,46,3564,dx6,1,27.139276,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7164,44,5176,DX1,29,29.191759,NO,RURAL,Stable,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0
4,1259,30,1101,DX5,51,20.844146,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


The New Test data has the same format as the Training data originally had, except for the Target feature, 'Survived_1_year', as expected.

In [None]:
# Frequency of each raw 'Treated_with_drugs' value in the new test data
new_test_df['Treated_with_drugs'].value_counts()

DX6                1856
DX6                 789
DX1                 416
DX5                 407
DX2                 405
                   ... 
Dx1 dx3 dx5           1
Dx1 dx2 dx3 dx5       1
Dx2 dx4 dx5           1
Dx2 dx3 dx4           1
Dx1 dx4 dx5           1
Name: Treated_with_drugs, Length: 173, dtype: int64

Convert all the values in the feature - 'Treated_with_drugs' to upper case

In [None]:
# Upper-case every drug label so case variants such as 'DX1' and 'dx1' become one value
new_test_df['Treated_with_drugs'] = new_test_df['Treated_with_drugs'].str.upper()

In [None]:
# Frequency of each 'Treated_with_drugs' value after upper-casing
new_test_df['Treated_with_drugs'].value_counts()

DX6                     2378
DX6                     1064
DX3                      532
DX1                      525
DX2                      516
                        ... 
DX2 DX3 DX4 DX5            2
DX1 DX2 DX3 DX4            2
DX1 DX2 DX3 DX4 DX5        1
DX1 DX2 DX4 DX5            1
DX1 DX2 DX3 DX4 DX5        1
Name: Treated_with_drugs, Length: 64, dtype: int64

Remove surrounding whitespace from drug types to eliminate multiple entries of the same drug, e.g.

- 'DX6', count: 2378
- 'DX6__', count: 1064 (i.e. with trailing spaces)

In [None]:
# Strip leading/trailing whitespace so entries like 'DX6 ' collapse into 'DX6'
new_test_df['Treated_with_drugs'] = new_test_df['Treated_with_drugs'].str.strip()

In [None]:
# check that DX6 now has count 3442 (= 2378 + 1064) and there are no other 'duplicate' entries
new_test_df['Treated_with_drugs'].value_counts()

DX6                    3442
DX3                     773
DX1                     766
DX5                     758
DX2                     752
DX4                     730
DX3 DX4                 184
DX1 DX2                 183
DX2 DX4                 179
DX4 DX5                 178
DX1 DX3                 175
DX1 DX4                 164
DX2 DX5                 159
DX2 DX3                 159
DX1 DX5                 159
DX3 DX5                 156
DX1 DX2 DX5              47
DX1 DX2 DX3              45
DX3 DX4 DX5              37
DX2 DX4 DX5              36
DX1 DX2 DX4              35
DX2 DX3 DX4              35
DX2 DX3 DX5              34
DX1 DX3 DX4              33
DX1 DX3 DX5              33
DX1 DX4 DX5              29
DX1 DX3 DX4 DX5          13
DX2 DX3 DX4 DX5          12
DX1 DX2 DX3 DX4           8
DX1 DX2 DX3 DX5           8
DX1 DX2 DX4 DX5           6
DX1 DX2 DX3 DX4 DX5       2
Name: Treated_with_drugs, dtype: int64

In [None]:
# Frequency of each raw 'Patient_Smoker' value in the new test data
new_test_df['Patient_Smoker'].value_counts()

NO       3949
YES      2879
NO       1424
YES      1027
YESS       36
YESS       15
Name: Patient_Smoker, dtype: int64

There should only be 3 different values - NO, YES, and UNKNOWN

In [None]:
# Reuse the smoker() clean-up function on the new test data's 'Patient_Smoker' feature
new_test_df['Patient_Smoker'] = new_test_df['Patient_Smoker'].apply(smoker)

In [None]:
# Frequency of each 'Patient_Smoker' value after clean-up
new_test_df['Patient_Smoker'].value_counts()

NO     5373
YES    3957
Name: Patient_Smoker, dtype: int64

In [None]:
# Frequency of each 'Patient_Rural_Urban' value in the new test data
new_test_df['Patient_Rural_Urban'].value_counts()

RURAL    6547
URBAN    2783
Name: Patient_Rural_Urban, dtype: int64

In [None]:
# Frequency of each 'Patient_mental_condition' value in the new test data
new_test_df['Patient_mental_condition'].value_counts()

Stable    9330
Name: Patient_mental_condition, dtype: int64

Testing_data records ALL have the same value for 'Patient_mental_condition', i.e. 'Stable'. This confirms that the feature adds no value at this time.
Drop 'Patient_mental_condition' because it was dropped from the Training Data.

In [None]:
# Drop 'Patient_mental_condition' from the new test data, exactly as was done
# for the training data, so both data sets share the same feature columns.
new_test_df.drop(['Patient_mental_condition'], axis=1, inplace=True)
new_test_df.head()

Unnamed: 0,ID_Patient_Care_Situation,Diagnosed_Condition,Patient_ID,Treated_with_drugs,Patient_Age,Patient_Body_Mass_Index,Patient_Smoker,Patient_Rural_Urban,Patient_mental_condition,A,B,C,D,E,F,Z,Number_of_prev_cond
0,24206,35,4640,DX5,65,20.710365,NO,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
1,32827,30,3214,DX1,2,24.250219,NO,URBAN,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
2,3694,46,3564,DX6,1,27.139276,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7164,44,5176,DX1,29,29.191759,NO,RURAL,Stable,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0
4,1259,30,1101,DX5,51,20.844146,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


The New Test data is now the same format as the Training data after Data Preparation, except for the Target feature, 'Survived_1_year', as required.

### Checking missing values

In [None]:
# Number of missing values in each column of the new test data
new_test_df.isna().sum()

ID_Patient_Care_Situation    0
Diagnosed_Condition          0
Patient_ID                   0
Treated_with_drugs           0
Patient_Age                  0
Patient_Body_Mass_Index      0
Patient_Smoker               0
Patient_Rural_Urban          0
Patient_mental_condition     0
A                            0
B                            0
C                            0
D                            0
E                            0
F                            0
Z                            0
Number_of_prev_cond          0
dtype: int64

New test data has no missing values. This data will be good for testing.

### **Save Data**

In [None]:
# Save the prepared training data as a pickle file on Google Drive
training_df.to_pickle(
    path='/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/train_data_4_EDA.pkl'
)

# Save the prepared new test data as a pickle file on Google Drive
new_test_df.to_pickle(
    path='/content/drive/My Drive/Colab Notebooks/ML Bootcamp/Heart_Patient/Data/test_data_4_EDA.pkl'
)