# Imports & class distribution check

In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
# Load the training table from a CSV file located next to the notebook.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which usually means the CSV was
# written with its index; index_col=0 would drop it — confirm nothing relies on that column.
training_data = pd.read_csv(filepath_or_buffer='training_data.csv')

In [3]:
# Preview the first five rows to inspect the available columns
training_data.head(n=5)

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,


## Check initial class balance

In [4]:
# Count observations per auditor label in 'Murmur' (missing labels are not counted)
training_data['Murmur'].value_counts(dropna=True)

Absent     695
Present    179
Unknown     68
Name: Murmur, dtype: int64

In [5]:
# Count observations per checkup result in 'Outcome' (missing values are not counted)
training_data['Outcome'].value_counts(dropna=True)

Normal      486
Abnormal    456
Name: Outcome, dtype: int64

In [7]:
# Number of observations whose 'Murmur' label is anything other than 'Unknown'
n_observations_without_unknown = int((training_data['Murmur'] != 'Unknown').sum())
n_observations_without_unknown

874

In [6]:
# Total number of observations (rows) in the training table
n_observations = training_data.shape[0]

# Baseline model creation

## Definition & Result overview

This dataset gathers information about the heart condition of Brazilian patients. As part of a heart disease prevention campaign, heart recordings were taken and a full checkup of each patient was performed. Based only on the sound recordings, auditors (specialised doctors with extensive experience in the field) categorized each patient's murmur (column `Murmur`) as Absent, Present or Unknown.

For each of those patients, a full checkup (column `Outcome`) confirmed or refuted the presence of a murmur; this is what we consider the ground truth.

Therefore, our baseline is the auditor's ability to correctly classify a heart recording.

NB: For 68 observations, the auditor entered an "Unknown" diagnosis. In the following sections we handle this class in different ways in order to get the best evaluation of the auditor's performance.

<span style='color:blue'>  Baseline ignoring the "Unknown" observations: recall = 36% </span>

<span style='color:blue'>  Baseline treating "Unknown" as "Present": recall = 42% </span>

<span style='color:blue'>  Baseline treating "Unknown" as "Absent": recall = 33% </span>

## Baseline without taking Unknown into consideration

In [8]:
# Confusion counts of the auditor's 'Murmur' label against the checkup 'Outcome'.
# Rows labelled 'Unknown' satisfy none of the four conditions, so they are left out.

# Auditor says the murmur is present and the checkup outcome is abnormal
true_positives = int(
    ((training_data['Murmur'] == 'Present') & (training_data['Outcome'] == 'Abnormal')).sum()
)

# Auditor says the murmur is absent and the checkup outcome is normal
true_negatives = int(
    ((training_data['Murmur'] == 'Absent') & (training_data['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is present but the checkup outcome is normal
false_positives = int(
    ((training_data['Murmur'] == 'Present') & (training_data['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is absent but the checkup outcome is abnormal
false_negatives = int(
    ((training_data['Murmur'] == 'Absent') & (training_data['Outcome'] == 'Abnormal')).sum()
)

In [12]:
# Report the four confusion counts currently held in the module-level counters
print(
    ' Without taking into considerations the "Unknown diagnostics", we have: \n'
    f' n_true_positives = {true_positives}, n_true_negatives = {true_negatives} \n'
    f' n_false_positives = {false_positives}, n_false_negatives = {false_negatives}'
)

 Without taking into considerations the "Unknown diagnostics", we have: 
 n_true_positives = 150, n_true_negatives = 432 
 n_false_positives = 29, n_false_negatives = 263


In [13]:
# Recall in percent: true positives over all actual positives (TP + FN)
recall_no_unknown = (true_positives / (true_positives + false_negatives)) * 100
recall_no_unknown

36.31961259079903

<span style='color:blue'>  Baseline ignoring the "Unknown" observations: recall = 36% </span>

## Baseline if we handle Unknown as "Abnormal / Present"

In [15]:
# Independent copy, so relabelling 'Unknown' does not alter training_data
training_data_ua = training_data.copy(deep=True)

In [17]:
# Relabel every 'Unknown' entry of the Murmur column as 'Present'; other labels are kept
training_data_ua['Murmur'] = training_data_ua['Murmur'].mask(
    training_data_ua['Murmur'] == 'Unknown', 'Present'
)

In [20]:
# Confusion counts of the auditor's 'Murmur' label against the checkup 'Outcome',
# computed on training_data_ua (the copy where 'Unknown' is relabelled as 'Present').

# Auditor says the murmur is present and the checkup outcome is abnormal
true_positives = int(
    ((training_data_ua['Murmur'] == 'Present') & (training_data_ua['Outcome'] == 'Abnormal')).sum()
)

# Auditor says the murmur is absent and the checkup outcome is normal
true_negatives = int(
    ((training_data_ua['Murmur'] == 'Absent') & (training_data_ua['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is present but the checkup outcome is normal
false_positives = int(
    ((training_data_ua['Murmur'] == 'Present') & (training_data_ua['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is absent but the checkup outcome is abnormal
false_negatives = int(
    ((training_data_ua['Murmur'] == 'Absent') & (training_data_ua['Outcome'] == 'Abnormal')).sum()
)

In [33]:
# Report the four confusion counts currently held in the module-level counters.
# NOTE(review): these counter names are reassigned again in the "Unknown as Absent" section,
# so run this cell right after the counts for training_data_ua are computed.
print(
    ' Taking into consideration the "Unknown diagnostics" as "Present", we have: \n'
    f' n_true_positives = {true_positives}, n_true_negatives = {true_negatives} \n'
    f' n_false_positives = {false_positives}, n_false_negatives = {false_negatives}'
)

 Taking into consideration the "Unknown diagnostics" as "Present", we have: 
 n_true_positives = 193, n_true_negatives = 432 
 n_false_positives = 54, n_false_negatives = 263


In [21]:
# Recall in percent: true positives over all actual positives (TP + FN)
recall_unknown_as_abnormal = (true_positives / (true_positives + false_negatives)) * 100
recall_unknown_as_abnormal

42.32456140350877

<span style='color:blue'>  Baseline treating "Unknown" as "Present": recall = 42% </span>

## Baseline if we handle Unknown as "Normal / Absent"

In [22]:
# Independent copy, so relabelling 'Unknown' does not alter training_data
training_data_un = training_data.copy(deep=True)

In [34]:
# Relabel every 'Unknown' entry of the Murmur column as 'Absent'; other labels are kept
training_data_un['Murmur'] = training_data_un['Murmur'].mask(
    training_data_un['Murmur'] == 'Unknown', 'Absent'
)

# Display the label counts to check that no 'Unknown' entry remains
training_data_un['Murmur'].value_counts()

Absent     763
Present    179
Name: Murmur, dtype: int64

In [26]:
# Confusion counts of the auditor's 'Murmur' label against the checkup 'Outcome',
# computed on training_data_un (the copy where 'Unknown' is relabelled as 'Absent').

# Auditor says the murmur is present and the checkup outcome is abnormal
true_positives = int(
    ((training_data_un['Murmur'] == 'Present') & (training_data_un['Outcome'] == 'Abnormal')).sum()
)

# Auditor says the murmur is absent and the checkup outcome is normal
true_negatives = int(
    ((training_data_un['Murmur'] == 'Absent') & (training_data_un['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is present but the checkup outcome is normal
false_positives = int(
    ((training_data_un['Murmur'] == 'Present') & (training_data_un['Outcome'] == 'Normal')).sum()
)

# Auditor says the murmur is absent but the checkup outcome is abnormal
false_negatives = int(
    ((training_data_un['Murmur'] == 'Absent') & (training_data_un['Outcome'] == 'Abnormal')).sum()
)

In [27]:
# Recall in percent: true positives over all actual positives (TP + FN)
recall_unknown_as_normal = (true_positives / (true_positives + false_negatives)) * 100
recall_unknown_as_normal

32.89473684210527

<span style='color:blue'>  Baseline treating "Unknown" as "Absent": recall = 33% </span>