# Exploratory Data Analysis of PTB-XL

Exploratory analysis of the [PTB-XL dataset](https://physionet.org/content/ptb-xl/1.0.1/).

## Import

### Library import

In [None]:
import pandas as pd
import numpy as np
import wfdb
import ast
from tqdm import tqdm
import warnings; warnings.filterwarnings('ignore')
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Select the first available matplotlib style whose name contains 'whitegrid'
# (matched by substring rather than by exact style name).
whitegrid_styles = [name for name in plt.style.available if 'whitegrid' in name]
plt.style.use(whitegrid_styles[0])

# Default figure size (inches) and resolution for the plots below.
plt.rcParams.update({'figure.figsize': [16, 9], 'figure.dpi': 100})

### Data import 

In [None]:
def load_raw_data(df, sampling_rate, path):
    """Read the waveform of every record listed in *df* into one array.

    Args:
        df: Table exposing ``filename_lr`` and ``filename_hr`` columns that
            hold record names relative to *path*.
        sampling_rate: ``100`` selects the ``filename_lr`` records; any other
            value selects the ``filename_hr`` records.
        path: Prefix prepended to each record name by plain string
            concatenation.

    Returns:
        ``np.ndarray`` built from the signal part of each ``wfdb.rdsamp``
        result, one entry per row of *df*.
    """
    record_names = df.filename_lr if sampling_rate == 100 else df.filename_hr
    signals = []
    for record_name in tqdm(record_names):
        # rdsamp returns (signal, metadata); only the signal is kept.
        signal, _meta = wfdb.rdsamp(path + record_name)
        signals.append(signal)
    return np.array(signals)

# Dataset root and the sampling rate of the records to load.
# The root must end with a separator: file names are appended to it by plain
# string concatenation.
path = "data/ptbxl/"
sampling_rate = 100

# Load the annotation table, indexed by ECG record id.
ptbxl = pd.read_csv(path+'ptbxl_database.csv', index_col='ecg_id')
# scp_codes is stored as the text of a Python literal; parse it into the
# corresponding Python object (literal_eval only evaluates literals).
ptbxl['scp_codes'] = ptbxl['scp_codes'].apply(ast.literal_eval)

# Load raw signal data
raw = load_raw_data(ptbxl, sampling_rate, path)

print('data shape:', raw.shape)
print(ptbxl[['scp_codes']])
ptbxl.head()

In [None]:
# Display the names of all columns of the annotation table.
ptbxl.columns

In [None]:
# Read the SCP statement table and keep only the rows flagged as diagnostic;
# these rows drive the aggregation of SCP codes into diagnostic classes.
scp_statements = pd.read_csv(path+'scp_statements.csv', index_col=0)
agg_df = scp_statements.loc[scp_statements.diagnostic == 1]
print(agg_df.shape)
print(agg_df.columns)
agg_df.head()

In [None]:
# Each diagnostic superclass becomes one 0/1 indicator column on ptbxl.
new_columns = agg_df["diagnostic_class"].unique().tolist()
print(new_columns)

# Lookup table SCP code -> diagnostic superclass, built once instead of
# querying agg_df for every code of every record.
code_to_class = agg_df["diagnostic_class"].to_dict()

# For every record, the set of superclasses reached by its SCP codes.
# Codes that are not diagnostic statements are ignored; using a set counts a
# superclass once even when several codes of the record map to it.
superclasses = ptbxl["scp_codes"].apply(
    lambda codes: {code_to_class[code] for code in codes if code in code_to_class}
)

for el in new_columns:
    ptbxl[el] = superclasses.map(lambda found: int(el in found))
# Number of distinct superclasses attached to each record.
ptbxl['diagnostic_superclass_len'] = superclasses.map(len)
print(ptbxl.columns, "\n")

counts = ptbxl[new_columns].sum()
print(counts)

print("\nTotal:", counts.sum())
# NOTE(review): len(ptbxl) counts rows of a table indexed by ecg_id, i.e. ECG
# records; confirm that "patients" is the intended label.
print("\nTotal patients:", len(ptbxl))
# Ratio of superclass labels to rows; it can exceed 100% because one record
# may carry several superclasses.
print(f'{counts.sum()/len(ptbxl)*100:.2f}%')

- NORM: Normal ECG
- MI: Myocardial Infarction
- STTC: ST/T Change
- CD: Conduction Disturbance
- HYP: Hypertrophy

In [None]:
# Bar chart of the per-superclass totals held in `counts`.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values, palette='gray', ax=ax)

total = counts.sum()
# Write each total slightly above its bar.
for position, n_cases in enumerate(counts):
    ax.text(position, n_cases + 50, f'{n_cases}', ha='center', va='bottom', fontsize=10)

ax.set_title('Distribution of new_columns')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of cases')

plt.show()

In [None]:
# Share of rows (in percent) for each value of diagnostic_superclass_len.
vc = ptbxl['diagnostic_superclass_len'].value_counts()
percentages = vc.values / vc.values.sum() * 100.

fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(x=percentages, y=vc.index, ci=None, palette="gray", orient='h')
ax.set_title("Diagnostic Superclass Len Distribution", fontsize=20)
ax.set_xlabel("percentage over all samples")
ax.set_ylabel("")
# Label every horizontal bar with its percentage, anchored at the bar's end.
for patch in ax.patches:
    bar_length = patch.get_width()
    label_y = patch.get_y() + patch.get_height() / 2
    ax.text(bar_length, label_y, f"{bar_length:.1f}%", weight='bold')

### Sex

In [None]:
# Replace the numeric sex codes with readable labels; values other than 0 and
# 1 are left untouched by replace().
sex_labels = {0: 'Male', 1: 'Female'}
ptbxl['sex'] = ptbxl['sex'].replace(sex_labels)

In [None]:
# Number of rows per sex, each bar annotated with its share of the total.
counts = ptbxl['sex'].value_counts()
total = counts.sum()

bars = plt.bar(counts.index, counts.values, color=['black', 'gray'])

plt.title('Distribution by sex')
plt.xlabel('Sex')
plt.ylabel('Number of cases')

for bar in bars:
    yval = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2
    share = (yval/total)*100
    # Place the percentage just above the top of the bar.
    plt.text(label_x, yval + 50, f'{share:.1f}%', ha='center', va='bottom')

plt.show()

In [None]:
# Count the cases of each diagnostic class per sex
# (after the transpose: one row per class, one column per sex).
counts = ptbxl.groupby('sex')[new_columns].sum().T

# Draw the grouped horizontal bar chart through pandas.
ax = counts.plot.barh(color=['gray', 'black'], figsize=(12,6))

# Title and axis labels.
ax.set_title('Distribution of diseases by gender')
ax.set_xlabel('Number of cases')
ax.set_ylabel('Diseases')

# Annotate each bar with its share of all counted cases; the grand total is
# the same for every bar, so it is computed once.
grand_total = counts.sum().sum()
for patch in ax.patches:
    bar_length = patch.get_width()
    left, bottom = patch.get_xy()
    label_position = (left + bar_length + 10, bottom + patch.get_height()/2)
    ax.annotate(f'{bar_length/grand_total*100:.1f}%', label_position, va='center')

# Show the chart.
plt.show()

### Age

In [None]:
# Histogram of the age column with a KDE overlay.
# histplot replaces the deprecated distplot: it keeps the y-axis in counts,
# which is what the 'Number of cases' label below describes (distplot with a
# KDE switches the axis to density).
sns.histplot(ptbxl['age'], kde=True, color='gray')

# Title and axis labels.
plt.title('Age distribution')
plt.xlabel('Age')
plt.ylabel('Number of cases')

# Show the chart.
plt.show()

In [None]:
# Reshape to long format: one row per (record, diagnostic class) pair,
# carrying the record's age and the class's 0/1 indicator.
data = ptbxl.melt(id_vars='age', value_vars=new_columns, var_name='disease', value_name='case')

# Keep only the pairs where the class is present.
data = data.loc[data['case'] == 1]

# Per-class KDE of age on the primary axis.
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=data, x='age', hue='disease', ax=ax1, palette='gray')

# Overall age histogram on a second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
sns.histplot(ptbxl['age'], color='gray', ax=ax2, edgecolor=None)

# Title and axis labels.
ax1.set(title='Age distribution of diseases', xlabel='Age', ylabel='Density')
ax2.set_ylabel('Number of cases')

# Hide the grid of the second y-axis.
ax2.grid(False)

# Show the chart.
plt.show()

### Height

In [None]:
# Sorted distinct values of the height column, to inspect which values occur.
np.sort(ptbxl['height'].unique())

In [None]:
# Reshape to long format: one row per (record, diagnostic class) pair,
# carrying the record's height and the class's 0/1 indicator.
data = ptbxl.melt(id_vars='height', value_vars=new_columns, var_name='disease', value_name='case')

# Keep only the pairs where the class is present.
data = data.loc[data['case'] == 1]

# Per-class KDE of height on the primary axis.
fig, ax1 = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=data, x='height', hue='disease', ax=ax1, palette='gray')

# Overall height histogram on a second y-axis sharing the same x-axis.
ax2 = ax1.twinx()
sns.histplot(ptbxl['height'], color='gray', ax=ax2, edgecolor=None)

# Title and axis labels.
ax1.set(title='Height distribution of diseases', xlabel='Height', ylabel='Density')
ax2.set_ylabel('Number of cases')

# Hide the grid of the second y-axis.
ax2.grid(False)

# Show the chart.
plt.show()

In [None]:
# Display the current column names of the annotation table.
ptbxl.columns

### Validation

In [None]:
# Number of rows for each value of validated_by_human, as a one-column table.
ptbxl.groupby("validated_by_human").size().to_frame('count')

In [None]:
# Number of rows for each (validated_by_human, validated_by) combination.
validation_keys = ["validated_by_human", "validated_by"]
ptbxl.groupby(validation_keys).size().to_frame('count')

In [None]:
# Distinct values found in the validated_by column.
ptbxl["validated_by"].unique()

In [None]:
# Number of rows for each (validated_by_human, validated_by) combination.
validation_keys = ["validated_by_human", "validated_by"]
ptbxl.groupby(validation_keys).size().to_frame('count')

In [None]:
# Count plot of the validated_by_human column.
plt.figure(figsize=(12, 6))
ax = sns.countplot(data=ptbxl, x='validated_by_human', palette='gray')

# Title and axis labels.
ax.set(title='Bar plot of validated_by_human', xlabel='validated_by_human', ylabel='Count')

# Annotate every bar with its count and its share of all rows.
total = len(ptbxl['validated_by_human'])
for patch in ax.patches:
    count = int(patch.get_height())
    percentage = f'{100 * count/total:.1f}%'
    label_x = patch.get_x() + patch.get_width()/2
    # Fixed offset of 200 (in data units) lifts the text above the bar.
    label_y = patch.get_y() + patch.get_height() + 200
    ax.annotate(f'{count} - ({percentage})', (label_x, label_y), ha='center')


plt.show()

### Device

In [None]:
# Number of rows recorded with each device, as a one-column table.
ptbxl.groupby("device").size().to_frame('count')

In [None]:
# Number of rows for each (validated_by_human, device) combination.
device_keys = ["validated_by_human", "device"]
ptbxl.groupby(device_keys).size().to_frame('count')