In [120]:
import pandas as pd
import lightningchart as lc

## Data handling

In [121]:
df = pd.read_csv('data/heart.csv')
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


### Checking for types and null values

In any data analysis task it is a good idea to check the columns of the `dataframe` for types. It is done to understand what kind of data is contained in the table. 

It is done using `dtypes`.

In [122]:
df.dtypes

age           int64
sex           int64
cp            int64
trtbps        int64
chol          int64
fbs           int64
restecg       int64
thalachh      int64
exng          int64
oldpeak     float64
slp           int64
caa           int64
thall         int64
output        int64
dtype: object

You can check how many null values each column contains by using `isnull().sum()` method. The first part of this function checks the columns for null values and then returns the number of those in each column.

In [123]:
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

It is convenient that this data already has string and object-type values converted to integers and floats. But we need a documentation to understand which categorical number values (for example, in cp) mean what.

We can use this documentation from Kaggle:
* Age : Age of the patient
* Sex : Sex of the patient
* exang: exercise induced angina (1 = yes; 0 = no)
* ca: number of major vessels (0-3)
* cp : Chest Pain type chest pain type
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)
* chol : cholesterol in mg/dl fetched via BMI sensor
* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* rest_ecg : resting electrocardiographic results
    - Value 0: normal
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalach : maximum heart rate achieved

* target : 0= less chance of heart attack 1= more chance of heart attack

## Data Analysis

### Correlation Matrix
We will use correlation matrix for our numeric (non-categorical) values

In [124]:
numeric_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
data_numeric = df[numeric_columns]
data_numeric.corr()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak
age,1.0,0.279351,0.213678,-0.398522,0.210013
trtbps,0.279351,1.0,0.123174,-0.046698,0.193216
chol,0.213678,0.123174,1.0,-0.00994,0.053952
thalachh,-0.398522,-0.046698,-0.00994,1.0,-0.344187
oldpeak,0.210013,0.193216,0.053952,-0.344187,1.0


Based on above correlation matrix, you can see how the variables correlate between each other.

Numbers represent strength of correlation:

0.7 to 1.0 (-0.7 to -1.0) -> Strong correlation.
0.5 to 0.7 (-0.5 to -0.7) -> Moderate correlation.
0.3 to 0.5 (-0.3 to -0.5) -> Weak correlation.
0.0 to 0.3 (0.0 to -0.3) -> Negligible correlation.
Positive numbers mean positive correlation (as one value increases, the other does too).
Negative numbers mean negative correlation (as one value increases, the other decreases).

## Plots for Multi-variable analysis

Firstly, it is needed to set license key for our LightningChart.

In [125]:
with open("license_key.txt", "r") as file:  # License key is stored in 'license_key.txt'
    key = file.read()
lc.set_license(key)

Then we can create a scatter plot based on two parameters. For example, cholesterol and heartrate.

Scatter plot is nice because it can provide not only the information about contribution of the parameters to the output, but also it shows the correlation between the 2 (or 3, if it is 3D scatter chart) parameters. 

In [126]:
chol_neg = df.loc[df['output'] == 0, 'chol'].tolist()
heartrate_neg = df.loc[df['output'] == 0, 'thalachh'].tolist()

chol_pos = df.loc[df['output'] == 1, 'chol'].tolist()
heartrate_pos = df.loc[df['output'] == 1, 'thalachh'].tolist()

scatter = lc.ChartXY(   # scatter chart intialization
    theme=lc.Themes.White,  # overall theme
    title='Scatter Chart (Cholesterol vs Heartrate)', 
)
series_neg = scatter.add_point_series().add(
    x=chol_neg,  # push samples to series
    y=heartrate_neg,
)
series_pos = scatter.add_point_series().add(
    x=chol_pos,  
    y=heartrate_pos,
)

series_neg.set_point_color(lc.Color(0, 255, 0, 192)).set_name("No Heart Attack")
series_pos.set_point_color(lc.Color(255, 0, 0, 192)).set_name("Heart Attack")

scatter.get_default_x_axis().set_title("Cholesterol")
scatter.get_default_y_axis().set_title("Heartrate")
scatter.add_legend().add(scatter)
scatter.open()

It can be seen that the higher the heartrate, the more risk of the heart attack. Also, we can see that generally, the higher the cholesterol, the higher the heartrate.   
The correlation between cholesterol and the output is not very obvious, so we can make a box plots fot heart rate and cholesterol.

In [127]:
boxplt_heartrate = lc.BoxPlot(  # init box plot
    data=[heartrate_neg, heartrate_pos],
    theme=lc.Themes.White,
    title='Heartrate',
    xlabel='No Heart Attack (Left), Heart Attack (Right)',
    ylabel='Values'
)
boxplt_heartrate.open()

In [128]:
thalach_heart_attack_mean = df.groupby('output')['thalachh'].mean()
print("Mean Maximum Heart Rate")
print(thalach_heart_attack_mean)

Mean Maximum Heart Rate
output
0    139.101449
1    158.466667
Name: thalachh, dtype: float64


In [129]:
boxplt_chol = lc.BoxPlot(  # init box plot
    data=[chol_neg, chol_pos],
    theme=lc.Themes.White,
    title='Cholesterol',
    xlabel='No Heart Attack (Left), Heart Attack (Right)',
    ylabel='Values'
)
boxplt_chol.open()


We can see that the extreme number of cholesterol relates to heart attack, but it is not necessary that those who have lower cholesterol are not subject to heart attacks.

The next scatter chart parameters are cholesterol and heart pressure.

We can also explore the correlation between heartrate and exercise-induced angina. Another boxplot.

In [130]:
heartrate_angina = df.loc[df['exng'] == 1, 'thalachh'].tolist()
heartrate_noangina = df.loc[df['exng'] == 0, 'thalachh'].tolist()

boxplt_chol = lc.BoxPlot(  
    data=[heartrate_noangina, heartrate_angina],
    theme=lc.Themes.White,
    title='Heartrate',
    xlabel='No Angina (Left), Angina (Right)',
    ylabel='Values'
)
boxplt_chol.open()

In [131]:
thalach_exang_mean = df.groupby('exng')['thalachh'].mean()
print("Mean Maximum Heart Rate by Exercise-Induced Angina:")
print(thalach_exang_mean)

Mean Maximum Heart Rate by Exercise-Induced Angina:
exng
0    155.681373
1    137.212121
Name: thalachh, dtype: float64


It can be seen that individuals with exercise-induced angina have a lower mean maximum heart rate. It suggests that these individuals may experience cardiovascular limitations. Angina is typically a sign that the heart muscle is not getting enough oxygen-rich blood during exertion, which could prevent these individuals from reaching higher heart rates.

In [132]:
bp_neg = df.loc[df['output'] == 0, 'trtbps'].tolist()

bp_pos = df.loc[df['output'] == 1, 'trtbps'].tolist()

scatter = lc.ChartXY(   # scatter chart intialization
    theme=lc.Themes.White,  # overall theme
    title='Scatter Chart (Cholesterol vs Pressure)', 
)
series_neg = scatter.add_point_series().add(
    x=chol_neg,  # push samples to series
    y=bp_neg,
)
series_pos = scatter.add_point_series().add(
    x=chol_pos,  
    y=bp_pos,
)

series_neg.set_point_color(lc.Color(0, 255, 0, 192)).set_name("No Heart Attack")
series_pos.set_point_color(lc.Color(255, 0, 0, 192)).set_name("Heart Attack")

scatter.get_default_x_axis().set_title("Cholesterol")
scatter.get_default_y_axis().set_title("Blood Pressure")
scatter.add_legend().add(scatter)
scatter.open()

Let's also make a box blot for the blood pressure.

In [133]:
boxplt = lc.BoxPlot(  # init box plot
    data=[bp_neg, bp_pos],
    theme=lc.Themes.White,
    title='Blood Pressure',
    xlabel='No Heart Attack (Left), Heart Attack (Right)',
    ylabel='Values'
)
boxplt.open()

In [134]:
thalach_exang_mean = df.groupby('output')['trtbps'].mean()
print("Mean Blood Pressure by Output:")
print(thalach_exang_mean)

Mean Blood Pressure by Output:
output
0    134.398551
1    129.303030
Name: trtbps, dtype: float64


Those who had a heart attack generally have less blood pressure values. It may be counterintuitive, but it shows that the blood pressure alone doesn't certainly show that the individual is subject to a heart attack or not.

### Grouped bar chart for age

For creating a bar chart, firstly we need to divide the entries to bins based on age. Then, we can create the bar chart itself.

In [135]:
bins = [29, 50, 60, 77]  
labels = ['29-50', '51-60', '61-77']

df['age_range'] = pd.cut(df['age'], bins=bins, labels=labels, right=True)

mapping = {0: 'No Heart Attack', 1: 'Heart attack'}
df['output_words'] = df['output'].map(mapping)

df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,age_range,output_words
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,61-77,Heart attack
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,29-50,Heart attack
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,29-50,Heart attack
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,51-60,Heart attack
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,51-60,Heart attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,51-60,No Heart Attack
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,29-50,No Heart Attack
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,61-77,No Heart Attack
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,51-60,No Heart Attack


In [136]:
outcome_counts_by_age = df.groupby(['age_range', 'output_words'], observed=True).size().unstack(fill_value=0)
outcome_counts_by_age

output_words,Heart attack,No Heart Attack
age_range,Unnamed: 1_level_1,Unnamed: 2_level_1
29-50,65,29
51-60,64,65
61-77,35,44


In [137]:
result = []
for target in df['output_words'].unique():  # make json-like formation of data
    values = [int(x) for x in outcome_counts_by_age[target].tolist()]  
    result.append({                     
        'subCategory': target,
        'values': values
    })
barchart_grouped = lc.BarChart(  # initialize bar chart
    vertical=True,
    theme=lc.Themes.White,
    title='Heart Attacks By Age',
)
barchart_grouped.set_data_grouped(labels, result)  # set data
barchart_grouped.set_sorting('alphabetical').set_animation_category_position(False)
barchart_grouped.add_legend().add(barchart_grouped)  # add legend
barchart_grouped.open() 