In [None]:
# Download the heart-disease dataset archive (ketangangal/heart-disease-dataset-uci) with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured in this runtime — confirm.
! kaggle datasets download -d ketangangal/heart-disease-dataset-uci

Dataset URL: https://www.kaggle.com/datasets/ketangangal/heart-disease-dataset-uci
License(s): CC0-1.0
Downloading heart-disease-dataset-uci.zip to /content
  0% 0.00/9.24k [00:00<?, ?B/s]
100% 9.24k/9.24k [00:00<00:00, 13.0MB/s]


In [None]:
! unzip /content/heart-disease-dataset-uci.zip

Archive:  /content/heart-disease-dataset-uci.zip
  inflating: HeartDiseaseTrain-Test.csv  


About this file
Age
Sex (1 = male; 0 = female)

chest pain type
-- Value 1: typical angina
-- Value 2: atypical angina
-- Value 3: non-anginal pain
-- Value 4: asymptomatic

resting blood pressure (in mm Hg on admission to the hospital)

serum cholesterol in mg/dl

(fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

resting electrocardiographic results

-- Value 0: normal
-- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
-- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalach: maximum heart rate achieved

exercise induced angina (1 = yes; 0 = no)

Angina is chest pain or discomfort caused when your heart muscle doesn't get enough oxygen-rich blood.
It may feel like pressure or squeezing in your chest.

oldpeak = ST depression induced by exercise relative to rest

slope: the slope of the peak exercise ST segment

-- Value 1: upsloping
-- Value 2: flat
-- Value 3: downsloping

vessels colored by fluoroscopy: number of major vessels (0-3) colored by fluoroscopy

A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversible defect)

Target (0 = no heart disease; 1 = heart disease)

In [None]:
import pandas as pd

# Load the dataset from the CSV file
data = pd.read_csv('/content/HeartDiseaseTrain-Test.csv')

# Print the first few rows to inspect the structure of the data
print("#"*10)
print("데이터의 처음 몇 줄을 출력하여 구조 확인")
print(data.head())

# Print per-column information (dtype, non-null count).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("#"*10)
print("데이터의 각 컬럼에 대한 정보 확인")
data.info()

##########
데이터의 처음 몇 줄을 출력하여 구조 확인
   age     sex chest_pain_type  resting_blood_pressure  cholestoral  \
0   52    Male  Typical angina                     125          212   
1   53    Male  Typical angina                     140          203   
2   70    Male  Typical angina                     145          174   
3   61    Male  Typical angina                     148          203   
4   62  Female  Typical angina                     138          294   

      fasting_blood_sugar               rest_ecg  Max_heart_rate  \
0    Lower than 120 mg/ml  ST-T wave abnormality             168   
1  Greater than 120 mg/ml                 Normal             155   
2    Lower than 120 mg/ml  ST-T wave abnormality             125   
3    Lower than 120 mg/ml  ST-T wave abnormality             161   
4  Greater than 120 mg/ml  ST-T wave abnormality             106   

  exercise_induced_angina  oldpeak        slope vessels_colored_by_flourosopy  \
0                      No      1.0  Downsloping 

In [None]:
# Report the dtype of every column (banner line, then heading, then the dtypes)
print("#" * 10, "데이터 타입 확인", sep="\n")
print("Data Types:\n", data.dtypes)

# Partition the column labels by dtype so the two groups can be analysed separately:
# object/category columns on one side, int64/float64 columns on the other
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Labels of the categorical columns
print("#" * 10, "범주형 데이터 분리하여 분석", sep="\n")
print("\nCategorical Columns:\n", categorical_cols)

# Labels of the numerical columns
print("#" * 10, "수치형 데이터 분리하여 분석", sep="\n")
print("\nNumerical Columns:\n", numerical_cols)

##########
데이터 타입 확인
Data Types:
 age                                int64
sex                               object
chest_pain_type                   object
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar               object
rest_ecg                          object
Max_heart_rate                     int64
exercise_induced_angina           object
oldpeak                          float64
slope                             object
vessels_colored_by_flourosopy     object
thalassemia                       object
target                             int64
dtype: object
##########
범주형 데이터 분리하여 분석

Categorical Columns:
 Index(['sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg',
       'exercise_induced_angina', 'slope', 'vessels_colored_by_flourosopy',
       'thalassemia'],
      dtype='object')
##########
수치형 데이터 분리하여 분석

Numerical Columns:
 Index(['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate',
       'oldpeak', 

In [None]:
# Number of missing entries in each column
missing_per_column = data.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# Distinct values of every categorical column together with how often each occurs
for col in categorical_cols:
    frequencies = data[col].value_counts()
    print(f"\nUnique values in {col}:\n", frequencies)

# Basic descriptive statistics of the numerical columns
numeric_summary = data[numerical_cols].describe()
print("\nDescriptive Statistics for Numerical Data:\n", numeric_summary)



Missing Values:
 age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

Unique values in sex:
 sex
Male      713
Female    312
Name: count, dtype: int64

Unique values in chest_pain_type:
 chest_pain_type
Typical angina      497
Non-anginal pain    284
Atypical angina     167
Asymptomatic         77
Name: count, dtype: int64

Unique values in fasting_blood_sugar:
 fasting_blood_sugar
Lower than 120 mg/ml      872
Greater than 120 mg/ml    153
Name: count, dtype: int64

Unique values in rest_ecg:
 rest_ecg
ST-T wave abnormality           513
Nor

In [None]:
# Check the skewness and kurtosis of the numerical columns.
#
# Skewness: the closer to 0, the closer to a normal distribution; a positive
#   value means a longer right tail (mass concentrated on the left), a negative
#   value means a longer left tail (mass concentrated on the right).
# Kurtosis: the closer to 0, the closer to a normal distribution; a higher
#   value means a more peaked distribution, a lower value a flatter one.
print("\nSkewness of the data:\n", data[numerical_cols].skew())
print("\nKurtosis of the data:\n", data[numerical_cols].kurt())


Skewness of the data:
 age                      -0.248866
resting_blood_pressure    0.739768
cholestoral               1.074073
Max_heart_rate           -0.513777
oldpeak                   1.210899
target                   -0.052778
dtype: float64

Kurtosis of the data:
 age                      -0.525618
resting_blood_pressure    0.991221
cholestoral               3.996803
Max_heart_rate           -0.088822
oldpeak                   1.314471
target                   -2.001123
dtype: float64


In [None]:
# A coefficient close to 1 indicates a near-perfect positive correlation,
# and a coefficient close to -1 a near-perfect negative correlation.

# Restrict the frame to the numerical columns once and reuse it for both methods
numeric_frame = data[numerical_cols]

# Pearson correlation coefficients
pearson_corr = numeric_frame.corr(method='pearson')
print("Pearson Correlation:\n", pearson_corr)

# Spearman correlation coefficients
spearman_corr = numeric_frame.corr(method='spearman')
print("\nSpearman Correlation:\n", spearman_corr)

Pearson Correlation:
                              age  resting_blood_pressure  cholestoral  \
age                     1.000000                0.271121     0.219823   
resting_blood_pressure  0.271121                1.000000     0.127977   
cholestoral             0.219823                0.127977     1.000000   
Max_heart_rate         -0.390227               -0.039264    -0.021772   
oldpeak                 0.208137                0.187434     0.064880   
target                 -0.229324               -0.138772    -0.099966   

                        Max_heart_rate   oldpeak    target  
age                          -0.390227  0.208137 -0.229324  
resting_blood_pressure       -0.039264  0.187434 -0.138772  
cholestoral                  -0.021772  0.064880 -0.099966  
Max_heart_rate                1.000000 -0.349796  0.422895  
oldpeak                      -0.349796  1.000000 -0.438441  
target                        0.422895 -0.438441  1.000000  

Spearman Correlation:
                

In [None]:
from scipy import stats

# The larger the absolute value of the t-statistic, the larger the difference
# between the two groups.
# Conventionally, a p-value below 0.05 leads to rejecting the null hypothesis
# and accepting that the difference is statistically significant.

# Boolean masks selecting the rows of each sex
is_male = data['sex'] == 'Male'
is_female = data['sex'] == 'Female'

# Maximum heart rate of the male and female groups
male_max_hr = data.loc[is_male, 'Max_heart_rate']
female_max_hr = data.loc[is_female, 'Max_heart_rate']

# Test the difference in mean maximum heart rate between the two groups
# (ttest_ind called with its default options)
t_stat, p_val = stats.ttest_ind(male_max_hr, female_max_hr)
print(f"T-statistic: {t_stat}, P-value: {p_val}")


T-statistic: -1.5808436320867354, P-value: 0.1142229084355055
