# EDA(Exploratory Data Analysis, 탐색적 데이터 분석)

- 데이터를 다양한 측면에서 바라보고 이해하는 과정을 의미
- 통계적 요약, 분포 파악 및 시각화 기법 사용
- 직관적 데이터 특성 파악

요인별 EDA 유형 구분
- 데이터의 변수 개수가 몇 개인지에 따라 일변량, 다변량으로 나눔

| | 일변량 | 다변량|
| ---- | ---- | ---- |
| 비시각화 | 빈도표, 기술통계량 | 교차표, 상관계수 |
| 시각화 | 파이차트, 막대그래프, 히스토그램, 박스플롯 | 모자이크플롯, 박스플롯, 평행좌표, 산점도 |

In [1]:
import numpy as np
import pandas as pd

In [13]:
data = pd.read_excel('./boston_housing.xls')
housing_data = data

In [20]:
housing_data = housing_data.astype(dtype='float64', copy=True) # 실습 위해서 똑같이 타입 맞춤
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    float64
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    float64
 9   TAX        506 non-null    float64
 10  PTRATIO    506 non-null    float64
 11  B          506 non-null    float64
 12  LSTAT      506 non-null    float64
 13  MEDV       506 non-null    float64
 14  CAT. MEDV  506 non-null    float64
dtypes: float64(15)
memory usage: 59.4 KB


In [22]:
housing_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,0.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,0.0
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,1.0
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,1.0
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,1.0


In [23]:
housing_data.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4,0.0
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6,0.0
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9,0.0
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0,0.0
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9,0.0


In [24]:
housing_data = housing_data.astype({'CHAS':'object'})
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    object 
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    float64
 9   TAX        506 non-null    float64
 10  PTRATIO    506 non-null    float64
 11  B          506 non-null    float64
 12  LSTAT      506 non-null    float64
 13  MEDV       506 non-null    float64
 14  CAT. MEDV  506 non-null    float64
dtypes: float64(14), object(1)
memory usage: 59.4+ KB


In [26]:
pd.crosstab(housing_data.CHAS, columns='count')
# 강 경계에 있지 않은 town 471개 지역, 경계에 위치한 town이 35개 지역으로 차이를 보임

col_0,count
CHAS,Unnamed: 1_level_1
0.0,471
1.0,35


In [31]:
# 비율 환산
pd.crosstab(housing_data.CHAS, columns='count', normalize=True)

col_0,count
CHAS,Unnamed: 1_level_1
0.0,0.93083
1.0,0.06917


In [32]:
# 합 추가
pd.crosstab(housing_data.CHAS, columns='count', normalize=True, margins=True)

col_0,count,All
CHAS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.93083,0.93083
1.0,0.06917,0.06917
All,1.0,1.0


In [35]:
# 4분위수 기반의 'IQR', IQR 연산
q1 = housing_data['CRIM'].quantile(0.25)
q3 = housing_data['CRIM'].quantile(0.75)
iqr = q3 - q1

print('q1', q1)
print('q3', q3)
print('iqr', iqr)

q1 0.08204499999999999
q3 3.6770825
iqr 3.5950375


### 왜도와 첨도 확인

- 왜도: 분포의 비대칭성 나타내는 척도. 0 보다 크면 왼쪽으로 치우치고, 꼬리가 긴 형태의 분포가 나타남
- 첨도: 분포의 뾰족한 정도를 나타내는 척도. 평균에 관측치가 얼마나 모였는지 확인. 0보다 크면 뾰족한 모양을 지님

In [36]:
print('skewness:', round(housing_data['CRIM'].skew(), 4))
print('kurtosis:', round(housing_data['CRIM'].kurt(), 4))

skewness: 5.2231
kurtosis: 37.1305


In [37]:
print('skewness:', round(housing_data['MEDV'].skew(), 4))
print('kurtosis:', round(housing_data['MEDV'].kurt(), 4))

skewness: 1.1081
kurtosis: 1.4952


In [39]:
# 로그 변환으로 왜도 첨도 정도를 줄임
print('skewness:', round(np.log(housing_data['CRIM'].skew()), 4))
print('kurtosis:', round(np.log(housing_data['CRIM'].kurt()), 4))

skewness: 1.6531
kurtosis: 3.6144


### Pandas-Profiling 패키지를 통한 EDA 자동화 및 리포트 생성

In [41]:
import pandas_profiling
from pandas_profiling import ProfileReport

ImportError: cannot import name 'DataError' from 'pandas.core.base' (c:\tools\Anaconda3\envs\aivle\lib\site-packages\pandas\core\base.py)