# EDA 변이통계량

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import *

In [2]:
import numpy as np
# numpy float 출력옵션 변경
# np.set_printoptions(precision=3)
# np.set_printoptions(precision=20, suppress=True)
# pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(formatter={'float_kind': lambda x: "{0:0.3f}".format(x)})

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

## 범위

In [4]:
np.random.seed(123)
data = np.random.normal(100,20,size=1000)
data[:10]

array([78.287, 119.947, 105.660, 69.874, 88.428, 133.029, 51.466, 91.422,
       125.319, 82.665])

In [5]:
np.min(data), np.max(data)

(35.37889984161376, 171.43158436052622)

In [6]:
# 범위 = 최대값 - 최소값
np.max(data)-np.min(data)

136.05268451891246

In [7]:
# numpy.ptp() 이용
np.ptp(data)

136.05268451891246

## 중간범위

In [8]:
# 중간 범위 계산
(np.max(data)+np.min(data))/2

103.40524210106999

## 사분위간 범위

In [9]:
np.quantile(data, 0.75)-np.quantile(data, 0.25)

27.06844676167337

In [10]:
iqr(data)

27.06844676167337

## 사분위수 편차

In [11]:
(np.quantile(data, 0.75)-np.quantile(data, 0.25))/2

13.534223380836686

In [12]:
iqr(data)/2

13.534223380836686

## 편차

* 자료값과 평균과의 차이
* xi-mean()

In [13]:
(data-np.mean(data)).sum()

-7.815970093361102e-12

**분산계산 : var(a,ddof=0)**

In [14]:
x = [1, 2, 3, 4, 5]

# 표본분산, ddof=1
np.var(x, ddof=1)

# 모분산
np.var(x)

# 모분산, ddof=0
np.var(x, ddof=0)
pd.Series(x).var(ddof=0)

2.5

2.0

2.0

2.0

## 표준편차

In [15]:
x = [1, 2, 3, 4, 5]

# 표본표준편차(S)
np.std(x, ddof=1)

# 모표준편차 (sigma)
np.std(x)

# np.array(x).std(x)
pd.Series(x).std(ddof=0)

1.5811388300841898

1.4142135623730951

1.4142135623730951

## 변동계수

In [16]:
men = [72,74,77,68,66,75]
women = [45,48,52,53,46,50]

print('평균')
np.mean(men)
np.mean(women)
print('표본표준편차')
np.std(men,ddof=1)
np.std(women,ddof=1)

평균


72.0

49.0

표본표준편차


4.242640687119285

3.22490309931942

In [17]:
# np.std(x, axis=axis, ddof=ddof) / np.mean(x) 이용

print('남자CV:',np.std(men, ddof=1)/np.mean(men))
print('여자CV:',np.std(women, ddof=1)/np.mean(women))

남자CV: 0.05892556509887895
여자CV: 0.06581434896570246


In [21]:
print('남자CV:',variation(men))
print('여자CV:', variation(women))

남자CV: 0.053791435363991905
여자CV: 0.06008000589338671


## 데이터 정규화

: scaling(표준화)

* 각 값들을 상대적인 값으로 변화시키는 기법
* 예. 국어 평균 95점, 수학 평균 30인 경우

**표준화 예제**

In [29]:
df = pd.read_csv('./data/ch2_scores_em.csv',index_col='student number')
df.head()

Unnamed: 0_level_0,english,mathematics
student number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [30]:
df['english'].describe()

count    50.000000
mean     58.380000
std       9.799813
min      37.000000
25%      54.000000
50%      57.500000
75%      65.000000
max      79.000000
Name: english, dtype: float64

In [31]:
df['mathematics'].describe()

count    50.000000
mean     78.880000
std       8.414371
min      57.000000
25%      76.000000
50%      80.000000
75%      84.000000
max      94.000000
Name: mathematics, dtype: float64

In [32]:
df.describe()

Unnamed: 0,english,mathematics
count,50.0,50.0
mean,58.38,78.88
std,9.799813,8.414371
min,37.0,57.0
25%,54.0,76.0
50%,57.5,80.0
75%,65.0,84.0
max,79.0,94.0


In [33]:
# Z-scaling - 평균이 0 표준편차가 1이됨
z1 = (df['english']-df['english'].mean())/df['english'].std()
z2 = (df['mathematics']-df['mathematics'].mean())/df['mathematics'].std()

print(z1.min(), z1.max())
print(z2.min(), z2.max())
# -3 ~ 3 사이의 값으로 분포됨

-2.1816743772942324 2.104121873704727
-2.600313324789425 1.796925844187209


min-max scaling

In [36]:
s1 = (df['english']-df['english'].min())/(df['english'].max()-df['english'].min())
s2 = (df['mathematics']-df['mathematics'].min())/(df['mathematics'].max()-df['mathematics'].min())

print('eng : ', s1.min(), s1.max())
print('math : ', s2.min(), s2.max())

eng :  0.0 1.0
math :  0.0 1.0


스케일링

In [37]:
df.head()

Unnamed: 0_level_0,english,mathematics
student number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,42,65
2,69,80
3,56,63
4,41,63
5,57,76


In [40]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
S = scaler.fit_transform(df)
pd.DataFrame(S, columns = df.columns, index=df.index).head()

Unnamed: 0_level_0,english,mathematics
student number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.119048,0.216216
2,0.761905,0.621622
3,0.452381,0.162162
4,0.095238,0.162162
5,0.47619,0.513514
