# 산포통계

In [2]:
import numpy as np
from scipy import stats
import pandas as pd

## 분산 계산

In [5]:
# Compare population and sample variance computed with NumPy and pandas.
x = [1, 2, 3, 4, 5]
print(f'모집단의 분산 : {np.var(x)}') # population variance; ddof=0 is the default, so it is omitted
print(f'표본의 분산 : {np.var(x, ddof =1)}') # sample variance: the denominator is n - ddof (5 - 1)
print(np.array(x).var()) # ndarray method with the same default: denominator = n
# pandas defaults to ddof=1 (sample variance); passing ddof=0 makes the
# denominator n, i.e. the population variance, matching the NumPy default.
print(pd.Series(x).var(ddof = 0))

모집단의 분산 : 2.0
표본의 분산 : 2.5
2.0
2.0


## 표준편차 계산

In [6]:
# Standard deviation; ddof selects the denominator n - ddof.
x = [1, 2, 3, 4, 5]
print(np.std(x, ddof = 1)) # sample standard deviation (denominator n - 1)
print(np.array(x).std(ddof = 0)) # population standard deviation (denominator n)
print(pd.Series(x).std(ddof = 1)) # sample standard deviation via pandas

1.5811388300841898
1.4142135623730951
1.5811388300841898


## 변동계수의 필요성
- 분산과 표준편차는 모두 값의 스케일에 크게 영향을 받아 상대적인 산포를 보여주는 데 부적합함.
- 변동 계수 = 표준편차 / 평균

In [7]:
# The same data at two scales: x2 is x1 multiplied by 10.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

# The sample standard deviation (ddof=1) grows with the scale of the values.
print(x1.std(ddof=1), x2.std(ddof=1), sep='\n')

1.5811388300841898
15.811388300841896


In [8]:
# Coefficient of variation (standard deviation / mean): both scales give the
# same value up to floating-point rounding.
# NOTE: stats.variation defaults to ddof=0 (population standard deviation), so
# these values differ from a manual computation that uses ddof=1.
print(stats.variation(x1)) # coefficient of variation
print(stats.variation(x2))

0.47140452079103173
0.4714045207910317


In [12]:
# Coefficient of variation by hand: sample standard deviation (ddof=1)
# divided by the mean. Both scales yield the same ratio.
print(x1.std(ddof=1) / x1.mean())
print(x2.std(ddof=1) / x2.mean())

0.5270462766947299
0.5270462766947299


## 스케일링
- 둘 이상의 변수의 값을 상대적으로 비교할 때 사용

In [13]:
import numpy as np
import pandas as pd

In [20]:
# Rebuild the two example arrays; x2 holds the same values at 10x the scale.
x1 = np.arange(1, 6)
x2 = np.multiply(x1, 10)
x1  # last expression of the cell, so the notebook displays the array


array([1, 2, 3, 4, 5])

In [21]:
x2  # display the array that holds x1's values multiplied by 10

array([10, 20, 30, 40, 50])

In [24]:
# Standard (z-score) scaling: centre on the mean and divide by the
# population standard deviation (ddof=0, the NumPy default).
z1 = (x1 - np.mean(x1)) / np.std(x1)
z2 = (x2 - np.mean(x2)) / np.std(x2)

# Both arrays map to the same z-scores despite the 10x scale difference.
print(z1, z2, sep='\n')

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [25]:
# Min-max scaling: shift by the minimum and divide by the range (max - min),
# which maps the smallest value to 0 and the largest to 1.
z1 = (x1 - np.min(x1)) / np.ptp(x1)
z2 = (x2 - np.min(x2)) / np.ptp(x2)

print(z1)
print(z2)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


In [None]:
# sklearn을 이용한 스케일링을 위한 데이터 준비


## 데이터 표준화하기

### 데이터 프레임 만들기

In [14]:
import pandas as pd

In [22]:
# Two-column frame: X2 holds the X1 values at ten times the scale.
x = pd.DataFrame({
    'X1': [1, 2, 3, 4, 5],
    'X2': [10, 20, 30, 40, 50],
})

### scikit learn을 활용한 데이터 표준화하기

In [23]:
!pip install scikit-learn



In [24]:
# MinMaxScaler 메모리에 로딩
from sklearn.preprocessing import MinMaxScaler

In [26]:
# Create the MinMaxScaler object, then fit it to x and transform x in one call.
# The result is a NumPy array in which each column is rescaled to [0, 1].
scaler = MinMaxScaler()
scaled = scaler.fit_transform(x)

In [27]:
scaled  # display the min-max scaled values (a NumPy array)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [28]:
# docstring 불러오기 : shift + tab
# 자동완성: tab

In [30]:
# Wrap the scaled array in a DataFrame for display; column labels are given explicitly.
# NOTE(review): the labels here are lowercase ('x1', 'x2') while the source
# frame was built with 'X1' and 'X2' -- confirm that this is intentional.
pd.DataFrame(scaled, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


### StandardScaler로 표준화 하기

In [31]:
from sklearn.preprocessing import StandardScaler

In [34]:
# Standardize the columns of x with StandardScaler and show the result as a DataFrame.
ss_scaler = StandardScaler() # instantiate the scaler
z = ss_scaler.fit_transform(x) # fit_transform returns an ndarray
pd.DataFrame(z, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
# Repeat of the StandardScaler standardization, this time storing the array as S.
ss_scaler = StandardScaler()
S =ss_scaler.fit_transform(x)
pd.DataFrame(S, columns=['x1', 'x2'])

Unnamed: 0,x1,x2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


### 범위와 사분위 범위 계산하기

In [37]:
import numpy as np

In [38]:
# Draw 1000 samples from a normal distribution with mean 100 and standard
# deviation 20. No seed is set, so the values differ on every run.
x = np.random.normal(100, 20, size = 1000) # (mean, standard deviation, size = number of samples)

In [39]:
# x (소문자) : 칼럼이 1개일 경우
# X (대문자) : 칼럼이 2개이상일 경우

In [40]:
x  # display the generated samples

array([ 94.95140039,  73.4014742 , 113.82498213,  89.68678679,
       103.30483613,  97.4363265 , 109.56785101,  91.51873843,
        86.26377625, 138.60679082,  99.65478545,  80.52270805,
        44.22746971,  64.39638177, 100.51055325, 112.67427676,
        83.04922367, 113.29113946, 115.17553252,  95.12508638,
        89.02186226,  66.08381918, 115.19736838, 111.26421992,
       102.37708041, 113.73396706, 142.14804569, 122.06918006,
        84.37129045,  47.97880359, 108.85704929,  91.45191188,
        81.77379207,  68.62769211, 113.97902258, 128.9537048 ,
        81.83537894, 110.59937289,  80.69225696,  73.62451051,
       101.27946511,  90.10497924, 140.42108918, 105.99067647,
       118.53707475,  98.08261435, 117.05736769,  81.45995673,
       102.50243243,  49.02802259,  92.93130734,  64.59700583,
        95.09932027,  55.77072541, 108.88998938,  79.79235313,
        71.60196703,  80.70945407,  82.83018845,  84.74045063,
        90.53148787,  74.48107715, 101.37480481, 106.26

### 범위 계산

- m : 마크다운(문자) 셀로 변환
- y : 코드 셀로 변환

In [46]:
# from scipy import stats as st
import scipy.stats as st

In [47]:
# Range of the data, computed two equivalent ways.
print(np.ptp(x)) # peak to peak: the range between the maximum and minimum values
print(np.max(x) - np.min(x))

113.04330223566805
113.04330223566805


In [48]:
# Interquartile range (75th percentile minus 25th percentile), computed
# manually with np.quantile and with scipy.stats.iqr.
print(np.quantile(x, 0.75) - np.quantile(x, 0.25))
print(st.iqr(x))

26.05474302387273
26.05474302387273
