# 산포통계

## 분산, 표준편차 구하기

In [6]:
import numpy as np
from scipy import stats
import pandas as pd

분산 계산

In [8]:
# Five-point sample shared by every variance call in this cell.
x = [1, 2, 3, 4, 5]

# ddof is omitted here, so np.var uses ddof=0: the population variance.
population_variance = np.var(x)
print(f"모집단의 분산 : {population_variance}")

# Sample variance: with ddof=1 the divisor becomes n - ddof.
sample_variance = np.var(x, ddof=1)
print(sample_variance)

# Method form on an ndarray; ddof is again left at its default of 0.
print(np.array(x).var())

# pandas Series.var defaults to ddof=1, so ddof=0 is passed explicitly to
# obtain the population variance.
print(pd.Series(x).var(ddof=0))

모집단의 분산 : 2.0
2.5
2.0
2.0


표준편차 계산

In [15]:
# Standard deviation of a five-point sample through three APIs.
x = [1, 2, 3, 4, 5]

# Sample standard deviation (divisor n - 1).
sample_std = np.std(x, ddof=1)
print(sample_std)

# Population standard deviation (divisor n) via the ndarray method.
values = np.array(x)
print(values.std(ddof=0))

# pandas Series.std with ddof=1: the sample standard deviation again.
print(pd.Series(x).std(ddof=1))

1.5811388300841898
1.4142135623730951
1.5811388300841898


변동계수의 필요성

-분산과 표준편차 모두 값의 스케일에 크게 영향을 받아 상대적인 산포를 보여주는 데 부적합함.

-변동 계수 = 표준편차 / 평균

In [21]:
# Base sample as a NumPy array; the bare name on the last line makes the
# notebook display it.
base_values = [1, 2, 3, 4, 5]
x1 = np.array(base_values)
x1

array([1, 2, 3, 4, 5])

In [19]:
# x2 is x1 with every value multiplied by 10.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

# The sample standard deviation (ddof=1) grows with the scale of the data.
for sample in (x1, x2):
    print(np.std(sample, ddof=1))

1.5811388300841898
15.811388300841896


In [24]:
# Coefficient of variation (standard deviation / mean) from SciPy.
# stats.variation is called without ddof, so it uses the population standard
# deviation (ddof=0); the hand computation later in this notebook passes
# ddof=1 and therefore prints a larger value.
for sample in (x1, x2):
    print(stats.variation(sample))

0.47140452079103173
0.4714045207910317


In [25]:
# Coefficient of variation by hand: sample standard deviation (ddof=1)
# divided by the mean. Both samples give the same ratio.
for sample in (x1, x2):
    print(np.std(sample, ddof=1) / np.mean(sample))

0.5270462766947299
0.5270462766947299


스케일링
-둘 이상의 변수의 값을 상대적으로 비교할 때 사용

In [26]:
import numpy as np
import pandas as pd

In [27]:
# Two samples that differ only in scale: x2 is ten times x1.
x1 = np.array([1, 2, 3, 4, 5])
x2 = 10 * x1

In [28]:
x1

array([1, 2, 3, 4, 5])

In [29]:
x2

array([10, 20, 30, 40, 50])

In [31]:
# Standard scaling (z-score): subtract the mean, then divide by the standard
# deviation. ndarray.std is called without ddof, so the divisor is n.
z1 = (x1 - x1.mean()) / x1.std()
z2 = (x2 - x2.mean()) / x2.std()

# Both samples map to the same standardized values.
for standardized in (z1, z2):
    print(standardized)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [32]:
# Min-max scaling: shift by the minimum, then divide by the range
# (max - min, written here as np.ptp) so values land in [0, 1].
z1 = (x1 - np.min(x1)) / np.ptp(x1)
z2 = (x2 - np.min(x2)) / np.ptp(x2)

print(z1)
print(z2)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


# 데이터 표준화 하기

### 데이터 프레임 만들기

In [27]:
import pandas as pd
# del pandas

In [28]:
# Two-column frame; X2 holds the X1 values multiplied by 10.
X = pd.DataFrame({
    "X1": [1, 2, 3, 4, 5],
    "X2": [10, 20, 30, 40, 50],
})

### scikit learn을 활용한 데이터 표준화 하기

In [13]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp310-cp310-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
   ------- -------------------------------- 1.6/8.9 MB 10.5 MB/s eta 0:00:01
   ------------ --------------------------- 2.9/8.9 MB 8.0 MB/s eta 0:00:01
   ------------------------------------- -- 8.4/8.9 MB 14.5 MB/s eta 0:00:01
   ---------------------------------------- 8.9/8.9 MB 13.8 MB/s  0:00:00
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn

   ------------- -------------------------- 1/3 [joblib]
 

In [14]:
!dir

 C 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: 1C12-191A

 C:\kpmg_ggdrive\kpmg_7th_lab\statistics_ex 디렉터리

2025-10-16  오전 09:34    <DIR>          .
2025-10-16  오전 09:34    <DIR>          ..
2025-10-14  오후 04:18             4,895 .gitignore
2025-10-15  오후 04:01    <DIR>          .ipynb_checkpoints
2025-10-14  오후 04:18                39 README.md
2025-10-15  오전 09:26             1,063 test.ipynb
2025-10-15  오후 04:01                72 Untitled.ipynb
2025-10-15  오전 11:28             8,648 [실습1-1]통계_대표통계.ipynb
2025-10-16  오전 09:34            13,851 [실습1-2]통계_산포통계.ipynb
               6개 파일              28,568 바이트
               3개 디렉터리  423,141,535,744 바이트 남음


In [29]:
# MinMaxScaler 메모리에 로딩
from sklearn.preprocessing import MinMaxScaler

In [31]:
# Create the MinMaxScaler and rescale each column of the DataFrame X to [0, 1].
# The input must be the two-column frame X (upper case): lower-case x is a
# 1-D sample elsewhere in this notebook, and fit_transform expects 2-D input
# of shape (n_samples, n_features).
scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)

In [32]:
scaled

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [None]:
# docstring 불러오기 : Shift + Tab
# 자동완성 : Tab

In [33]:
pd.DataFrame(scaled, columns=["X1", "X2"])

Unnamed: 0,X1,X2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


## StandardScaler로 표준화 하기

In [34]:
# 표준 정규분포 : 
from sklearn.preprocessing import StandardScaler

In [37]:
# Standardize each column of the DataFrame X to zero mean and unit variance.
# The input must be the two-column frame X (upper case): lower-case x is a
# 1-D sample elsewhere in this notebook, and fit_transform expects 2-D input
# of shape (n_samples, n_features).
ss_scaler = StandardScaler()
S = ss_scaler.fit_transform(X)

# Wrap the resulting array in a labelled frame so the notebook displays it
# with the original column names.
pd.DataFrame(S, columns=['X1', 'X2'])

Unnamed: 0,X1,X2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


## 범위와 사분위 범위 계산하기

In [39]:
import numpy as np

In [43]:
# np.random.normal(loc=mean, scale=standard deviation, size=number of draws)
# Draw 1000 values from a normal distribution with mean 100 and standard
# deviation 20. No seed is set, so every run produces different numbers.
x = np.random.normal(loc=100, scale=20, size=1000)

In [41]:
# x(소문자) : 컬럼이 1개일 경우
# X(대문자) : 컬럼이 2개이상일 경우

In [42]:
x

array([ 99.73316767,  89.17294018, 114.35914306, 105.05087529,
       109.90393715,  91.14776486,  76.44250507,  86.99410494,
       133.45403381, 110.66315372,  77.92421476, 106.80074646,
       120.95695298, 107.04374992,  78.97747801, 137.43382623,
       110.60401616,  60.21142428, 102.97170527,  63.7959945 ,
       107.55712513, 125.7919902 , 113.1724669 ,  86.99545472,
       100.92623924, 126.15042581,  84.85320357, 107.63067461,
        77.51887161, 120.00008041,  60.57437489, 126.56665155,
       117.62579842, 100.59182489,  77.08773013, 104.4698853 ,
       128.54035482, 101.24429451,  99.27947886, 111.15898423,
       107.98574521, 108.03861813, 119.19005426, 124.30344498,
        99.40646405, 114.77729264, 150.349114  , 124.68559159,
       114.63455248, 112.32269907, 106.05069464,  97.42142269,
       102.48110436, 104.91255421, 105.35508285, 124.9794019 ,
       100.49144396, 106.90861855,  82.12537873, 105.27334048,
        89.62943431,  76.26780561,  96.31123812,  83.87

### 범위 계산

- m : 문자 셀로 변환
- y : 코드셀로 변환

In [46]:
# Range = max - min; np.ptp ("peak to peak") computes it directly.
value_range = np.ptp(x)
print(value_range)

# The same quantity spelled out by hand.
highest, lowest = np.max(x), np.min(x)
print(highest - lowest)

130.70820136128887
130.70820136128887


In [55]:
# Interquartile range: third quartile (0.75) minus first quartile (0.25).
q1, q3 = np.quantile(x, [0.25, 0.75])
print(q3 - q1)

27.31198700836076


In [57]:
# SciPy computes the interquartile range directly.
from scipy import stats as st

interquartile_range = st.iqr(x)
print(interquartile_range)

27.31198700836076
