In [1]:
# 데이터 변환 ( Transformation ) 
# : 여러 형태로 표현된 데이터 값을 다양한 분석 방법론에 적용하기 위해 원시 형태에서 다른 형식으로 바꾸는 과정 
# => 주어진 목적 기반의 올바른 결과 획득을 위하여 원시 데이터를 데이터 분석에 용이하도록 형태 변환 

# 변환 목적 및 특징 
# 빠른 특성 파악 
# : 데이터의 특성을 빠르게 파악 가능 
# : 파생변수 생성 및 단순화를 통한 결과 리포팅 등 활용

# 분석 알고리즘 적용 
# : 변수 간 범위가 다를 경우 알고리즘 적용 시 영향력의 차이가 발생함
# : 올바른 학습을 위해 다른 범위를 지닌 변수들의 변환 필요 

In [2]:
# 구간화 ( Binning ) 
# : 연속형 데이터를 구간으로 구별하여 범주화 형태 변환
# 지정 길이 기반 구간 정의 
# : 사용자 기준으로 데이터 범위의 간격을 구분하여 관측치를 나누는 방안 

# 분포 기반 구간 정의 
# : 관측치가 각 구간 내 동일한 개수로 구분되도록 나누는 방안

# 정규화 ( Normalization ) 
# : 데이터 탐색 및 기계학습 적용을 위한 연속형 변수 변환
# 최대 - 최소 정규화 
# : 데이터 구간을 0에서 1사이로 변환
# : 특정 데이터의 위치 파악 

# Z - 점수 정규화 
# : 0을 중심으로 양쪽으로 데이터 분포시킴 
# : 특정 데이터가 평균과 얼마나 떨어져 있는지 파악 

In [4]:
import numpy as np
import pandas as pd

category_enum = ('사설', '사회', '공학', '증권', '부동산', '경제', '정치', '스포츠', '연예')
journal_enum = ('A일보', 'B일보', 'C일보', 'D일보', 'E일보')

np.random.seed(1)
df_data = {
    'category' : [ category_enum[x] for x in np.random.randint(0, 9, 17734)],
    'journal' : [ journal_enum[x] for x in np.random.randint(0, 5, 17734)],
    'num_click' : [ ((x+5)**6//30)+9 if (x+5)**6 < 340000 else ((x+5)**6//1)  for x in np.random.randn(17734) ]
}

df = pd.DataFrame(df_data)

In [10]:
click_copy = df.copy()

In [5]:
cancer = pd.read_csv("./data/wdbc.data", encoding="cp949")
cancer.head(15)

Unnamed: 0,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
5,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
6,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
7,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
8,84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
9,845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,...,19.19,33.88,123.8,1150.0,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452


In [6]:
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 32 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   842302    568 non-null    int64  
 1   M         568 non-null    object 
 2   17.99     568 non-null    float64
 3   10.38     568 non-null    float64
 4   122.8     568 non-null    float64
 5   1001      568 non-null    float64
 6   0.1184    568 non-null    float64
 7   0.2776    568 non-null    float64
 8   0.3001    568 non-null    float64
 9   0.1471    568 non-null    float64
 10  0.2419    568 non-null    float64
 11  0.07871   568 non-null    float64
 12  1.095     568 non-null    float64
 13  0.9053    568 non-null    float64
 14  8.589     568 non-null    float64
 15  153.4     568 non-null    float64
 16  0.006399  568 non-null    float64
 17  0.04904   568 non-null    float64
 18  0.05373   568 non-null    float64
 19  0.01587   568 non-null    float64
 20  0.03003   568 non-null    float6

In [7]:
cancer_copy = cancer.copy()

In [8]:
cancer_copy.describe()

Unnamed: 0,842302,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
count,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,...,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0,568.0
mean,30423820.0,14.120491,19.305335,91.914754,654.279754,0.096321,0.104036,0.088427,0.048746,0.181055,...,16.25315,25.691919,107.125053,878.578873,0.132316,0.253541,0.271414,0.114341,0.289776,0.083884
std,125124600.0,3.523416,4.288506,24.285848,351.923751,0.014046,0.052355,0.079294,0.038617,0.027319,...,4.82232,6.141662,33.474687,567.846267,0.022818,0.156523,0.207989,0.065484,0.061508,0.018017
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869222.5,11.6975,16.1775,75.135,420.175,0.08629,0.064815,0.02954,0.02031,0.1619,...,13.01,21.095,84.1025,514.975,0.1166,0.1469,0.114475,0.06473,0.25035,0.071412
50%,906157.0,13.355,18.855,86.21,548.75,0.095865,0.092525,0.0614,0.033455,0.1792,...,14.965,25.425,97.655,685.55,0.1313,0.21185,0.22655,0.09984,0.28205,0.080015
75%,8825022.0,15.78,21.8025,103.875,782.625,0.1053,0.1304,0.12965,0.07373,0.195625,...,18.7675,29.7575,125.175,1073.5,0.146,0.3376,0.3814,0.161325,0.317675,0.092065
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [11]:
click_copy.describe()

Unnamed: 0,num_click
count,17734.0
mean,1033.469155
std,8596.249147
min,9.0
25%,223.0
50%,521.5
75%,1118.0
max,635158.0


In [12]:
# 지정 길이 기반 구간화
# 클릭 수 기반으로 기사 관심도 정의 기반 범주형 변환 (3개 구간 설정)

#c cut 함수 활용 기반 구간 설정 (사용자 기준)
bins = [0, 100, 500, np.max(click_copy['num_click'])]
names = ["low", "medium", "high"]

click_copy['pref'] = pd.cut(click_copy['num_click'], bins, labels= names)

In [13]:
#구간 결과 확인 
click_copy['pref'].value_counts()

pref
high      9090
medium    6726
low       1918
Name: count, dtype: int64

In [14]:
click_copy.head(15)

Unnamed: 0,category,journal,num_click,pref
0,경제,D일보,354.0,medium
1,연예,D일보,600.0,high
2,경제,E일보,869.0,high
3,사설,E일보,928.0,high
4,사설,A일보,2396.0,high
5,사회,E일보,320.0,medium
6,스포츠,E일보,345.0,medium
7,정치,D일보,831.0,high
8,공학,E일보,241.0,medium
9,부동산,A일보,818.0,high


In [15]:
# 분포 기반 구간화
# qcut의 경우 동일한 관측치가 배치되도록 구간을 분할
# 따라서 구간수로 나누게 됨
# 구간수 = n
n = 3 
click_copy['pref_qcut'] = pd.qcut(click_copy['num_click'], n, labels=names)

In [16]:
print(click_copy['pref_qcut'].value_counts())

pref_qcut
low       5929
high      5907
medium    5898
Name: count, dtype: int64


In [17]:
# 만일 qcut 실행 시 주로 발생하는 오류
# - 데이터가 극단치의 Skewed 한 경우 발생
# 데이터의 각 구간은 동일할 수 없기 때문
# 그러한 경우에는 분포 기반 구간화보다 사용자 기준 기반 범주화 혹은 구간 개수 변화 고려 

In [18]:
# 정규화
# 기계학습 알고리즘의 각 변수별 영향력의 차이를 조장할 필요 존재함 
# 각 변수별 범위가 다를 경우 학습 안정성이 떨어질 수 있으므로 간격 조정 필요

# 1. 최대 - 최소 정규화 : 변수 안의 값을 [0,1] 구간의 값을 갖도록 구성하고 데이터 군 내에서 특정 데이터가 가지는 위치를 보고자 할 때 사용
# 2. z-score 정규화 : 특정 데이터가 평균을 기준으로 얼마나 떨어져 있는지 파악하고 측정 스케일이 다른 경우의 데이터를 비교하는데 사용

# 최대-최소 정규화 (Min_Max Scaling)
# 변수 값을 0에서 1사이의 값으로 변환
# 함수는 sklearn 라이브러리에 쉽게 구현되어있음

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# 데이터 Copy
click_copy = df.copy()

#변환 
click_copy['minmax_values'] = scaler.fit_transform(click_copy[['num_click']])
click_copy.head(15)

Unnamed: 0,category,journal,num_click,minmax_values
0,경제,D일보,354.0,0.000543
1,연예,D일보,600.0,0.00093
2,경제,E일보,869.0,0.001354
3,사설,E일보,928.0,0.001447
4,사설,A일보,2396.0,0.003758
5,사회,E일보,320.0,0.00049
6,스포츠,E일보,345.0,0.000529
7,정치,D일보,831.0,0.001294
8,공학,E일보,241.0,0.000365
9,부동산,A일보,818.0,0.001274


In [6]:
click_copy.describe()

Unnamed: 0,num_click,minmax_values
count,17734.0,17734.0
mean,1033.469155,0.001613
std,8596.249147,0.013534
min,9.0,0.0
25%,223.0,0.000337
50%,521.5,0.000807
75%,1118.0,0.001746
max,635158.0,1.0


In [8]:
# Z-score 정규화 ( Z - Score Normalization ) : 표준화라고 불리우기도 함
# 변수를 평균이 0이고 표준편차가 1인 표준정규분포로 변환
# 함수는 sklearn 라이브러리에 쉽게 구현되어있음

from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

# 변환
click_copy['std_values'] = std_scaler.fit_transform(click_copy[['num_click']])
click_copy.head(15)

Unnamed: 0,category,journal,num_click,minmax_values,std_values
0,경제,D일보,354.0,0.000543,-0.079045
1,연예,D일보,600.0,0.00093,-0.050427
2,경제,E일보,869.0,0.001354,-0.019133
3,사설,E일보,928.0,0.001447,-0.01227
4,사설,A일보,2396.0,0.003758,0.158507
5,사회,E일보,320.0,0.00049,-0.083
6,스포츠,E일보,345.0,0.000529,-0.080092
7,정치,D일보,831.0,0.001294,-0.023554
8,공학,E일보,241.0,0.000365,-0.09219
9,부동산,A일보,818.0,0.001274,-0.025066


In [9]:
# 데이터 요약 
click_copy.describe()
# 이상치 제거 후 수치형 데이터를 변환하여 모델링 적용 필요 

Unnamed: 0,num_click,minmax_values,std_values
count,17734.0,17734.0,17734.0
mean,1033.469155,0.001613,8.414006e-18
std,8596.249147,0.013534,1.000028
min,9.0,0.0,-0.1191797
25%,223.0,0.000337,-0.09428438
50%,521.5,0.000807,-0.05955895
75%,1118.0,0.001746,0.009833734
max,635158.0,1.0,73.76966
