## 1.3 数据变换 

### 导入所需库

In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### 1.3.1 数据标准化

In [3]:
# 样例数据
data = np.array([[100, 0.001],
                 [8, 0.05],
                 [50, 0.005],
                 [88, 0.07],
                 [4, 0.1]])

#### 归一化

In [4]:
from sklearn.preprocessing import MinMaxScaler

# 实例化归一化器
scaler = MinMaxScaler()

# 进行归一化
normalized_data = scaler.fit_transform(data)

print(normalized_data)

[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


#### 标准化

In [5]:
from sklearn.preprocessing import StandardScaler

# 实例化标准化器
scaler = StandardScaler()

# 进行标准化
standardized_data = scaler.fit_transform(data)

print(standardized_data)

[[ 1.26398112 -1.16389967]
 [-1.06174414  0.12639634]
 [ 0.         -1.05856939]
 [ 0.96062565  0.65304778]
 [-1.16286263  1.44302493]]


#### 正则化

In [6]:
from sklearn.preprocessing import Normalizer

# 实例化正则化器
normalizer = Normalizer()

# 进行正则化
regularized_data = normalizer.fit_transform(data)

print(regularized_data)

[[1.00000000e+00 1.00000000e-05]
 [9.99980469e-01 6.24987793e-03]
 [9.99999995e-01 9.99999995e-05]
 [9.99999684e-01 7.95454294e-04]
 [9.99687646e-01 2.49921912e-02]]


### 1.3.2 数据编码

In [2]:
# 样例数据
data = pd.DataFrame({'Grade': ['Low', 'Medium', 'High', 'Medium', 'Low']})
print(data)

    Grade
0     Low
1  Medium
2    High
3  Medium
4     Low


#### 顺序编码

In [10]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# 应用顺序编码
encoder = OrdinalEncoder()
data['Grade_encoded'] = encoder.fit_transform(data[['Grade']])
print(data)

    Grade  Grade_encoded
0     Low            1.0
1  Medium            2.0
2    High            0.0
3  Medium            2.0
4     Low            1.0


#### 独热编码

In [16]:
from sklearn.preprocessing import OneHotEncoder

# 应用独热编码
encoder = OneHotEncoder(sparse=False)
data_encoded = encoder.fit_transform(data[['Grade']])
print(data_encoded)

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


#### 频率编码

In [18]:
frequency = data['Grade'].value_counts(normalize=True)

# 应用频率编码
data['Grade_encoded'] = data['Grade'].map(frequency)
print(data)

    Grade  Grade_encoded
0     Low            0.4
1  Medium            0.4
2    High            0.2
3  Medium            0.4
4     Low            0.4


#### 目标编码

In [3]:

# 假设有目标变量
data['Target'] = [1, 2, 0, 2, 1]

# 应用目标编码
encoder = ce.TargetEncoder()
data['Grade_encoded'] = encoder.fit_transform(data['Grade'], data['Target'])
print(data)

    Grade  Target  Grade_encoded
0     Low       1       1.171630
1  Medium       2       1.313481
2    High       0       1.043870
3  Medium       2       1.313481
4     Low       1       1.171630


#### 模型编码

In [4]:
import category_encoders as ce

# 使用catboost编码
encoder = ce.CatBoostEncoder()
data['Grade_encoded'] = encoder.fit_transform(data['Grade'], data['Target'])
print(data)

    Grade  Target  Grade_encoded
0     Low       1            1.2
1  Medium       2            1.2
2    High       0            1.2
3  Medium       2            1.6
4     Low       1            1.1


### 1.3.3 分布变换

In [14]:
# 样例数据
data = {'value': [10, 100, 1000, 10000]}
df = pd.DataFrame(data)
print(df)

   value
0     10
1    100
2   1000
3  10000


#### log变换

In [7]:
df['log_transformed'] = np.log(df['value'])
print(df)

   value  log_transformed
0     10         2.302585
1    100         4.605170
2   1000         6.907755
3  10000         9.210340


#### box-cox变换

In [12]:
from scipy import stats

# Box-Cox变换
df['box_cox_value'], fitted_lambda = stats.boxcox(df['value'])

print(df)
print('Optimal lambda:', fitted_lambda)

   value  box_cox_value
0     10       2.302585
1    100       4.605170
2   1000       6.907755
3  10000       9.210341
Optimal lambda: 4.11631264713494e-09


#### yeo-johnson变换

In [17]:
from scipy.stats import johnsonsu

# Johnson变换
df['johnson_transformed'] = johnsonsu.fit(df['value'])

print(df)

   value  johnson_transformed
0     10        -2.075367e+00
1    100         9.872263e-02
2   1000         1.000000e+01
3  10000         2.830347e-09


### 1.3.4 数据抽样

In [25]:
# 样例数据
np.random.seed(0)
df = pd.DataFrame({
    'id': range(1, 101),  # 100个样本点
    'value': np.random.randn(100)  # 一些随机值
})
print(df)

     id     value
0     1  1.764052
1     2  0.400157
2     3  0.978738
3     4  2.240893
4     5  1.867558
..  ...       ...
95   96  0.706573
96   97  0.010500
97   98  1.785870
98   99  0.126912
99  100  0.401989

[100 rows x 2 columns]


#### 随机抽样

In [20]:
sample_simple = df.sample(n=10)  # 随机选择10个样本
print(sample_simple)

    id     value
19  20 -0.854096
6    7  0.950088
45  46 -0.438074
57  58  0.302472
22  23  0.864436
25  26 -1.454366
48  49 -1.613898
30  31  0.154947
59  60 -0.362741
73  74 -1.234826


#### 分层抽样

In [21]:
from sklearn.model_selection import train_test_split

# 假设df中的数据根据某个特征可以被分成几个层
df['strata'] = np.where(df['value'] > 0, 'positive', 'negative')  # 基于value列创建分层依据

# 分层抽样
# 10%的数据作为样本
stratified_sample = train_test_split(df, test_size=0.1, stratify=df['strata'])[1] 
print(stratified_sample)

    id     value    strata
14  15  0.443863  positive
24  25  2.269755  positive
16  17  1.494079  positive
35  36  0.156349  positive
59  60 -0.362741  negative
0    1  1.764052  positive
83  84 -1.536244  negative
49  50 -0.212740  negative
8    9 -0.103219  negative
78  79 -0.311553  negative


#### 整体抽样

In [23]:
df['cluster'] = df['id'] % 10  # 创建10个整体

# 整体抽样
selected_clusters = np.random.choice(df['cluster'].unique(), 2, replace=False)  # 随机选择2个整体
cluster_sample = df[df['cluster'].isin(selected_clusters)]  # 抽取这两个整体的数据
print(cluster_sample)

    id     value    strata  cluster
1    2  0.400157  positive        2
5    6 -0.977278  negative        6
11  12  1.454274  positive        2
15  16  0.333674  positive        6
21  22  0.653619  positive        2
25  26 -1.454366  negative        6
31  32  0.378163  positive        2
35  36  0.156349  positive        6
41  42 -1.420018  negative        2
45  46 -0.438074  negative        6
51  52  0.386902  positive        2
55  56  0.428332  positive        6
61  62 -0.359553  negative        2
65  66 -0.401781  negative        6
71  72  0.128983  positive        2
75  76 -0.684810  negative        6
81  82  0.900826  positive        2
85  86  1.895889  positive        6
91  92  1.222445  positive        2
95  96  0.706573  positive        6


#### 系统抽样

In [26]:
k = 10  # 每隔9个样本选择一个样本
systematic_sample = df.iloc[::k, :]  # 选择系统样本
print(systematic_sample)

    id     value
0    1  1.764052
10  11  0.144044
20  21 -2.552990
30  31  0.154947
40  41 -1.048553
50  51 -0.895467
60  61 -0.672460
70  71  0.729091
80  81 -1.165150
90  91 -0.403177
