In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Two toy documents that share most of their vocabulary.
document1 = "I have a pen, I have an apple, apple pen."
document2 = "I have a pen, I have pineapple, pineapple pen."

# lowercase=False keeps "I" as-is, and the \w+ pattern accepts one-letter
# tokens such as "I" and "a" (both appear as columns below).
# The pattern is written as a raw string: in a plain literal '\w' is an
# invalid escape sequence, which recent Python versions warn about.
tv = TfidfVectorizer(lowercase=False, token_pattern=r'\w+',
                     norm='l1', smooth_idf=False)
model = tv.fit_transform([document1, document2])

# One row per document, one column per vocabulary term.
print(pd.DataFrame(model.toarray(), columns=tv.get_feature_names_out()))
# Expected output (norm='l1', so every row sums to 1):
#           I         a        an     apple      have       pen  pineapple
# 0  0.165571  0.082785  0.140168  0.280335  0.165571  0.165571   0.000000
# 1  0.192561  0.096281  0.000000  0.000000  0.192561  0.192561   0.326035

In [2]:

import numpy as np
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

# Four samples: a leading integer column followed by six categorical features.
X = np.array([
    [1, '周六', '吃饭', '晴天', '轻松', '清零', '精彩'],
    [6, '周六', '逛街', '晴天', '轻松', '平缓', '无聊'],
    [10, '周六', '-', '雨天', '轻松', '严峻', '无聊'],
    [13, '周六', '逛街', '晴天', '正常', '清零', '精彩'],
])
y = np.array(['是', '是', '否', '否'])

# Label binarization: the two classes become 0/1; squeeze flattens the result.
y_binary = LabelBinarizer().fit_transform(y)
print(y_binary.squeeze())
# [1 1 0 0]

# One-hot encode the six categorical features (columns 1 through 6).
enc = OneHotEncoder()
categorical = X[:, 1:7]
onehot = enc.fit_transform(categorical)
print(onehot.toarray())
# [[1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
#  [1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
#  [1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
#  [1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1.]]

# Original feature (x0..x5) and category behind each one-hot column.
onehot_names = enc.get_feature_names_out()
print(onehot_names)
# ['x0_周六' 'x1_-' 'x1_吃饭' 'x1_逛街' 'x2_晴天' 'x2_雨天' 'x3_正常' 'x3_轻松' 'x4_严峻'
#  'x4_平缓' 'x4_清零' 'x5_无聊' 'x5_精彩']

[1 1 0 0]
[[1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
 [1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1.]]
['x0_周六' 'x1_-' 'x1_吃饭' 'x1_逛街' 'x2_晴天' 'x2_雨天' 'x3_正常' 'x3_轻松' 'x4_严峻'
 'x4_平缓' 'x4_清零' 'x5_无聊' 'x5_精彩']


In [3]:

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# dtype=object keeps each cell's own Python type.  Without it NumPy promotes
# this mixed int/str/float table to a unicode-string array, which turns np.nan
# into the text 'nan' and leaves the numeric imputers relying on the strings
# being parsed back into floats.
X = np.array([
    [1, '周六', '吃饭', '晴天', '轻松', '清零', '精彩', 25.2, 0.5],
    [6, '周六', '逛街', '晴天', '轻松', '平缓', '无聊', np.nan, 2.0],
    [10, '周六', '-', '雨天', '轻松', '严峻', '无聊', 32.6, 8.2],
    [13, '周六', '逛街', '晴天', '正常', '清零', '精彩', 36.4, 9.8],
], dtype=object)

# Column 7 is the numeric feature holding the missing value (np.nan).
# Cast explicitly so the imputers receive a real float column.
col_with_nan = X[:, [7]].astype(float)

imp_mean = SimpleImputer(strategy='mean')
print(imp_mean.fit_transform(col_with_nan))  # fill with the column mean
# [[25.2]
#  [31.4]
#  [32.6]
#  [36.4]]

imp_median = SimpleImputer(strategy='median')
print(imp_median.fit_transform(col_with_nan))  # fill with the column median
# [[25.2]
#  [32.6]
#  [32.6]
#  [36.4]]

# Column 2 is categorical and uses '-' as its missing-value marker.
# X is already an object array, so no extra cast is needed here.
imp_frequent = SimpleImputer(missing_values='-', strategy='most_frequent')
print(imp_frequent.fit_transform(X[:, [2]]))  # fill with the most frequent value
# [['吃饭']
#  ['逛街']
#  ['逛街']
#  ['逛街']]

# The default estimator is BayesianRidge.
# Other options: DecisionTreeRegressor, ExtraTreesRegressor, KNeighborsRegressor.
imp_iter = IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=2))
# Impute the missing value in column 7 from the other two numeric columns.
print(imp_iter.fit_transform(X[:, [0, 7, 8]].astype(float)))
# [[ 1.  25.2  0.5]
#  [ 6.  28.9  2. ]
#  [10.  32.6  8.2]
#  [13.  36.4  9.8]]

[[25.2]
 [31.4]
 [32.6]
 [36.4]]
[[25.2]
 [32.6]
 [32.6]
 [36.4]]
[['吃饭']
 ['逛街']
 ['逛街']
 ['逛街']]
[[ 1.  25.2  0.5]
 [ 6.  28.9  2. ]
 [10.  32.6  8.2]
 [13.  36.4  9.8]]


In [4]:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, scale

# dtype=object keeps each cell's own Python type.  Without it NumPy promotes
# this mixed int/str/float table to a unicode-string array, and the scalers
# below only work because the strings get parsed back into floats.
X = np.array([
    [1, '周六', '吃饭', '晴天', '轻松', '清零', '精彩', 25.2, 0.5],
    [6, '周六', '逛街', '晴天', '轻松', '平缓', '无聊', 27.4, 2.0],
    [10, '周六', '学习', '雨天', '轻松', '严峻', '无聊', 32.6, 8.2],
    [13, '周六', '逛街', '晴天', '正常', '清零', '精彩', 36.4, 9.8],
], dtype=object)

# The three numeric columns as an explicit float matrix, sliced once and
# reused by every scaler below.
X_num = X[:, [0, 7, 8]].astype(float)

print(MinMaxScaler().fit_transform(X_num))  # per column: max becomes 1 and min becomes 0
# [[0.         0.         0.        ]
#  [0.41666667 0.19642857 0.16129032]
#  [0.75       0.66071429 0.82795699]
#  [1.         1.         1.        ]]

print(MaxAbsScaler().fit_transform(X_num))  # per column: max becomes 1
# [[0.07692308 0.69230769 0.05102041]
#  [0.46153846 0.75274725 0.20408163]
#  [0.76923077 0.8956044  0.83673469]
#  [1.         1.         1.        ]]

x = scale(X_num)
print(x)
# [[-1.44444444 -1.1861146  -1.17034706]
#  [-0.33333333 -0.68429689 -0.79077504]
#  [ 0.55555556  0.50181772  0.77812264]
#  [ 1.22222222  1.36859377  1.18299946]]

# Mean 0 (up to floating-point round-off) and standard deviation 1 per column.
print(x.mean(axis=0), x.std(axis=0))
# [ 5.55111512e-17  2.22044605e-16 -5.55111512e-17] [1. 1. 1.]

[[0.         0.         0.        ]
 [0.41666667 0.19642857 0.16129032]
 [0.75       0.66071429 0.82795699]
 [1.         1.         1.        ]]
[[0.07692308 0.69230769 0.05102041]
 [0.46153846 0.75274725 0.20408163]
 [0.76923077 0.8956044  0.83673469]
 [1.         1.         1.        ]]
[[-1.44444444 -1.1861146  -1.17034706]
 [-0.33333333 -0.68429689 -0.79077504]
 [ 0.55555556  0.50181772  0.77812264]
 [ 1.22222222  1.36859377  1.18299946]]
[ 5.55111512e-17  2.22044605e-16 -5.55111512e-17] [1. 1. 1.]


In [5]:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Six discrete categorical features after one-hot encoding (13 columns).
X = np.array([
    [1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.],
    [1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0.],
    [1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0.],
    [1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1.]
])
print(X.shape)
# (4, 13)

# The first column comes from the "time" feature: all four samples take the
# value "Saturday", so after one-hot encoding it is all ones with zero
# variance, and the selector drops it.
selector = VarianceThreshold(threshold=0.01)
XX = selector.fit_transform(X)
print(XX)
# [[1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
#  [0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
#  [0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
#  [0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1.]]

# One column fewer than the input.
print(XX.shape)
# (4, 12)

(4, 13)
[[1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
 [0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1.]]
(4, 12)
