In [354]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [355]:
def getFeatures(factor):
    """Collapse a block of indicator columns into one PCA score per row.

    Each column is first divided by its standard deviation (the scaler is
    configured with ``with_mean=False``, so it does not centre the data), and
    the result is projected onto the first principal component.

    As a side effect, the explained-variance ratio of that component is
    printed so the reader can judge how much information the single score
    retains.

    Parameters
    ----------
    factor : 2-D array-like or DataFrame
        Rows are samples, columns are the indicators to combine.

    Returns
    -------
    numpy.ndarray
        The first-principal-component score for each row, as returned by
        ``PCA(n_components=1).fit_transform``.
    """
    unit_variance = StandardScaler(with_mean=False, with_std=True).fit_transform(factor)
    first_component = PCA(n_components=1)
    scores = first_component.fit_transform(unit_variance)
    # NOTE(review): the sign of a principal component is arbitrary, so "higher
    # score" is not guaranteed to mean "higher raw indicator" - confirm the
    # orientation before ranking on it.
    print(first_component.explained_variance_ratio_)
    return scores

In [356]:
# Load the pre-processed table and echo it for a quick visual check.
df = pd.read_excel('./handledData.xlsx')
print(df)

# Split the indicator columns into the four factor groups that are later
# reduced to one score each (R, F, M, N).
factor_R = df[["last_mp_days"]]

factor_F = df[[
    'consume_num_session12',
    'consume_num_session6',
    'consume_num_session3',
    'consume_num_session',
    'six_bill_num',
    'six_cycle_mp_num',
    'epp_nbr_12m',
]]

factor_M = df[[
    'six_bill_avg_amt',
    'consume_amt_session12',
    'consume_amt_session6',
    'consume_amt_session3',
    'consume_amt_session',
    'six_cycle_mp_avg_amt',
]]

factor_N = df[[
    'six_bill_low_repay_num',
    'six_bill_avg_debt_rate',
]]

      Unnamed: 0    id  y  city  mar_status  age  xaccount_age  cred_limit  \
0              0     2  0    52           0   32            10       20000   
1              1    17  0    35           0   34            17       28500   
2              2    22  0    40           0   28            17        5000   
3              3    27  0    21           2   26            19       60000   
4              4    29  0     1           1   36           138       46000   
...          ...   ... ..   ...         ...  ...           ...         ...   
9581        9581  9984  1    89           0   34            22        2000   
9582        9582  9989  1   104           0   53             7        2250   
9583        9583  9990  1    16           0   34            15       50000   
9584        9584  9993  1    72           0   38             7       30000   
9585        9585  9996  1   125           0   32            12       30000   

      this_bill_rate  this_bill_mp  ...  consume_num_session6  

In [357]:
# Reduce each factor group to a single PCA score; getFeatures prints the
# explained-variance ratio for each group in R, F, M, N order.
feature_R, feature_F, feature_M, feature_N = (
    getFeatures(block)
    for block in (factor_R, factor_F, factor_M, factor_N)
)

[1.]
[0.53292194]
[0.71817122]
[0.51816634]


In [358]:
# Wrap each score array in a DataFrame with a single 'feature' column
# (feature1..feature4 correspond to R, F, M, N respectively).
feature1, feature2, feature3, feature4 = (
    pd.DataFrame(scores, columns=['feature'])
    for scores in (feature_R, feature_F, feature_M, feature_N)
)

In [359]:
# Minimum, the four quintile cut points, and maximum of the R-factor score.
quantiles = feature1['feature'].quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1])
quantiles

0.0   -1.386004
0.2   -1.351356
0.4    0.752349
0.6    0.752349
0.8    0.752349
1.0    0.752349
Name: feature, dtype: float64

In [360]:
# Recompute the cut points here instead of trusting whatever `quantiles` the
# kernel last held: the later scoring cells reassign that name, so re-running
# this cell out of order would otherwise bin feature1 with another factor's
# quantiles. This also matches how the F, M and N cells are written.
quantiles = feature1['feature'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1])

# Open-ended outer edges so every score falls in one of five bins. Tied cut
# points are allowed by np.digitize (bins only need to be monotonic); they
# simply leave some of the five bins empty.
bins = [-np.inf, quantiles[0.2], quantiles[0.4], quantiles[0.6], quantiles[0.8], np.inf]

# np.digitize numbers the bins 1..5 from the lowest scores to the highest.
feature1['quintile'] = np.digitize(feature1['feature'], bins)

# Reverse the scale for this factor: the lowest-scoring bin is rated 5 and
# the highest-scoring bin is rated 1.
feature1['quintile'] = feature1['quintile'].map({1: 5, 2: 4, 3: 3, 4: 2, 5: 1})
feature1

Unnamed: 0,feature,quintile
0,-1.383010,5
1,0.752349,1
2,0.752349,1
3,0.752349,1
4,-1.382582,5
...,...,...
9581,-1.366755,5
9582,0.752349,1
9583,-1.375952,5
9584,-1.373385,5


In [361]:
# Quintile rating for the F-factor score: bin 1 holds the lowest scores and
# bin 5 the highest (no reversal for this factor).
quantiles = feature2['feature'].quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1])
bins = [
    -np.inf,
    *(quantiles[cut] for cut in (0.2, 0.4, 0.6, 0.8)),
    np.inf,
]
feature2['quintile'] = np.digitize(feature2['feature'], bins)
feature2

Unnamed: 0,feature,quintile
0,-0.497415,3
1,1.074445,5
2,-0.150728,4
3,2.111500,5
4,0.565410,5
...,...,...
9581,-0.781247,2
9582,-0.786455,2
9583,11.664120,5
9584,-0.602576,3


In [362]:
# Quintile rating for the M-factor score: bin 1 holds the lowest scores and
# bin 5 the highest (no reversal for this factor).
quantiles = feature3['feature'].quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1])
bins = [
    -np.inf,
    *(quantiles[cut] for cut in (0.2, 0.4, 0.6, 0.8)),
    np.inf,
]
feature3['quintile'] = np.digitize(feature3['feature'], bins)
feature3

Unnamed: 0,feature,quintile
0,-0.473913,3
1,1.711420,5
2,0.473500,4
3,6.229929,5
4,3.616218,5
...,...,...
9581,-1.243306,1
9582,-1.122359,1
9583,2.013792,5
9584,0.317023,4


In [363]:
# Quintile rating for the N-factor score, on a reversed scale: np.digitize
# numbers the bins 1..5 from lowest to highest score, and the mapping below
# flips that so the lowest-scoring bin is rated 5.
quantiles = feature4['feature'].quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1])
bins = [
    -np.inf,
    *(quantiles[cut] for cut in (0.2, 0.4, 0.6, 0.8)),
    np.inf,
]
raw_bin = np.digitize(feature4['feature'], bins)
feature4['quintile'] = raw_bin
feature4['quintile'] = feature4['quintile'].map({1: 5, 2: 4, 3: 3, 4: 2, 5: 1})
feature4

Unnamed: 0,feature,quintile
0,-0.056785,2
1,-0.491470,4
2,-0.282813,3
3,-0.130009,3
4,-0.280136,3
...,...,...
9581,-0.374761,3
9582,-0.446775,3
9583,-0.194359,3
9584,1.127066,1


In [364]:
# Compute the entropy value of each indicator
def calculate_entropy(series):
    """Return the normalised Shannon entropy of a non-negative indicator.

    The values are converted to proportions ``p_i = x_i / sum(x)`` and the
    result is ``-sum(p_i * ln(p_i + 1e-10)) / ln(n)``, where ``n`` is the
    number of observations. Dividing by ``ln(n)`` scales the entropy so that
    a perfectly uniform indicator scores (approximately) 1.

    Parameters
    ----------
    series : pandas.Series or array-like
        Indicator values, one per observation along the first axis. A 2-D
        input is processed column by column.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The entropy (a scalar for 1-D input, one value per column for 2-D
        input). ``NaN`` is returned when the entropy is undefined: fewer than
        two observations (``ln(n)`` would be zero or undefined) or a column
        total of zero (the proportions cannot be formed).
    """
    values = np.asarray(series, dtype=float)
    n = values.shape[0]
    total = values.sum(axis=0)

    if n < 2:
        # ln(1) == 0 would turn the normalisation into a division by zero.
        return np.full(np.shape(total), np.nan)[()]

    # A zero total is replaced by NaN so the result is an explicit NaN rather
    # than the by-product of a 0/0 division.
    probabilities = values / np.where(total == 0, np.nan, total)

    # The small epsilon keeps log() finite where a proportion is exactly 0.
    # Vectorised np.sum replaces the element-by-element builtin sum().
    entropy = -np.sum(probabilities * np.log(probabilities + 1e-10), axis=0) / np.log(n)
    return entropy

In [365]:
# Rescale every factor score to [0, 1]. The same scaler object is refitted
# for each frame in turn, so it ends up fitted on feature4.
scaler = MinMaxScaler()
for frame in (feature1, feature2, feature3, feature4):
    frame['feature'] = scaler.fit_transform(frame['feature'].to_numpy().reshape(-1, 1))
feature1

Unnamed: 0,feature,quintile
0,0.001400,5
1,1.000000,1
2,1.000000,1
3,1.000000,1
4,0.001600,5
...,...,...
9581,0.009002,5
9582,1.000000,1
9583,0.004701,5
9584,0.005901,5


In [366]:
# Entropy of each rescaled factor score, in R, F, M, N order.
entropies_R, entropies_F, entropies_M, entropies_N = (
    calculate_entropy(frame['feature'])
    for frame in (feature1, feature2, feature3, feature4)
)

In [367]:
# Sum of the four divergence terms (1 - entropy). The expression is kept in
# its original left-to-right form so floating-point rounding is unchanged.
difference_coefficients = 1 - entropies_R + 1 - entropies_F + 1 - entropies_M + 1 - entropies_N

# Each factor's weight is its own divergence as a share of the total, so the
# four weights sum to 1.
weights_R, weights_F, weights_M, weights_N = (
    (1 - factor_entropy) / difference_coefficients
    for factor_entropy in (entropies_R, entropies_F, entropies_M, entropies_N)
)

In [368]:
# Display the final entropy weights in R, F, M, N order.
(weights_R, weights_F, weights_M, weights_N)

(0.21665431968107726,
 0.3256325238762889,
 0.3648982601513716,
 0.09281489629126166)