In [10]:
# pip install missingno

In [38]:
import pandas as pd
import numpy as np
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
%matplotlib inline

In [39]:
# Load the cleaned feature table; the first CSV column becomes the index.
data = pd.read_csv('data_clear.csv', index_col=0)

# Integer-encode every object-dtype column except 'user'.
# Values are cast to str before encoding, so a missing entry is encoded as
# its string form rather than staying NaN.
categorical_cols = [name for name in data.select_dtypes('object').columns
                    if name != 'user']
for name in categorical_cols:
    data[name] = LabelEncoder().fit_transform(data[name].astype(str))

In [40]:
# Treat +/- infinity as missing so those cells are counted as nulls by the
# checks that follow.
data = data.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [41]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints how many columns ``df`` has and how many of them contain at
    least one null, then returns the per-column detail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null, indexed by
        the column name, with two columns:
        'Missing Values' (null count) and '% of Total Values' (null count
        as a percentage of ``len(df)``). Rows are sorted by percentage,
        largest first, and values are rounded to one decimal place.
    """
    # Null count per column (computed once and reused for the percentage).
    mis_val = df.isnull().sum()
    # Percentage of rows that are null. For an empty frame this is 0/0 -> NaN.
    mis_val_percent = 100 * mis_val / len(df)

    # Put both measures side by side, naming the columns directly.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that really have nulls. The filter uses the count
    # (not the percentage) so the NaN percentages of an empty frame are not
    # mistaken for missing data. Then sort by percentage, largest first.
    mis_val_table_ren_columns = mis_val_table[
        mis_val_table['Missing Values'] != 0
    ].sort_values('% of Total Values', ascending=False).round(1)

    # Print the headline numbers.
    print('Your selected dataframe has {} columns.\nThere are {} columns that have missing values.'.format(df.shape[1], mis_val_table_ren_columns.shape[0]))

    return mis_val_table_ren_columns

# Show which columns of `data` contain missing values (prints a summary and
# displays the per-column table).
missing_values_table(data)

Your selected dataframe has 747 columns.
There are 368 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
user_week_2_amount_count,27211,56.9
user_type1_45a1168437c708ff_min_day,26890,56.3
user_type1_45a1168437c708ff_amount_sum,26890,56.3
user_type1_45a1168437c708ff_amount_count,26890,56.3
user_type1_674e8d5860bc033d_amount_sum,26640,55.8
...,...,...
acc_card_ratio_x,2575,5.4
login_cnt_period2_login_cnt_ratio_x,1025,2.1
using_time_add_balance_avg,294,0.6
city_level_label_mean,155,0.3


In [42]:
# Null check for the FIRST column only (selected by position), not for the
# whole frame; a frame-wide check would be `data.isnull().any().any()`.
# `.iloc[0]` makes the positional lookup explicit: plain `[0]` on a Series
# indexed by column names relies on a positional fallback that pandas 2.x
# deprecates.
data.isnull().any().iloc[0]

False

In [43]:
# Names of the columns that contain at least one missing value, in their
# original column order. 'label' is excluded so it is never imputed; filtering
# it here (instead of list.remove) also works when 'label' has no nulls.
cols_with_null = [
    name
    for name, has_null in data.isnull().any().items()
    if has_null and name != 'label'
]

In [44]:
# Fill every missing value in the selected columns with the constant -1.
# One vectorised fillna over the sub-frame replaces building a separate
# imputer for each column; columns outside `cols_with_null` (including
# 'label') are left untouched.
data[cols_with_null] = data[cols_with_null].fillna(-1)

In [45]:
# Inspect the last rows of the frame after imputation.
data.tail()

Unnamed: 0,user,label,login_cnt_period1_x,province_x,city_x,sex_sex_x,sex_provider_x,sex_level_x,sex_verified_x,sex_regist_type_x,...,city_op2_cnt_mean,city_service1_cnt_mean,city_service1_amt_mean,city_agreement_total_mean,city_login_cnt_avg_mean,city_balance_avg_mean,city_balance2_mean,city_product1_amount_mean,city_product3_amount_mean,city_product7_cnt_mean
47777,Train_29906,,0.007333,17,121,0,0,2,0,3,...,0.022721,0.000458,0.000487,0.318445,0.021075,0.466463,0.407831,0.070281,0.024096,0.055735
47778,Train_32134,,0.0,26,156,0,0,1,1,1,...,0.02784,0.003364,0.005163,0.296551,0.021426,0.49099,0.410059,0.055066,0.016887,0.06011
47779,Train_28522,,0.031409,17,177,0,2,2,0,3,...,0.020835,0.000763,0.000813,0.364546,0.023398,0.482995,0.405276,0.134841,0.143216,0.055276
47780,Train_46346,,0.00789,21,71,0,0,2,0,1,...,0.022749,0.001026,0.001298,0.317989,0.020228,0.448649,0.388063,0.06006,0.004505,0.06006
47781,Train_08197,,0.011112,28,291,0,0,1,0,1,...,0.024291,0.000892,0.002189,0.295974,0.020448,0.427431,0.390221,0.058535,0.02734,0.053021


In [47]:
# Write the imputed frame to disk. 'label' was excluded from imputation, so
# that column can still contain NaN in the saved file.
data.to_csv('data_clear_nonull.csv')