In [3]:
import pandas as pd
import numpy as np
from scipy import stats

In [4]:
def fill_fre_top_5(x):
    """Return the first five entries of ``x`` as a length-5 array, NaN-padded.

    Used to build fixed-width "top 5 most frequent" rows: inputs shorter than
    five are right-padded with NaN, longer inputs are truncated to their first
    five entries (previously such inputs fell through and returned ``None``).

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of values or counts.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(5,)``. It is float64 whenever the entries can be
        stored as floats; otherwise (e.g. string labels) an object array is
        returned so the labels are preserved instead of raising.
    """
    values = np.asarray(x)[:5]
    padded = np.full(5, np.nan)
    try:
        padded[:len(values)] = values
    except (TypeError, ValueError):
        # Entries are not float-convertible (categorical labels, timestamps...):
        # fall back to an object buffer that can hold them next to NaN padding.
        padded = np.full(5, np.nan, dtype=object)
        padded[:len(values)] = values
    return padded

In [25]:
def _summarize_column(col, miss_values, n_rows):
    """Build the summary record (a dict of statistics) for one column."""
    # Series.isin treats NaN as a match, unlike np.isin, so a NaN sentinel in
    # miss_values really removes NaN cells here.
    miss_mask = col.isin(miss_values)
    valid = col[~miss_mask]
    values = valid.to_numpy()

    if values.size:
        # np.unique returns sorted uniques, and argmax picks the first maximum,
        # so ties resolve to the smallest value (same convention as
        # scipy.stats.mode) without depending on the SciPy version.
        uniques, counts = np.unique(values, return_counts=True)
        top = counts.argmax()
        mode, mode_count = uniques[top], int(counts[top])
        median = np.median(values)
    else:
        # Nothing left after removing missing values.
        mode, mode_count, median = np.nan, 0, np.nan

    record = {
        'count': col.nunique(),                # distinct non-NaN values (sentinels included)
        'count_zero': int((col == 0).sum()),
        'mean': valid.mean(),
        'median': median,
        'mode': mode,
        'mode_count': mode_count,
        # Share of ALL rows (missing ones included) taken by the mode.
        'mode_percentage': mode_count / n_rows if n_rows else np.nan,
        'min': valid.min(),
        'max': valid.max(),
    }

    # Five most frequent valid values and their counts, NaN-padded to width 5.
    top5 = valid.value_counts().iloc[:5]
    top_values = fill_fre_top_5(top5.index.values)
    top_counts = fill_fre_top_5(top5.values)
    for rank in range(5):
        record['value%d' % (rank + 1)] = top_values[rank]
    for rank in range(5):
        record['freq%d' % (rank + 1)] = top_counts[rank]

    record['freq_miss'] = int(miss_mask.sum())
    return record


def eda_analysis_v1(df = None, missSet = [np.nan, 9999999999, -999999]):
    """Summarize every column of ``df`` in a one-row-per-column table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile. Required in practice even though the default is
        ``None``. The mean/median statistics need numeric columns.
    missSet : iterable, optional
        Values treated as "missing". They are removed before computing mean,
        median, mode, min, max and the top-5 frequencies, and counted in
        ``freq_miss``. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``df.columns`` with the columns, in order: ``count``
        (number of distinct values), ``count_zero``, ``mean``, ``median``,
        ``mode``, ``mode_count``, ``mode_percentage`` (mode count divided by
        the total number of rows), ``min``, ``max``, ``value1..value5`` /
        ``freq1..freq5`` (five most frequent values and their counts,
        NaN-padded) and ``freq_miss``.
    """
    miss_values = list(missSet)
    n_rows = df.shape[0]

    # One record per column; iterating df.items() keeps the column order.
    records = [_summarize_column(col, miss_values, n_rows) for _, col in df.items()]

    summary_columns = (
        ['count', 'count_zero', 'mean', 'median', 'mode', 'mode_count',
         'mode_percentage', 'min', 'max']
        + ['value%d' % k for k in range(1, 6)]
        + ['freq%d' % k for k in range(1, 6)]
        + ['freq_miss']
    )
    return pd.DataFrame(records, index=df.columns, columns=summary_columns)
  
  
  

In [27]:
import timeit

# Load the training data from the path relative to the notebook.
df = pd.read_csv('./data/train.csv')

# Time a single EDA pass over the first three columns, spelling out the
# missing-value sentinels explicitly.
start = timeit.default_timer()
eda_analysis_v1(df.iloc[:, :3], missSet=[np.nan, 9999999999, -999999])
end = timeit.default_timer()

elapsed = end - start
print('EDA Running Time: {0:.2f} seconds'.format(elapsed))


EDA Running Time: 0.03 seconds
