In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import humanfriendly
from IPython.display import display
from loguru import logger as LOG
import tqdm
import time

In [2]:
mpl.rc('figure', facecolor='white')
print(mpl.rcParams['figure.facecolor'])

white


In [3]:
def display_df_info(df, name=None):
    print(f'{name or ""} {len(df)} rows')
    columns = []
    for col in df.columns:
        info = {
            'column': col, 
            'dtype': df[col].dtype,
            'memory_usage': humanfriendly.format_size(df[col].nbytes),
            'num_unique': df[col].nunique(),
            'num_null': df[col].isnull().sum(),
        }
        try:
            info['min'] = df[col].min()
            info['max'] = df[col].max()
        except (TypeError, ValueError):
            cnts = df[col].value_counts()
            info['min'] = cnts.idxmin()
            info['max'] = cnts.idxmax()
        columns.append(info)
    info = pd.DataFrame.from_records(columns)
    display(info)

In [4]:
df_train = pd.read_msgpack('data/z4_merge_train_full.msgpack')
df_validate = pd.read_msgpack('data/z4_merge_validate_full.msgpack')
df_test = pd.read_msgpack('data/z4_merge_test_full.msgpack')

In [5]:
dfs = dict(train=df_train, validate=df_validate, test=df_test)

In [6]:
raw_columns = [
    'user_id', 'merchant_id', 'coupon_id', 'discount_name',
    'date_received', 'date_received_name', 'date', 'date_name',
]
for df_name, df in dfs.items():
    columns = [x for x in raw_columns if x in df.columns]
    display_df_info(df[columns], df_name)

train 353442 rows


Unnamed: 0,column,dtype,max,memory_usage,min,num_null,num_unique
0,user_id,int64,7360952,2.83 MB,165,0,234724
1,merchant_id,int64,8854,2.83 MB,2,0,4878
2,coupon_id,object,9999,2.83 MB,1,0,7598
3,discount_name,object,95,2.83 MB,100:1,0,44
4,date_received,datetime64[ns],2016-05-31 00:00:00,2.83 MB,2016-04-01 00:00:00,0,61
5,date_received_name,object,月末周2,2.83 MB,吉周1,0,14


validate 90965 rows


Unnamed: 0,column,dtype,max,memory_usage,min,num_null,num_unique
0,user_id,int64,7360961,727.72 KB,4,0,76257
1,merchant_id,int64,8856,727.72 KB,3,0,2705
2,coupon_id,object,9999,727.72 KB,1,0,3348
3,discount_name,object,95,727.72 KB,100:1,0,41
4,date_received,datetime64[ns],2016-06-15 00:00:00,727.72 KB,2016-06-01 00:00:00,0,15
5,date_received_name,object,班周7,727.72 KB,吉周2,0,14


test 113640 rows


Unnamed: 0,column,dtype,max,memory_usage,min,num_null,num_unique
0,user_id,int64,7361024,909.12 KB,209,0,76309
1,merchant_id,int64,8856,909.12 KB,6,0,1559
2,coupon_id,object,9998,909.12 KB,100,0,2050
3,discount_name,object,95,909.12 KB,100:1,0,42
4,date_received,datetime64[ns],2016-07-31 00:00:00,909.12 KB,2016-07-01 00:00:00,0,31
5,date_received_name,object,月末周7,909.12 KB,周1,0,9


In [7]:
stats = []
compares = [('train', 'validate'), ('train', 'test'), ('validate', 'test')]
for df_name, df_base_name in compares:
    for column in ['user_id', 'merchant_id', 'coupon_id']:
        df, df_base = dfs[df_name], dfs[df_base_name]
        if column in df.columns and column in df_base.columns:
            a = set(df[column].unique())
            b = set(df_base[column].unique())
            miss = b - a
            stats.append(( df_name, df_base_name, column, len(a), len(b), len(miss), len(miss) / len(b), list(miss)[:4] ))
stats = pd.DataFrame.from_records(stats)
stats.columns = ['dataframe', 'base_dataframe', 'column', 'total', 'base_total', 'num_miss', 'miss_ratio', 'miss']
stats

Unnamed: 0,dataframe,base_dataframe,column,total,base_total,num_miss,miss_ratio,miss
0,train,validate,user_id,234724,76257,40546,0.531702,"[4, 6553604, 786439, 1703951]"
1,train,validate,merchant_id,4878,2705,302,0.111645,"[6657, 514, 4, 2053]"
2,train,validate,coupon_id,7598,3348,874,0.261051,"[12307, 6628, 1383, 10606]"
3,train,test,user_id,234724,76309,29516,0.386796,"[6684672, 524296, 6946838, 4980766]"
4,train,test,merchant_id,4878,1559,242,0.155228,"[2053, 6, 8200, 1032]"
5,train,test,coupon_id,7598,2050,1251,0.610244,"[7657, 1732, 896, 10251]"
6,validate,test,user_id,76257,76309,52206,0.684139,"[6684672, 262145, 1835014, 524296]"
7,validate,test,merchant_id,2705,1559,366,0.234766,"[4099, 6, 4105, 8202]"
8,validate,test,coupon_id,3348,2050,1063,0.518537,"[7657, 4574, 896, 2020]"


In [9]:
def split_column_name(name):
    if name in df_train.columns:
        dtype = df_train[name].dtype.name
    else:
        dtype = 'NA'
    parts = name.split('_', maxsplit=1)
    return parts[0], name, dtype

items = list(map(split_column_name, df_train.columns))
df = pd.DataFrame.from_records(items, columns=['prefix', 'name', 'dtype'])
df

Unnamed: 0,prefix,name,dtype
0,user,user_id,int64
1,merchant,merchant_id,int64
2,coupon,coupon_id,object
3,distance,distance,float64
4,date,date_received,datetime64[ns]
5,discount,discount_name,object
6,is,is_dazhe,bool
7,is,is_manjian,bool
8,discount,discount_man,float64
9,discount,discount_jian,float64


In [10]:
df.groupby(['prefix', 'dtype']).size().to_frame('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
prefix,dtype,Unnamed: 2_level_1
coupon,float64,14
coupon,object,1
date,bool,11
date,datetime64[ns],1
date,object,1
discount,float64,31
discount,object,1
distance,float64,3
distance,int64,12
is,bool,2
