## denoiseから前処理まで
* S2

In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc,os,random
import time,datetime
from tqdm import tqdm
from multiprocessing import Pool as ThreadPool

from utils import reduce_mem_usage

In [2]:
def one_hot_encoding(df, cols, is_drop=True):
    """Append one-hot indicator columns for each column named in ``cols``.

    For every name in ``cols`` the indicator columns produced by
    ``pd.get_dummies`` are concatenated to the right of ``df``; they are
    named ``oneHot_<col>_<value>``.

    Args:
        df: Input frame containing every column listed in ``cols``.
        cols: Names of the columns to encode.
        is_drop: When true, the source columns in ``cols`` are removed from
            the result.

    Returns:
        The frame with the indicator columns appended.
    """
    for name in cols:
        print("one hot encoding: ", name)
        indicators = pd.get_dummies(df[name], prefix="oneHot_%s" % name)
        df = pd.concat([df, indicators], axis=1)

    if not is_drop:
        return df

    df.drop(cols, axis=1, inplace=True)
    return df

In [3]:
def cat_feature(df):
    """Aggregate one-hot and categorical columns to one row per customer.

    Reads the module-level names ``lastk`` and ``cat_features``. When
    ``lastk`` is ``None`` the ``last`` statistic is included; otherwise it is
    left out.

    Args:
        df: Frame with ``customer_ID``, ``S_2``, the columns listed in
            ``cat_features`` and any columns whose name contains ``oneHot``.

    Returns:
        A frame with ``customer_ID`` followed by the aggregated one-hot
        statistics, the categorical statistics and the ``S_2`` row count.
    """
    include_last = lastk is None
    per_customer = df.groupby("customer_ID", sort=False)

    # One-hot columns: name and statistic are joined WITHOUT a separator.
    one_hot_cols = [name for name in df.columns if "oneHot" in name]
    one_hot_stats = ["mean", "std", "sum"]
    if include_last:
        one_hot_stats.append("last")
    one_hot_agg = per_customer[one_hot_cols].agg(one_hot_stats)
    one_hot_agg.columns = ["".join(pair) for pair in one_hot_agg.columns]

    # Raw categorical columns: name and statistic are joined with "_".
    cat_stats = ["last", "nunique"] if include_last else ["nunique"]
    cat_agg = per_customer[cat_features].agg(cat_stats)
    cat_agg.columns = ["_".join(pair) for pair in cat_agg.columns]

    # Number of rows (non-null S_2 values) per customer.
    row_counts = per_customer[["S_2"]].agg(["count"])
    row_counts.columns = ["".join(pair) for pair in row_counts.columns]

    df = pd.concat([one_hot_agg, cat_agg, row_counts], axis=1).reset_index()
    print("cat feature shape after engineering", df.shape)
    return df

In [4]:
def num_feature(df):
    """Aggregate the numeric columns to one row per customer.

    Reads the module-level names ``num_features`` and ``lastk``:

    * if the first entry of ``num_features`` starts with ``rank_`` only the
      ``last`` value per customer is kept and nothing is quantised;
    * otherwise mean/std/min/max/sum are computed (plus ``last`` when
      ``lastk`` is ``None``) and every aggregate is floor-divided by 0.01.

    Args:
        df: Frame with ``customer_ID`` and the columns in ``num_features``.

    Returns:
        A frame with ``customer_ID`` followed by ``<column>_<statistic>``
        columns.
    """
    already_ranked = num_features[0].startswith("rank_")

    if already_ranked:
        stats = ["last"]
    elif lastk is None:
        stats = ["mean", "std", "min", "max", "sum", "last"]
    else:
        stats = ["mean", "std", "min", "max", "sum"]

    aggregated = df.groupby("customer_ID", sort=False)[num_features].agg(stats)
    aggregated.columns = ["_".join(pair) for pair in aggregated.columns]

    if not already_ranked:
        # Quantise to steps of 0.01. Original author's note (translated):
        # "probably x = np.floor(x * 100) / 100 without the final / 100".
        aggregated = aggregated // 0.01

    result = aggregated.reset_index()
    print("num feature shape after engineering", result.shape)
    return result

In [5]:
def diff_feature(df):
    """Aggregate row-to-row differences of the numeric columns per customer.

    Reads the module-level names ``num_features`` and ``lastk``. Each numeric
    column is differenced within its customer group, the differences are
    summarised with mean/std/min/max/sum (plus ``last`` when ``lastk`` is
    ``None``) and every aggregate is floor-divided by 0.01.

    Args:
        df: Frame with ``customer_ID`` and the columns in ``num_features``.

    Returns:
        A frame with ``customer_ID`` followed by
        ``diff_<column>_<statistic>`` columns.
    """
    customer_ids = df["customer_ID"].values

    # The within-customer diff drops the key column, so it is re-attached.
    deltas = df.groupby("customer_ID")[num_features].diff().add_prefix("diff_")
    deltas.insert(0, "customer_ID", customer_ids)

    stats = ["mean", "std", "min", "max", "sum"]
    if lastk is None:
        stats.append("last")

    delta_cols = [f"diff_{name}" for name in num_features]
    aggregated = deltas.groupby("customer_ID", sort=False)[delta_cols].agg(stats)
    aggregated.columns = ["_".join(pair) for pair in aggregated.columns]
    aggregated = aggregated // 0.01  # quantise to steps of 0.01

    df = aggregated.reset_index()
    print("diff feature shape after engineering", df.shape)

    return df

In [6]:
# Number of worker processes / data chunks. os.cpu_count() returns None when
# the count cannot be determined, so fall back to a single worker.
n_cpu = os.cpu_count() or 1

# Numeric transforms to generate, indexed in parallel with the window list
# [None, 3, 6] iterated below: all three variants for the full history, only
# the untransformed features for the windowed runs.
transform = [["", "rank_", "ym_rank_"], [""], [""]]

In [7]:
# Build the aggregated feature tables for every (window, transform) pair.
# `lastk` is how many of each customer's most recent statements are kept
# (None keeps all of them); `prefix` selects the numeric transform and is used
# in the output file names.
# NOTE: cat_feature / num_feature read `lastk`, `cat_features` and
# `num_features` as module-level globals, so these must remain top-level names
# and be assigned before the worker pool is created (this presumably relies on
# a fork-based start method so workers inherit them -- confirm on your platform).
for li, lastk in enumerate([None, 3, 6]):
    for prefix in transform[li]:
        print(li, lastk, prefix)
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df = pd.concat(
            [
                pd.read_pickle("./input/train_denoise.pkl"),
                pd.read_pickle("./input/test_denoise.pkl"),
            ]
        ).reset_index(drop=True)

        all_cols = [c for c in df.columns if c not in ["customer_ID", "S_2"]]
        cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
        num_features = [col for col in all_cols if col not in cat_features]

        # Apparently filling missing S_* and P_* values with 0 is acceptable
        # (the original author marked this as uncertain).
        for col in df.columns:
            if ("S_" in col or "P_" in col) and col != "S_2":
                df[col] = df[col].fillna(0)

        if lastk is not None:
            prefix = f"last{lastk}_" + prefix
            print("all df shape", df.shape)
            # Rank each customer's statements from newest (1) to oldest by
            # S_2 and keep only the newest `lastk`. The previous code assigned
            # a multi-column percentile rank of the numeric features to this
            # single column, which cannot select "the last k rows".
            df["rank"] = df.groupby("customer_ID")["S_2"].rank(ascending=False)
            df = df.loc[df["rank"] <= lastk].reset_index(drop=True)
            df = df.drop(["rank"], axis=1)
            print(f"last {lastk} shape", df.shape)

        if prefix == "rank_":
            # Percentile rank of every numeric value within its customer.
            cids = df["customer_ID"].values
            df = df.groupby("customer_ID")[num_features].rank(pct=True).add_prefix("rank_")
            df.insert(0, "customer_ID", cids)
            num_features = [f"rank_{col}" for col in num_features]

        if prefix == "ym_rank_":
            # Percentile rank of every numeric value within its year-month
            # (the first 7 characters of S_2).
            cids = df["customer_ID"].values
            df["ym"] = df["S_2"].apply(lambda x: x[:7])
            df = df.groupby("ym")[num_features].rank(pct=True).add_prefix("ym_rank_")
            # The transform drops the customer key; the batching below and the
            # aggregation workers need it back.
            df.insert(0, "customer_ID", cids)
            num_features = [f"ym_rank_{col}" for col in num_features]

        if prefix in ["", "last3_"]:
            df = one_hot_encoding(df, cat_features, False)

        # Split the frame into at most n_cpu row-contiguous chunks that never
        # cut a customer in half. This assumes each customer's rows are stored
        # contiguously. groupby(sort=False) yields customers in order of first
        # appearance, so the cumulative sizes are the row offsets at which
        # each customer ends.
        cum_rows = df.groupby("customer_ID", sort=False).size().cumsum().to_numpy()
        batch_size = max(1, int(np.ceil(len(cum_rows) / n_cpu)))
        dfs = []
        start = 0
        for lo in range(0, len(cum_rows), batch_size):
            # Positional access on a NumPy array; `vc_[-1]` on a label-indexed
            # Series relied on a deprecated positional fallback.
            end = int(cum_rows[lo:lo + batch_size][-1])
            dfs.append(df.iloc[start:end])
            start = end
        del df; gc.collect()

        # Parallel processing. `ThreadPool` is the file's alias for
        # multiprocessing.Pool, i.e. these are worker processes.
        pool = ThreadPool(n_cpu)
        try:
            if prefix in ["", "last3_"]:
                cat_feature_df = pd.concat(pool.map(cat_feature, tqdm(dfs, desc="cat_feature"))).reset_index(drop=True)
                cat_feature_df = reduce_mem_usage(cat_feature_df)
                cat_feature_df.to_pickle(f"./input/{prefix}cat_feature.pkl")
                del cat_feature_df; gc.collect()

            if prefix in ["", "last3_", "last6_", "rank_", "ym_rank_"]:
                num_feature_df = pd.concat(pool.map(num_feature, tqdm(dfs, desc="num_feature"))).reset_index(drop=True)
                num_feature_df = reduce_mem_usage(num_feature_df)
                num_feature_df.to_pickle(f"./input/{prefix}num_feature.pkl")
                del num_feature_df; gc.collect()

            # NOTE: diff-feature generation is disabled in this run; uncomment
            # to also write ./input/{prefix}diff_feature.pkl.
            # if prefix in ["", "last3_"]:
            #     diff_feature_df = pd.concat(pool.map(diff_feature, tqdm(dfs, desc="diff_feature"))).reset_index(drop=True)
            #     diff_feature_df = reduce_mem_usage(diff_feature_df)
            #     diff_feature_df.to_pickle(f"./input/{prefix}diff_feature.pkl")
            #     del diff_feature_df; gc.collect()
        finally:
            # Always release the worker processes, even if a step above fails.
            pool.close()
            pool.join()

0 None 
one hot encoding:  B_30
one hot encoding:  B_38
one hot encoding:  D_114
one hot encoding:  D_116
one hot encoding:  D_117
one hot encoding:  D_120
one hot encoding:  D_126
one hot encoding:  D_63
one hot encoding:  D_64
one hot encoding:  D_66
one hot encoding:  D_68


cat_feature:  19%|█▉        | 3/16 [00:03<00:14,  1.15s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  25%|██▌       | 4/16 [00:04<00:14,  1.18s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  31%|███▏      | 5/16 [00:05<00:13,  1.20s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  38%|███▊      | 6/16 [00:07<00:12,  1.24s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  44%|████▍     | 7/16 [00:08<00:11,  1.24s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  50%|█████     | 8/16 [00:09<00:09,  1.24s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  56%|█████▋    | 9/16 [00:10<00:08,  1.25s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  62%|██████▎   | 10/16 [00:12<00:07,  1.25s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  69%|██████▉   | 11/16 [00:13<00:06,  1.24s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  75%|███████▌  | 12/16 [00:14<00:04,  1.23s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  81%|████████▏ | 13/16 [00:15<00:03,  1.23s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  88%|████████▊ | 14/16 [00:17<00:02,  1.23s/it]

cat feature shape after engineering (86471, 208)


cat_feature:  94%|█████████▍| 15/16 [00:18<00:01,  1.23s/it]

cat feature shape after engineering (86471, 208)


cat_feature: 100%|██████████| 16/16 [00:19<00:00,  1.22s/it]


cat feature shape after engineering (86471, 208)
cat feature shape after engineering (86471, 208)
cat feature shape after engineering (86469, 208)
Memory usage after optimization is: 575.86 MB
Decreased by 54.2%


num_feature: 100%|██████████| 16/16 [00:36<00:00,  2.26s/it]


num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86471, 1063)
num feature shape after engineering (86469, 1063)
Memory usage after optimization is: 6149.18 MB
Decreased by -41.5%
0 None rank_


: 

: 