In [1]:
import json
import numpy as np
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
from joblib import Parallel, delayed
import multiprocessing

def groupApplyParallel(dfGrouped, func_dict):
    """Add derived columns to every group of a groupby in parallel, then re-assemble the frame.

    Example:
        groupApplyParallel(df.groupby('abc'), {'add': lambda g: g['b'] + g['c']})

    :param dfGrouped: a pandas GroupBy object; it is iterated as ``(key, group_frame)`` pairs.
    :param func_dict: mapping ``{new_column_name: func}``. Each ``func`` receives the group
                      DataFrame (including columns added by earlier entries) and returns the
                      value(s) stored in ``new_column_name``.
    :return: concatenation of all processed groups; an empty DataFrame when there is no group.
    """
    def multi_agg(group, func_dict):
        # Work on a copy so the frame behind the groupby is never mutated.
        group = group.copy()
        for name, func in func_dict.items():
            group[name] = func(group)
        # The processed group must be returned, otherwise there is nothing to concatenate.
        return group
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(multi_agg)(group, func_dict) for name, group in dfGrouped)
    if not retLst:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(retLst)

def quant(n):
    """Build an aggregation function that returns the ``n``-quantile of its input.

    The returned callable is renamed to ``quantile_<n>`` so that several quantile
    aggregations can be told apart by their ``__name__``.

    :param n: quantile level forwarded to ``np.quantile`` (e.g. 0.5 for the median).
    :return: a one-argument function ``values -> np.quantile(values, n)``.
    """
    def quantile_(values):
        return np.quantile(values, n)

    label = 'quantile_%s' % n
    quantile_.__name__ = label
    return quantile_

def groupAggParallel(dfGrouped, func_dict):
    """Aggregate every group of a groupby in parallel into one row per group.

    Example:
        groupAggParallel(df.groupby('abc')['x'], {'mean': np.mean, 'count': len})

    :param dfGrouped: a pandas GroupBy object; it is iterated as ``(key, group)`` pairs.
    :param func_dict: mapping ``{result_column_name: func}``; each ``func`` receives the group
                      and returns one value.
    :return: DataFrame whose columns are the group key name(s) followed by the keys of
             ``func_dict``, one row per group.
    """
    def multi_agg(group, group_name, func_list):
        # Multi-key groupbys yield tuple keys; single keys are wrapped into a list.
        res = list(group_name) if type(group_name) in (list, tuple) else [group_name]
        for func in func_list:
            res.append(func(group))
        return res
    # Materialise the functions once: a dict view is not picklable by the standard pickler
    # and would otherwise be rebuilt for every task.
    func_list = list(func_dict.values())
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(multi_agg)(group, group_name, func_list) for group_name, group in dfGrouped)
    # `GroupBy.grouper` is deprecated in recent pandas; prefer `_grouper` when it exists and
    # fall back to the old attribute on older versions.
    grouper = getattr(dfGrouped, '_grouper', None)
    if grouper is None:
        grouper = dfGrouped.grouper
    key_names = list(grouper.result_index.names)
    retLst = pd.DataFrame(retLst, columns=key_names+list(func_dict.keys()))
    return retLst

from tqdm import tqdm as tqdm_notebook
from tqdm import tqdm
# Raise the pandas display limits to 1000 columns / 1000 rows for this session.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
import gc
import os

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt

from chinese_calendar import is_workday
import datetime
from functools import partial
from collections import defaultdict
import random
import math

# Fixed seed for Python's built-in `random` module; these two lines seed no other RNG.
RANDOM_SEED = 1
random.seed(RANDOM_SEED)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 辅助函数

In [2]:
import zipfile

def file2zip(zip_file_name: str, file_names: list):
    """Pack files from (possibly different) directories into one flat zip archive.

    Every file is stored under its base name only, i.e. the directory part of the
    source path is not reproduced inside the archive.

    :param zip_file_name:   target archive, e.g. /root/Document/test.zip
    :param file_names:      files to add, e.g. ['/root/user/doc/test.txt', ...]
    :return: None
    """
    # ZipFile accepts mode 'r', 'w', 'x' or 'a'; 'w' creates/truncates the archive.
    # ZIP_DEFLATED compresses the members, ZIP_STORED would only store them.
    archive = zipfile.ZipFile(zip_file_name, mode='w', compression=zipfile.ZIP_DEFLATED)
    with archive:
        for path in file_names:
            # arcname is the member name inside the archive (by default it would be the
            # source path without drive letter and leading separators).
            member_name = os.path.basename(path)
            archive.write(path, arcname=member_name)

# Predictions are first stored in a dict so they can be looked up by key.
def submit(df_test, zip_file_name='./submission/prediction.zip'):
    """Write the predictions in the sequence layout of ./input/pykt_test.csv and zip them.

    Reads "input/keyid2idx.json" and "./input/pykt_test.csv", writes
    "./submission/prediction.csv" and packs that file into ``zip_file_name``.

    :param df_test: frame with the columns 'uid', 'question', 'timestamp', 'predict' and
                    'response'; only rows with ``response == -1`` are used.
    :param zip_file_name: path of the zip archive to create.
    :return: None
    :raises KeyError: if a non-repeated ``response == -1`` entry of the raw test file has no
                      matching (uid, question, timestamp) row in ``df_test``, or if a question
                      index is missing from the id mapping.
    """
    # (uid, question, timestamp) -> predicted value, for the rows that must be predicted.
    predict_dict = {}
    cols = ['uid','question','timestamp','predict']
    for _, row in tqdm_notebook(df_test.loc[df_test['response']==-1,cols].iterrows()):
        predict_dict[(int(row[cols[0]]),int(row[cols[1]]),int(row[cols[2]]))] = row[cols[3]]


    with open("input/keyid2idx.json",'r') as f:
        keyid2idx = json.load(f)
    # Inverted mappings (index -> original id) for questions and concepts.
    keyidx2id = {
        "questions":dict(zip(keyid2idx['questions'].values(),keyid2idx['questions'].keys())),
        "concepts":dict(zip(keyid2idx['concepts'].values(),keyid2idx['concepts'].keys()))
    }
    df_test_raw = pd.read_csv(os.path.join("./input/pykt_test.csv"))

    # One comma-joined prediction string per row of the raw test file, in file order.
    predict_str_list = []
    num_test = 0
    for _, row in tqdm_notebook(df_test_raw.iterrows()):
        predict_results = []
        uid = int(row['uid'])
        # The four columns hold comma-separated sequences that are walked in parallel.
        for question, response, timestamp, is_repeat in zip(row['questions'].split(","), 
                                                 row['responses'].split(","), 
                                                 row['timestamps'].split(","), 
                                                 row['is_repeat'].split(",")):
            question, response,timestamp, is_repeat = int(question), int(response),int(timestamp),int(is_repeat)
            # NOTE(review): question_raw is never used below; the lookup only has the effect
            # of raising KeyError for an unknown question index.
            question_raw = int(keyidx2id['questions'][int(question)])
            if is_repeat!=0:#skip the repeat
                continue
            if response == -1:
                num_test += 1
                predict_results.append(predict_dict[(uid,question,timestamp)])
        predict_str = ",".join([str(x) for x in predict_results])
        predict_str_list.append(predict_str)
    print('num_test:',num_test)
    df_submit = pd.DataFrame({"responses":predict_str_list})
    df_submit.to_csv("./submission/prediction.csv",index=False)
    file2zip(zip_file_name=zip_file_name, file_names=['./submission/prediction.csv'])
    
from scipy.misc import derivative
#import numba as nb
#@nb.jit(nopython=True,parallel=True)
def focal_loss_lgb(y_pred, dtrain, alpha, gamma):
    """Focal-loss objective: numerical gradient and hessian w.r.t. the raw scores.

    The derivatives are 3-point central differences with step 1e-6, the same scheme
    ``scipy.misc.derivative(..., order=3)`` applied. They are computed inline with NumPy
    because ``scipy.misc.derivative`` is deprecated and absent from recent SciPy releases.

    :param y_pred: raw (pre-sigmoid) scores.
    :param dtrain: object exposing the true labels as ``dtrain.label``.
    :param alpha: weight of the positive class (negatives get ``1 - alpha``).
    :param gamma: focusing exponent applied to ``1 - p_t``.
    :return: tuple ``(grad, hess)`` with the first and second derivative of the loss.
    """
    a,g = alpha, gamma
    y_true = dtrain.label
    def fl(x,t):
        # Sigmoid turns the raw score into a probability of the positive class.
        p = 1/(1+np.exp(-x))
        return -( a*t + (1-a)*(1-t) ) * (( 1 - ( t*p + (1-t)*(1-p)) )**g) * ( t*np.log(p)+(1-t)*np.log(1-p) )
    dx = 1e-6
    loss_minus = fl(y_pred - dx, y_true)
    loss_mid = fl(y_pred, y_true)
    loss_plus = fl(y_pred + dx, y_true)
    # First derivative: (f(x+h) - f(x-h)) / 2h
    grad = (loss_plus - loss_minus) / (2 * dx)
    # Second derivative: (f(x+h) - 2 f(x) + f(x-h)) / h^2
    hess = (loss_plus - 2 * loss_mid + loss_minus) / (dx * dx)
    return grad, hess
#@nb.jit(nopython=True,parallel=True)
def focal_loss_lgb_eval_error(y_pred, dtrain, alpha, gamma):
    """Evaluation metric: mean focal loss of the raw scores against ``dtrain.label``.

    :param y_pred: raw (pre-sigmoid) scores.
    :param dtrain: object exposing the true labels as ``dtrain.label``.
    :param alpha: weight of the positive class (negatives get ``1 - alpha``).
    :param gamma: focusing exponent applied to ``1 - p_t``.
    :return: tuple ``('focal_loss', mean_loss, False)``.
    """
    labels = dtrain.label
    prob = 1/(1+np.exp(-y_pred))
    class_weight = alpha*labels + (1-alpha)*(1-labels)
    # Probability assigned to the true class of each sample.
    prob_true = labels*prob + (1-labels)*(1-prob)
    log_likelihood = labels*np.log(prob)+(1-labels)*np.log(1-prob)
    per_sample = -class_weight * ((1 - prob_true)**gamma) * log_likelihood
    return 'focal_loss', np.mean(per_sample), False
def focal_loss(x, y):
    """Focal-loss objective with alpha=0.25 and gamma=1 (delegates to focal_loss_lgb)."""
    return focal_loss_lgb(x, y, 0.25, 1.)

def eval_error(x, y):
    """Focal-loss metric with alpha=0.25 and gamma=1 (delegates to focal_loss_lgb_eval_error)."""
    return focal_loss_lgb_eval_error(x, y, 0.25, 1.)

def my_final_sub(y_test_np, output_name='./submission/sub_all.zip'):
    """Attach predictions to the global ``df_test``, post-process them and write the submission.

    Side effects: rebinds the global ``df_test`` (adds 'predict', 'predict_copy' and
    'split_cross_accrate', resets the index), reads the global ``df`` for a printed sanity
    check, and calls ``submit`` which writes the submission files.

    :param y_test_np: predictions, one per row of ``df_test`` in its current row order.
    :param output_name: path of the zip archive handed to ``submit``.
    :return: None
    """
    global df_test
    df_test['predict'] = np.array(y_test_np)
    # Keep the untouched model output; 'predict' is overwritten further down.
    df_test['predict_copy'] = df_test['predict']
    df_test = df_test.reset_index(drop=True)
    df_test_response = df_test[df_test['response']!=-1]
    # Sanity check: print the mean training response per question next to the mean prediction
    # for the same question, to spot logic problems in the predictions.
    tmp1 = df.groupby('question')['response'].mean()
    tmp2 = df_test[df_test['response']==-1].groupby('question')['predict'].mean()
    print(tmp1.reset_index().merge(tmp2, left_on='question', right_index=True,how='left').head(10))
    
    tmp = df_test.groupby(['uid','timestamp','question'])['response'].agg(['count','mean','max','min'])
    # Mean of the known responses per (uid, timestamp, question).
    tmp2 = df_test_response.groupby(['uid','timestamp','question'])['response'].agg('mean').rename('split_cross_accrate')
    # Drop the column left over from a previous call so the merge below does not collide.
    if 'split_cross_accrate' in df_test.columns:
        df_test.drop(columns=['split_cross_accrate'],inplace=True)
    # Keep only the groups that contain both a known response (max != -1) and a row to
    # predict (min == -1).
    tmp = tmp[(tmp['max']!=-1)&(tmp['min']==-1)].merge(tmp2, how='left', left_index=True, right_index=True)
    df_test = df_test.merge(tmp['split_cross_accrate'], how='left', left_on=['uid','timestamp','question'], right_index=True)
    # Use the group's known mean response instead of the model output, unless it is missing
    # or differs from the model output by more than 1.0.
    df_test['predict'] = df_test[['predict_copy','split_cross_accrate']].apply(lambda x:x['predict_copy'] if np.isnan(x['split_cross_accrate']) or abs(x['predict_copy']-x['split_cross_accrate'])>1.0 else x['split_cross_accrate'], axis=1)
    submit(df_test, zip_file_name=output_name)

## 数据处理

In [3]:
# Load the training and test frames from Feather files under ./input.
df = pd.read_feather('./input/df_train.feather')
df_test = pd.read_feather('./input/df_test.feather')

In [22]:
def get_rows_washed(df, df_test, verbose=0):
    """Remove suspicious rows from the train and test frames.

    Accuracy statistics are computed on all labelled rows (``response != -1``) of both
    frames together, then the same two filters are applied to each frame.

    :param df: training frame with the columns 'uid', 'timestamp', 'question', 'response', 'type'.
    :param df_test: test frame with the same columns.
    :param verbose: when truthy, print the frame shape before and after each filter.
    :return: tuple ``(df, df_test)`` of filtered frames with a fresh 0..n-1 index.
    """
    labelled = pd.concat([df[df['response']!=-1], df_test[df_test['response']!=-1]], axis=0)
    question_stats = labelled.groupby('question')['response'].agg('mean').rename('question_accrate')
    submission_stats = labelled.groupby(['uid','timestamp'])['response'].agg(['count','mean']).rename(columns={'count':'tmp_count','mean':'tmp_mean'})
    user_stats = labelled.groupby('uid')['response'].agg('mean').rename('uid_accrate')
    helper_columns = ['tmp_count','tmp_mean','question_accrate','uid_accrate']

    def wash(frame):
        if verbose: print(frame.shape)
        frame = frame.merge(question_stats, how='left', left_on='question', right_index=True)
        frame = frame.merge(submission_stats, how='left', left_on=['uid','timestamp'], right_index=True)
        frame = frame.merge(user_stats, how='left', left_on='uid', right_index=True)

        # Filter 1: many questions submitted at the same time with a very low accuracy,
        # which suggests the answers were given carelessly. Unlabelled rows are always kept.
        low_accuracy = ((frame['tmp_mean']<0.3)&(frame['uid_accrate']>0.7))|(frame['tmp_mean']==0.)
        careless_batch = low_accuracy & (frame['tmp_count']>=10)
        frame = frame[(~careless_batch)|(frame['response']==-1)]
        if verbose: print(frame.shape)

        # Filter 2: weak students answering hard questions correctly; such rows would
        # inflate the average accuracy of the question.
        weak_on_hard = ((frame['question_accrate']<0.7)&(frame['uid_accrate']<0.5))|((frame['question_accrate']<0.5)&(frame['uid_accrate']<0.7))
        unexpected_hit = (frame['type']==1) & weak_on_hard & (frame['response']==1)
        frame = frame[~unexpected_hit]
        if verbose: print(frame.shape)

        return frame.drop(columns=helper_columns).reset_index(drop=True)

    return wash(df), wash(df_test)
# df, df_test = get_rows_washed(df, df_test, verbose=0)

In [23]:
def preprocess(df):
    """Add calendar features derived from the millisecond 'timestamp' column (UTC+8).

    Adds the columns 'time_day' (days since the epoch), 'time_hour' (0-23),
    'time_is_workday' (1/0, or -1 when the calendar lookup fails), 'time_weekday'
    (Monday = 0) and 'time_year'. The frame is modified in place and also returned.

    :param df: frame with an integer 'timestamp' column in milliseconds since the epoch.
    :return: the same frame with the extra columns.
    """
    ms_per_hour = 3600 * 1000
    ms_per_day = 24 * ms_per_hour
    # Shift to UTC+8 and stay in integer arithmetic: the former float computation
    # (timestamp/86400000 + 8/24) rounded below the hour boundary, e.g. 00:00:00 UTC
    # was reported as hour 7 instead of 8.
    local_ms = df['timestamp'] + 8 * ms_per_hour
    df['time_day'] = (local_ms // ms_per_day).astype('int')
    df['time_hour'] = ((local_ms % ms_per_day) // ms_per_hour).astype('int')

    def _is_workday_cn(x):
        # Imports stay local to the function, as before — presumably so that it is
        # self-contained when shipped to the pandarallel workers (TODO confirm).
        from chinese_calendar import is_workday
        import datetime
        try:
            local = datetime.datetime.fromtimestamp(int(x)//1000, tz=datetime.timezone.utc).replace(tzinfo=None) + datetime.timedelta(hours=8)
            res = int(is_workday(local))
        except Exception:
            # Best effort: any failure of the conversion or the calendar lookup is encoded as -1.
            res = -1
        return res
    df['time_is_workday'] = df['timestamp'].parallel_apply(_is_workday_cn)

    # Weekday and year are vectorised; no per-row Python call is needed.
    local_dt = pd.to_datetime(local_ms, unit='ms')
    df['time_weekday'] = local_dt.dt.weekday.astype('int64')
    df['time_year'] = local_dt.dt.year.astype('int64')

    return df

# Add the time features to both frames, then trigger a garbage-collection pass.
df = preprocess(df)
df_test = preprocess(df_test)
gc.collect()

0

In [24]:
# Print the year distribution of both frames, then drop the old 2018 rows from the training set.
print(df['time_year'].value_counts(),df_test['time_year'].value_counts())
df = df[df['time_year']!=2018].reset_index(drop=True)
# Drop the columns that are no longer used.
df = df.drop(['content','kc','analysis','concepts_raw','concepts'],axis=1)
df_test = df_test.drop(['content','kc','analysis','concepts_raw','concepts'],axis=1)

2020    2565765
2021    1880948
2018         23
Name: time_year, dtype: int64 2020    635300
2021    467503
2001         7
Name: time_year, dtype: int64


In [25]:
def reduce_mem_usage(props, use_uint=False):
    """Downcast the numeric columns of ``props`` to smaller dtypes to save memory.

    Integer-valued columns become the smallest (u)int type that holds their range, other
    numeric columns become float32. Missing values are replaced by ``min - 1`` of the
    column before the integer test. Non-numeric columns (objects/strings, datetimes,
    categoricals, ...) are left untouched. The frame is modified in place.

    :param props: DataFrame to shrink.
    :param use_uint: when True, non-negative integer columns use unsigned dtypes.
    :return: tuple ``(props, NAlist)`` where ``NAlist`` names the columns that contained
             non-finite values.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        # Exclude strings and every other dtype that np.isfinite cannot handle
        if props[col].dtype == object or not pd.api.types.is_numeric_dtype(props[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        # make variables for Int, max and min
        IsInt = False
        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(props[col]).all(): 
            NAlist.append(col)
            # Assign back instead of a chained inplace fillna, which is not guaranteed to
            # write through to the frame.
            props[col] = props[col].fillna(mn-1)
            # The fill value is the new minimum; refresh it so the dtype choice below
            # (notably the unsigned branch) sees it.
            mn = props[col].min()

        # test if column can be converted to an integer; columns that still hold
        # non-finite values (inf, or all-NaN) stay floating point
        if np.isfinite(props[col]).all():
            asint = props[col].astype(np.int64)
            # Absolute differences: positive and negative fractions must not cancel out.
            result = (props[col] - asint).abs().sum()
            if result < 0.01:
                IsInt = True

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0 and use_uint:
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = props[col].astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = props[col].astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = props[col].astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = props[col].astype(np.int64)    

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    print('nan_list:', NAlist)
    return props, NAlist
# Downcast both frames. NOTE: `nalist` is overwritten by the second call, so only the
# list returned for df_test is kept.
df, nalist = reduce_mem_usage(df)
df_test, nalist = reduce_mem_usage(df_test)
gc.collect()

Memory usage of properties dataframe is : 1017.7719573974609  MB
******************************
Column:  uid
dtype before:  int64
dtype after:  int16
******************************
******************************
Column:  question
dtype before:  int64
dtype after:  int16
******************************
******************************
Column:  response
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  timestamp
dtype before:  int64
dtype after:  int64
******************************
******************************
Column:  type
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  concept_cnt
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  concept_hot_cnt
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  concept_1
dtype before:  int64
dtype after:  int16

0

## 特征工程

### 全局特征

In [26]:
# 将测试集的已有信息也放进来（特征工程和训练集）
def feature_engineering(df_tr, df_test, verbose=0):
    # 不同统计方式得到的特征，用于不同类型的模型
    columns = {
        'at':[], # 可以直接使用的特征
        'bs':[], # 统计这次提交以前的label
        'bt':[], # 统计今天以前的label
        'bh':[]  # 只使用一半的label进行统计
    }
    df_tr['data_type'] = 'tr'
    df_tr['raw_index'] = df_tr.index.values
    df_test['data_type'] = 'test'
    df_test['raw_index'] = df_test.index.values
    df = pd.concat([df_tr, df_test], axis=0).sort_values(['uid','timestamp','question']).reset_index(drop=True)
    
    # 在这之前已经提交了多少题目了
    tmp = df.groupby(['uid','timestamp'])['response'].count()
    tmp = (tmp.groupby('uid').cumsum()-tmp).rename('uid_record_cumsum')
    df = df.merge(tmp, how='left', left_on=['uid','timestamp'], right_index=True, suffixes=(None,'_y'))
    # 一共提交多少题目
    tmp = df.groupby(['uid'])['response'].count().rename('uid_record_sum')
    df = df.merge(tmp, how='left', left_on=['uid'], right_index=True, suffixes=(None,'_y'))
    columns['at'].extend(['uid_record_cumsum','uid_record_sum'])
    
    df_response = df[df['response']!=-1].reset_index(drop=True)
    if verbose: print('df shape:', df.shape)
    
    # 现存的特征，放入columns列表
    columns['at'].extend(['question','timestamp', 'type', 'concept_cnt', 'concept_hot_cnt', 'concept_1','concept_2', 'concept_3', 
                          'concept_4','concept_5', 'concept_6', 'content_cnt', 'kc_group_cnt', 'kc_cnt',
                          'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5', 'kc_6', 'kc_7', 'kc_8', 'analysis_cnt',
                         'time_day','time_hour','time_is_workday','time_weekday','time_year',])
    
    # concept
    df_tmp = pd.concat([
        df_response.loc[df_response['concept_1']!=-1,['uid','response','concept_1','time_day','time_hour','timestamp','type']].rename(columns={'concept_1':'concept'}),
        df_response.loc[df_response['concept_2']!=-1,['uid','response','concept_2','time_day','time_hour','timestamp','type']].rename(columns={'concept_2':'concept'}),
        df_response.loc[df_response['concept_3']!=-1,['uid','response','concept_3','time_day','time_hour','timestamp','type']].rename(columns={'concept_3':'concept'}),
        df_response.loc[df_response['concept_4']!=-1,['uid','response','concept_4','time_day','time_hour','timestamp','type']].rename(columns={'concept_4':'concept'}),
        df_response.loc[df_response['concept_5']!=-1,['uid','response','concept_5','time_day','time_hour','timestamp','type']].rename(columns={'concept_5':'concept'}),
        df_response.loc[df_response['concept_6']!=-1,['uid','response','concept_6','time_day','time_hour','timestamp','type']].rename(columns={'concept_6':'concept'}),
    ])
    col = 'concept'
    tmp = df_tmp.groupby(col)['response'].agg(['count','mean']).rename(columns={'count':col+'_showcnt', 'mean':col+'_accrate'})
    df=df.merge(tmp, how='left', left_on=col+'_1', right_index=True, suffixes=(None,'_1'))\
        .merge(tmp, how='left', left_on=col+'_2', right_index=True, suffixes=(None,'_2'))\
        .merge(tmp, how='left', left_on=col+'_3', right_index=True, suffixes=(None,'_3'))\
        .merge(tmp, how='left', left_on=col+'_4', right_index=True, suffixes=(None,'_4'))\
        .merge(tmp, how='left', left_on=col+'_5', right_index=True, suffixes=(None,'_5'))\
        .merge(tmp, how='left', left_on=col+'_6', right_index=True, suffixes=(None,'_6'))\
        .rename(columns={col+'_showcnt':col+'_showcnt'+'_1',col+'_accrate':col+'_accrate'+'_1'})\
        .fillna(-1.)
    col2 = 'type'  # {'count':len,'mean':np.mean}
    tmp = df_tmp.groupby([col,col2])['response'].agg(['count','mean']).rename(columns={'count':col+'_'+col2+'_showcnt', 'mean':col+'_'+col2+'_accrate'})
    df=df.merge(tmp, how='left', left_on=[col+'_1',col2], right_index=True, suffixes=(None,'_1'))\
        .merge(tmp, how='left', left_on=[col+'_2',col2], right_index=True, suffixes=(None,'_2'))\
        .merge(tmp, how='left', left_on=[col+'_3',col2], right_index=True, suffixes=(None,'_3'))\
        .merge(tmp, how='left', left_on=[col+'_4',col2], right_index=True, suffixes=(None,'_4'))\
        .merge(tmp, how='left', left_on=[col+'_5',col2], right_index=True, suffixes=(None,'_5'))\
        .merge(tmp, how='left', left_on=[col+'_6',col2], right_index=True, suffixes=(None,'_6'))\
        .rename(columns={col+'_'+col2+'_showcnt':col+'_'+col2+'_showcnt'+'_1',col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_1'})\
        .fillna(-1.)
    df[col+'_min_accrate'] = df[['concept_accrate_1','concept_accrate_2','concept_accrate_3','concept_accrate_4','concept_accrate_5','concept_accrate_6']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+'_mean_accrate'] = df[['concept_accrate_1','concept_accrate_2','concept_accrate_3','concept_accrate_4','concept_accrate_5','concept_accrate_6']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+'_max_accrate'] = df[['concept_accrate_1','concept_accrate_2','concept_accrate_3','concept_accrate_4','concept_accrate_5','concept_accrate_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    df[col+'_min_showcnt'] = df[['concept_showcnt_1','concept_showcnt_2','concept_showcnt_3','concept_showcnt_4','concept_showcnt_5','concept_showcnt_6']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+'_mean_showcnt'] = df[['concept_showcnt_1','concept_showcnt_2','concept_showcnt_3','concept_showcnt_4','concept_showcnt_5','concept_showcnt_6']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+'_max_showcnt'] = df[['concept_showcnt_1','concept_showcnt_2','concept_showcnt_3','concept_showcnt_4','concept_showcnt_5','concept_showcnt_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    columns['at'].extend([col+'_min_accrate',col+'_min_showcnt',col+'_mean_accrate',col+'_mean_showcnt',col+'_max_accrate',col+'_max_showcnt'])
    for i in range(1,7):
        columns['at'].extend([col+'_showcnt_'+str(i),col+'_accrate_'+str(i),col+'_'+col2+'_showcnt_'+str(i),col+'_'+col2+'_accrate_'+str(i)])
    df[col+col2+'_min_accrate'] = df[['concept_type_accrate_1','concept_type_accrate_2','concept_type_accrate_3','concept_type_accrate_4','concept_type_accrate_5','concept_type_accrate_6']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+col2+'_mean_accrate'] = df[['concept_type_accrate_1','concept_type_accrate_2','concept_type_accrate_3','concept_type_accrate_4','concept_type_accrate_5','concept_type_accrate_6']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+col2+'_max_accrate'] = df[['concept_type_accrate_1','concept_type_accrate_2','concept_type_accrate_3','concept_type_accrate_4','concept_type_accrate_5','concept_type_accrate_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    df[col+col2+'_min_showcnt'] = df[['concept_type_showcnt_1','concept_type_showcnt_2','concept_type_showcnt_3','concept_type_showcnt_4','concept_type_showcnt_5','concept_type_showcnt_6']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+col2+'_mean_showcnt'] = df[['concept_type_showcnt_1','concept_type_showcnt_2','concept_type_showcnt_3','concept_type_showcnt_4','concept_type_showcnt_5','concept_type_showcnt_6']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+col2+'_max_showcnt'] = df[['concept_type_showcnt_1','concept_type_showcnt_2','concept_type_showcnt_3','concept_type_showcnt_4','concept_type_showcnt_5','concept_type_showcnt_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    columns['at'].extend([col+col2+'_min_accrate',col+col2+'_min_showcnt',col+col2+'_mean_accrate',col+col2+'_mean_showcnt',col+col2+'_max_accrate',col+col2+'_max_showcnt'])
    
    # kc
    df_tmp = pd.concat([
        df_response.loc[df_response['kc_1']!=-1,['uid','response','kc_1','time_day','time_hour','timestamp','type']].rename(columns={'kc_1':'kc'}),
        df_response.loc[df_response['kc_2']!=-1,['uid','response','kc_2','time_day','time_hour','timestamp','type']].rename(columns={'kc_2':'kc'}),
        df_response.loc[df_response['kc_3']!=-1,['uid','response','kc_3','time_day','time_hour','timestamp','type']].rename(columns={'kc_3':'kc'}),
        df_response.loc[df_response['kc_4']!=-1,['uid','response','kc_4','time_day','time_hour','timestamp','type']].rename(columns={'kc_4':'kc'}),
        df_response.loc[df_response['kc_5']!=-1,['uid','response','kc_5','time_day','time_hour','timestamp','type']].rename(columns={'kc_5':'kc'}),
        df_response.loc[df_response['kc_6']!=-1,['uid','response','kc_6','time_day','time_hour','timestamp','type']].rename(columns={'kc_6':'kc'}),
        df_response.loc[df_response['kc_7']!=-1,['uid','response','kc_7','time_day','time_hour','timestamp','type']].rename(columns={'kc_7':'kc'}),
        df_response.loc[df_response['kc_8']!=-1,['uid','response','kc_8','time_day','time_hour','timestamp','type']].rename(columns={'kc_8':'kc'}),
    ])
    col = 'kc'
    tmp = df_tmp.groupby(col)['response'].agg(['count','mean']).rename(columns={'count':col+'_showcnt', 'mean':col+'_accrate'})
    df=df.merge(tmp, how='left', left_on=col+'_1', right_index=True, suffixes=(None,'_1'))\
        .merge(tmp, how='left', left_on=col+'_2', right_index=True, suffixes=(None,'_2'))\
        .merge(tmp, how='left', left_on=col+'_3', right_index=True, suffixes=(None,'_3'))\
        .merge(tmp, how='left', left_on=col+'_4', right_index=True, suffixes=(None,'_4'))\
        .merge(tmp, how='left', left_on=col+'_5', right_index=True, suffixes=(None,'_5'))\
        .merge(tmp, how='left', left_on=col+'_6', right_index=True, suffixes=(None,'_6'))\
        .merge(tmp, how='left', left_on=col+'_7', right_index=True, suffixes=(None,'_7'))\
        .merge(tmp, how='left', left_on=col+'_8', right_index=True, suffixes=(None,'_8'))\
        .rename(columns={col+'_showcnt':col+'_showcnt'+'_1',col+'_accrate':col+'_accrate'+'_1'}).fillna(-1)
    col2 = 'type'
    tmp = df_tmp.groupby([col,col2])['response'].agg(['count','mean']).rename(columns={'count':col+'_'+col2+'_showcnt', 'mean':col+'_'+col2+'_accrate'})
    df=df.merge(tmp, how='left', left_on=[col+'_1',col2], right_index=True, suffixes=(None,'_1'))\
        .merge(tmp, how='left', left_on=[col+'_2',col2], right_index=True, suffixes=(None,'_2'))\
        .merge(tmp, how='left', left_on=[col+'_3',col2], right_index=True, suffixes=(None,'_3'))\
        .merge(tmp, how='left', left_on=[col+'_4',col2], right_index=True, suffixes=(None,'_4'))\
        .merge(tmp, how='left', left_on=[col+'_5',col2], right_index=True, suffixes=(None,'_5'))\
        .merge(tmp, how='left', left_on=[col+'_6',col2], right_index=True, suffixes=(None,'_6'))\
        .merge(tmp, how='left', left_on=[col+'_7',col2], right_index=True, suffixes=(None,'_7'))\
        .merge(tmp, how='left', left_on=[col+'_8',col2], right_index=True, suffixes=(None,'_8'))\
        .rename(columns={col+'_'+col2+'_showcnt':col+'_'+col2+'_showcnt'+'_1',col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_1'}).fillna(-1)
    for i in range(1,8):
        columns['at'].extend([col+'_showcnt_'+str(i),col+'_accrate_'+str(i),col+'_'+col2+'_showcnt_'+str(i),col+'_'+col2+'_accrate_'+str(i)])
    df[col+'_min_accrate'] = df[['kc_accrate_1','kc_accrate_2','kc_accrate_3','kc_accrate_4','kc_accrate_5','kc_accrate_6','kc_accrate_7','kc_accrate_8']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+'_mean_accrate'] = df[['kc_accrate_1','kc_accrate_2','kc_accrate_3','kc_accrate_4','kc_accrate_5','kc_accrate_6','kc_accrate_7','kc_accrate_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+'_max_accrate'] = df[['kc_accrate_1','kc_accrate_2','kc_accrate_3','kc_accrate_4','kc_accrate_5','kc_accrate_6','kc_accrate_7','kc_accrate_8']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    df[col+'_min_showcnt'] = df[['kc_showcnt_1','kc_showcnt_2','kc_showcnt_3','kc_showcnt_4','kc_showcnt_5','kc_showcnt_6','kc_showcnt_7','kc_showcnt_8']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+'_mean_showcnt'] = df[['kc_showcnt_1','kc_showcnt_2','kc_showcnt_3','kc_showcnt_4','kc_showcnt_5','kc_showcnt_6','kc_showcnt_7','kc_showcnt_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+'_max_showcnt'] = df[['kc_showcnt_1','kc_showcnt_2','kc_showcnt_3','kc_showcnt_4','kc_showcnt_5','kc_showcnt_6','kc_showcnt_7','kc_showcnt_8']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    columns['at'].extend([col+'_min_accrate',col+'_min_showcnt',col+'_mean_accrate',col+'_mean_showcnt',col+'_max_accrate',col+'_max_showcnt'])
    df[col+col2+'_min_accrate'] = df[['kc_type_accrate_1','kc_type_accrate_2','kc_type_accrate_3','kc_type_accrate_4','kc_type_accrate_5','kc_type_accrate_6','kc_type_accrate_7','kc_type_accrate_8']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+col2+'_mean_accrate'] = df[['kc_type_accrate_1','kc_type_accrate_2','kc_type_accrate_3','kc_type_accrate_4','kc_type_accrate_5','kc_type_accrate_6','kc_type_accrate_7','kc_type_accrate_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+col2+'_max_accrate'] = df[['kc_type_accrate_1','kc_type_accrate_2','kc_type_accrate_3','kc_type_accrate_4','kc_type_accrate_5','kc_type_accrate_6','kc_type_accrate_7','kc_type_accrate_8']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    df[col+col2+'_min_showcnt'] = df[['kc_type_showcnt_1','kc_type_showcnt_2','kc_type_showcnt_3','kc_type_showcnt_4','kc_type_showcnt_5','kc_type_showcnt_6','kc_type_showcnt_7','kc_type_showcnt_8']].parallel_apply(lambda x: min([xx if xx!=-1 else 99999999 for xx in x]), axis=1)
    df[col+col2+'_mean_showcnt'] = df[['kc_type_showcnt_1','kc_type_showcnt_2','kc_type_showcnt_3','kc_type_showcnt_4','kc_type_showcnt_5','kc_type_showcnt_6','kc_type_showcnt_7','kc_type_showcnt_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
    df[col+col2+'_max_showcnt'] = df[['kc_type_showcnt_1','kc_type_showcnt_2','kc_type_showcnt_3','kc_type_showcnt_4','kc_type_showcnt_5','kc_type_showcnt_6','kc_type_showcnt_7','kc_type_showcnt_8']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
    columns['at'].extend([col+col2+'_min_accrate',col+col2+'_min_showcnt',col+col2+'_mean_accrate',col+col2+'_mean_showcnt',col+col2+'_max_accrate',col+col2+'_max_showcnt'])
    if verbose: print('df shape:', df.shape)
    
    # question
    col = 'question'
    tmp = df.groupby(col)['response'].agg('count').rename(col+'_showcnt')
    df=df.merge(tmp, how='left', left_on=col, right_index=True, suffixes=(None,'_1'))
    tmp = df_response.groupby(col)['response'].agg('mean').rename(col+'_accrate')
    df=df.merge(tmp, how='left', left_on=col, right_index=True, suffixes=(None,'_1'))
    columns['at'].extend([col+'_showcnt',col+'_accrate'])
    
    # type
    cols = ['uid','type','time_hour','time_is_workday','time_weekday']
    for col in cols:
        tmp = df_response.groupby(col)['response'].agg(['count','mean']).rename(columns={'count':col+'_showcnt', 'mean':col+'_accrate'})
        df=df.merge(tmp, how='left', left_on=col, right_index=True, suffixes=(None,'_1'))
        columns['at'].extend([col+'_showcnt',col+'_accrate'])
    for col in ['type','time_is_workday']:
        for cc in ['showcnt','accrate']:
            df[f'{col}_1_{cc}'] = df[f'{col}_{cc}']*df[col]
            df[f'{col}_0_{cc}'] = df[f'{col}_{cc}']*(1-df[col])
    if verbose: print('df shape:', df.shape)
    
    # 同时提交的question：正确率的分布与平时不同，可考虑：
    # 1. 剔除后再进行全局统计（需注释掉原question统计）
    # 2. 用其作为一个特征
    col, col2 = 'timestamp', 'question'
    tmp = df.groupby([col,col2])['uid'].agg('nunique').rename(col+'_'+col2+'_cnt')
    df=df.merge(tmp, how='left', left_on=[col,col2], right_index=True, suffixes=(None,'_1'))
    #tmp1 = df_response.groupby([col,col2])['response'].agg('mean').rename(col+'_'+col2+'_accrate')
    #tmp = tmp[tmp>=10].reset_index().merge(tmp1[tmp1<0.0001], how='left', left_on=['timestamp','question'], right_index=True)[[col,col2,col+'_'+col2+'_accrate']]
    #df=df.merge(tmp, how='left', left_on=[col,col2], right_on=[col,col2], suffixes=(None,'_1'))
    columns['at'].extend([col+'_'+col2+'_cnt']) # , col+'_'+col2+'_accrate'
    #tmp = df[df[col+'_'+col2+'_cnt']<=2].groupby('question')['response'].agg(['count','mean']).rename(columns={'count':'question_showcnt_without_'+col+'_'+col2, 'mean':'question_accrate_without_'+col+'_'+col2})
    #df=df.merge(tmp, how='left', left_on=[col2], right_index=True, suffixes=(None,'_1'))
    #columns['at'].extend(['question_accrate_without_'+col+'_'+col2, 'question_showcnt_without_'+col+'_'+col2])
    if verbose: print('df shape:', df.shape)
    
    # """
    # 题目迄今做过次数
    # 短期内题目做的次数
    # 短期内提交次数 / 长期内提交次数
    def tmp_func(df, timelimit=3600000, print_timedistance=None, col='question', only_once=False, timestamp_once=False):
        """Annotate one user's submission log with sliding-window activity counters.

        Intended to be applied to a single ``uid`` group. Rows are consumed in the
        order given and must already be sorted by ``timestamp``: ascending for the
        ``'asc'`` direction, descending for ``'desc'``. Timestamps and ``timelimit``
        are treated as milliseconds (the window length is divided by 1000 to build
        the seconds-based column names).

        Args:
            df: frame with a ``'timestamp'`` column and a ``col`` column. It is
                mutated in place (new columns are added) and also returned.
            timelimit: window length in milliseconds.
            print_timedistance: ``'asc'``, ``'desc'`` or ``None`` (treated as
                ``'asc'``). Any other value disables window eviction, so the
                counters become cumulative.
            col: column whose values are counted inside the window.
            only_once: together with ``timestamp_once``, also write the columns
                that do not depend on ``timelimit``.
            timestamp_once: also write the per-window submission counters.

        Columns written (``N = int(timelimit / 1000)``, ``D`` = ``_asc``/``_desc``):
            ``uid_{col}_utilnow_{N}{D}``: rows inside the window, including the
                current one, that share the current row's ``col`` value.
            ``uid_question_submittimes_{N}{D}``: rows inside the window, sampled
                once per batch of equal timestamps (after the batch's last row).
            ``uid_unique_submittimes_{N}{D}``: distinct-timestamp batches inside
                the window.
            ``uid_timedistance_to_last_submit{D}``: whole seconds (truncated toward
                zero) from the row's batch to the next batch in iteration order;
                0 for the final batch. With ascending input this is the time until
                the following submission, with descending input it is negative.
            ``uid_timestamp_question_submit``: rows sharing the row's timestamp.
            ``uid_continue_learning_mins``: minutes accumulated since the last
                pause longer than three hours.

        Returns:
            The same ``df`` object with the columns above added. An empty frame
            is returned with empty columns instead of raising.
        """
        suffix = '_asc' if print_timedistance is None else '_' + print_timedistance
        window_secs = int(timelimit / 1000)  # only used to build column names
        session_gap_ms = 10800000  # 3 hours: a longer pause restarts the learning session
        tt, qq = list(df['timestamp']), list(df[col])

        times_dct = defaultdict(int)   # `col` value -> occurrences currently inside the window
        unique_submittimes = 0         # distinct-timestamp batches inside the window
        question_submittimes = 0       # rows inside the window
        timestamp_question_submit = 0  # rows seen so far in the current batch
        last_question_ti = -1          # index of the first row of the current batch
        times_continue_learning = 0    # minutes of uninterrupted learning
        times_continue_learning_res = []
        times_until_now_res = []
        unique_submittimes_res = []
        question_submittimes_res = []
        distance_to_last_timestamp_res = []
        timestamp_question_submit_res = []
        s = -1  # index of the oldest row still inside the window
        for e, (t, q) in enumerate(zip(tt, qq)):
            if s == -1:
                # First row: initialise every counter.
                times_continue_learning = 0
                s = 0
                unique_submittimes = 1
                times_dct[q] += 1
                question_submittimes = 1
                last_question_ti = 0
                timestamp_question_submit = 1
            else:
                if tt[last_question_ti] != t:
                    # A new batch starts: flush the per-batch values for every row
                    # of the batch that just ended.
                    batch_len = e - last_question_ti
                    question_submittimes_res.extend([question_submittimes] * batch_len)
                    distance_to_last_timestamp_res.extend([int((t - tt[last_question_ti]) / 1000)] * batch_len)
                    timestamp_question_submit_res.extend([timestamp_question_submit] * batch_len)
                    timestamp_question_submit = 1
                    last_question_ti = e
                else:
                    timestamp_question_submit += 1
                # Evict rows that fell out of the window. `s` never passes `e`
                # because the distance from a row to itself is 0, so tt[s+1] is safe.
                while (t - tt[s] > timelimit and suffix == '_asc') or (tt[s] - t > timelimit and suffix == '_desc'):
                    question_submittimes -= 1
                    if tt[s] != tt[s + 1]:
                        # Row s was the last row of its batch.
                        unique_submittimes -= 1
                    times_dct[qq[s]] -= 1
                    s += 1
                if t != tt[e - 1]:
                    unique_submittimes += 1
                times_dct[q] += 1
                question_submittimes += 1
                if t - tt[e - 1] <= session_gap_ms:
                    times_continue_learning += int((t - tt[e - 1]) / 60000)
                else:
                    times_continue_learning = 0
            times_continue_learning_res.append(times_continue_learning)
            times_until_now_res.append(times_dct[q])
            unique_submittimes_res.append(unique_submittimes)

        if tt:
            # Flush the final batch. Skipped for an empty frame, where there is no
            # last timestamp and nothing to pad.
            batch_len = len(tt) - last_question_ti
            question_submittimes_res.extend([question_submittimes] * batch_len)
            distance_to_last_timestamp_res.extend([int((tt[-1] - tt[last_question_ti]) / 1000)] * batch_len)
            timestamp_question_submit_res.extend([timestamp_question_submit] * batch_len)

        df[f'uid_{col}_utilnow_{window_secs}{suffix}'] = times_until_now_res
        if timestamp_once:
            df[f'uid_question_submittimes_{window_secs}{suffix}'] = question_submittimes_res
            df[f'uid_unique_submittimes_{window_secs}{suffix}'] = unique_submittimes_res
            if only_once:
                df[f'uid_timedistance_to_last_submit{suffix}'] = distance_to_last_timestamp_res
                if suffix == '_asc':
                    df['uid_timestamp_question_submit'] = timestamp_question_submit_res
                    df['uid_continue_learning_mins'] = times_continue_learning_res
        return df

    
    col = 'question'
    timelimit = 3600*1000
    df_tmp_groupby = df[['uid','timestamp',col]].groupby(['uid'])
    tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col, only_once=True, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True)[cols]
    
    df['timediff1'] = (df['timestamp'].diff(1)).fillna(0).astype('int32')
    df['timediff-1'] = (df['timestamp'].diff(-1)).fillna(0).astype('int32')
    tmp_uid_diff = (df['uid'].diff(-1)).fillna(-1).astype('int32')
    df.loc[tmp_uid_diff!=0,'timediff-1'] = 0
    df = df.merge(df[df['timediff-1']<-10800000].groupby('uid')['uid_continue_learning_mins'].mean().rename('uid_avg_continue_learning_mins'), left_on='uid', right_index=True,how='left')
    df['uid_continue_learning_mins_level'] = df['uid_continue_learning_mins']/df['uid_avg_continue_learning_mins']
    if verbose: print('df shape:', df.shape)
    
    timelimit = 6*3600*1000
    tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True)[cols]
    timelimit = 3*24*3600*1000
    tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True)[cols]
    timelimit = 14*24*3600*1000
    tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True)[cols]
    timelimit = 365*24*3600*1000
    tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True)[cols]
    timelimit = 3600*1000
    tmp=df[['uid','timestamp',col]].sort_values(by=['uid','timestamp','question'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col, only_once=True, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True).sort_values(by=['uid','timestamp','question'], ascending=True).reset_index(drop=True)[cols]
    timelimit = 6*3600*1000
    tmp=df[['uid','timestamp',col]].sort_values(by=['uid','timestamp','question'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col, only_once=True, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True).sort_values(by=['uid','timestamp','question'], ascending=True).reset_index(drop=True)[cols]
    timelimit = 3*3600*1000
    tmp=df[['uid','timestamp',col]].sort_values(by=['uid','timestamp','question'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col, only_once=True, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True).sort_values(by=['uid','timestamp','question'], ascending=True).reset_index(drop=True)[cols]
    timelimit = 14*24*3600*1000
    tmp=df[['uid','timestamp',col]].sort_values(by=['uid','timestamp','question'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True).sort_values(by=['uid','timestamp','question'], ascending=True).reset_index(drop=True)[cols]
    timelimit = 365*3600*1000
    tmp=df[['uid','timestamp',col]].sort_values(by=['uid','timestamp','question'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col, only_once=True, timestamp_once=True))
    cols = [x for x in tmp.columns if x not in ['uid','timestamp',col]]
    df[cols] = tmp.reset_index(drop=True).sort_values(by=['uid','timestamp','question'], ascending=True).reset_index(drop=True)[cols]

    for col in ['concept']:  # 'kc'
        if col == 'kc':
            df_tmp = pd.concat([
                df.loc[df['kc_1']!=-1,['uid','kc_1','timestamp','question']].rename(columns={'kc_1':'kc'}),
                df.loc[df['kc_2']!=-1,['uid','kc_2','timestamp','question']].rename(columns={'kc_2':'kc'}),
                df.loc[df['kc_3']!=-1,['uid','kc_3','timestamp','question']].rename(columns={'kc_3':'kc'}),
                df.loc[df['kc_4']!=-1,['uid','kc_4','timestamp','question']].rename(columns={'kc_4':'kc'}),
                df.loc[df['kc_5']!=-1,['uid','kc_5','timestamp','question']].rename(columns={'kc_5':'kc'}),
                df.loc[df['kc_6']!=-1,['uid','kc_6','timestamp','question']].rename(columns={'kc_6':'kc'}),
                df.loc[df['kc_7']!=-1,['uid','kc_7','timestamp','question']].rename(columns={'kc_7':'kc'}),
                df.loc[df['kc_8']!=-1,['uid','kc_8','timestamp','question']].rename(columns={'kc_8':'kc'}),
            ])
        elif col == 'concept':
            df_tmp = pd.concat([
                df.loc[df['concept_1']!=-1,['uid','concept_1','timestamp','question']].rename(columns={'concept_1':'concept'}),
                df.loc[df['concept_2']!=-1,['uid','concept_2','timestamp','question']].rename(columns={'concept_2':'concept'}),
                df.loc[df['concept_3']!=-1,['uid','concept_3','timestamp','question']].rename(columns={'concept_3':'concept'}),
                df.loc[df['concept_4']!=-1,['uid','concept_4','timestamp','question']].rename(columns={'concept_4':'concept'}),
                df.loc[df['concept_5']!=-1,['uid','concept_5','timestamp','question']].rename(columns={'concept_5':'concept'}),
                df.loc[df['concept_6']!=-1,['uid','concept_6','timestamp','question']].rename(columns={'concept_6':'concept'}),
            ])
        timelimit = 3600*1000
        df_tmp_groupby = df_tmp[['uid','timestamp', 'question',col]].groupby(['uid'])
        tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 6*3600*1000
        tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 3*24*3600*1000
        tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 14*24*3600*1000
        tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 365*24*3600*1000
        tmp=df_tmp_groupby.parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='asc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 3600*1000
        tmp=df_tmp.sort_values(by=['uid','timestamp'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 6*3600*1000
        tmp=df_tmp.sort_values(by=['uid','timestamp'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 3*24*3600*1000
        tmp=df_tmp.sort_values(by=['uid','timestamp'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 14*24*3600*1000
        tmp=df_tmp.sort_values(by=['uid','timestamp'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        timelimit = 365*24*3600*1000
        tmp=df_tmp.sort_values(by=['uid','timestamp'], ascending=False).groupby(['uid']).parallel_apply(partial(tmp_func, timelimit=timelimit, print_timedistance='desc', col=col))
        tmp = tmp.reset_index(drop=True)[[x for x in tmp.columns if x != col]].groupby(['uid','timestamp','question']).max()
        df = df.merge(tmp, left_on=['uid','timestamp','question'], right_index=True)
        
        range_n = 8 if col == 'kc' else 6
        df_tmp2 = df[['uid','question','kc_1','kc_2','kc_3','kc_4','kc_5','kc_6','kc_7','kc_8','concept_1','concept_2','concept_3','concept_4','concept_5','concept_6']].copy()
        for i in range(1, range_n+1):
            df_tmp2 = df_tmp2.merge(df_tmp.groupby([col])['timestamp'].min().rename(f'new_{col}_{str(i)}_days'), how='left', left_on=f'{col}_{str(i)}', right_index=True)
            df_tmp2 = df_tmp2.merge(df_tmp.groupby(['uid',col])['timestamp'].min().rename(f'uid_new_{col}_{str(i)}_days'), how='left', left_on=['uid',f'{col}_{str(i)}'], right_index=True)
        df[f'new_{col}_days_min'] = df_tmp2[[f'new_{col}_{str(i)}_days' for i in range(1, range_n+1)]].min(axis=1)
        df[f'new_{col}_days_min'] = df[f'new_{col}_days_min']/1000/3600/24
        df[f'uid_new_{col}_days_min'] = df_tmp2[[f'uid_new_{col}_{str(i)}_days' for i in range(1, range_n+1)]].min(axis=1)
        df[f'uid_new_{col}_days_min'] = df[f'uid_new_{col}_days_min']/1000/3600/24
        columns['at'].extend([f'new_{col}_days_min'])
        columns['at'].extend([f'uid_new_{col}_days_min'])
    
    if verbose: print('df shape:', df.shape)
    df[f'uid_timestamp_question_meansubmit'] = df[f'uid_timestamp_question_submit'] / (df[f'uid_timedistance_to_last_submit_asc']+0.00001)
    
    columns['at'].extend(['uid_concept_utilnow_21600_asc', 'uid_concept_utilnow_3600_asc', 'uid_concept_utilnow_94608000_asc', 'uid_concept_utilnow_94608000_desc', 'uid_kc_utilnow_21600_asc', 'uid_kc_utilnow_3600_asc', 'uid_kc_utilnow_94608000_asc', 'uid_kc_utilnow_94608000_desc', 'uid_question_submittimes_21600_asc', 'uid_question_submittimes_3600_asc', 'uid_question_submittimes_94608000_asc', 'uid_question_submittimes_94608000_desc', 'uid_question_utilnow_21600_asc', 'uid_question_utilnow_3600_asc', 'uid_question_utilnow_94608000_asc', 'uid_question_utilnow_94608000_desc', 'uid_timedistance_to_last_submit_asc', 'uid_timedistance_to_last_submit_desc', 'uid_timestamp_question_meansubmit', 'uid_timestamp_question_submit', 'uid_unique_submittimes_21600_asc', 'uid_unique_submittimes_3600_asc', 'uid_unique_submittimes_94608000_asc', 'uid_unique_submittimes_94608000_desc'])
    # """
    
    col = 'uid'
    for col2 in ['timestamp']:
        tmp = df.groupby([col,col2])['response'].agg(['count']).rename(columns={'count':col+'_'+col2+'_showcnt'})
        df=df.merge(tmp, how='left', left_on=[col,col2], right_index=True, suffixes=(None,'_1'))
        columns['at'].extend([col+'_'+col2+'_showcnt'])
        
    df = df.merge(df.groupby(['question'])['timestamp'].min().rename('new_question_days'), how='left', left_on='question', right_index=True)
    df[f'new_question_days'] = df[f'new_question_days']/1000/3600/24
    columns['at'].extend(['new_question_days'])
    df = df.merge(df.groupby(['uid','question'])['timestamp'].min().rename('uid_new_question_days'), how='left', left_on=['uid','question'], right_index=True)
    df[f'uid_new_question_days'] = df[f'uid_new_question_days']/1000/3600/24
    columns['at'].extend(['uid_new_question_days'])
    
    df.fillna(-1, inplace=True)
    if verbose: print('df shape:', df.shape)
    
    df_tr = df[df['data_type']=='tr'].reset_index(drop=True)
    df_tr.set_index(df_tr['raw_index'], inplace=True)
    df_test = df[df['data_type']=='test'].reset_index(drop=True)
    df_test.set_index(df_test['raw_index'], inplace=True)
    
    return df_tr, df_test, columns

# Disabled smoke test kept as a bare string literal: it would run feature_engineering
# on the first 1000 rows of the train and test frames. Because the literal is the
# last expression of the cell, its repr is echoed as the cell output.
"""
df_tmp,df_tmp2, columns_tmp = feature_engineering(df.iloc[:1000].copy()[['uid', 'question', 'response', 'timestamp', 'type', 'concept_cnt',
       'concept_hot_cnt', 'concept_1', 'concept_2', 'concept_3',
       'concept_4', 'concept_5', 'concept_6', 'content_cnt',
       'kc_group_cnt', 'kc_cnt', 'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5',
       'kc_6', 'kc_7', 'kc_8', 'analysis_cnt', 'time_day', 'time_hour',
       'time_is_workday', 'time_weekday', 'time_year']], df_test.iloc[:1000].copy()[['uid', 'question', 'response', 'timestamp', 'type', 'concept_cnt',
       'concept_hot_cnt', 'concept_1', 'concept_2', 'concept_3',
       'concept_4', 'concept_5', 'concept_6', 'content_cnt',
       'kc_group_cnt', 'kc_cnt', 'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5',
       'kc_6', 'kc_7', 'kc_8', 'analysis_cnt', 'time_day', 'time_hour',
       'time_is_workday', 'time_weekday', 'time_year']])
print(columns_tmp)
df_tmp
"""

"\ndf_tmp,df_tmp2, columns_tmp = feature_engineering(df.iloc[:1000].copy()[['uid', 'question', 'response', 'timestamp', 'type', 'concept_cnt',\n       'concept_hot_cnt', 'concept_1', 'concept_2', 'concept_3',\n       'concept_4', 'concept_5', 'concept_6', 'content_cnt',\n       'kc_group_cnt', 'kc_cnt', 'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5',\n       'kc_6', 'kc_7', 'kc_8', 'analysis_cnt', 'time_day', 'time_hour',\n       'time_is_workday', 'time_weekday', 'time_year']], df_test.iloc[:1000].copy()[['uid', 'question', 'response', 'timestamp', 'type', 'concept_cnt',\n       'concept_hot_cnt', 'concept_1', 'concept_2', 'concept_3',\n       'concept_4', 'concept_5', 'concept_6', 'content_cnt',\n       'kc_group_cnt', 'kc_cnt', 'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5',\n       'kc_6', 'kc_7', 'kc_8', 'analysis_cnt', 'time_day', 'time_hour',\n       'time_is_workday', 'time_weekday', 'time_year']])\nprint(columns_tmp)\ndf_tmp\n"

In [27]:
# Build the engineered train / test frames plus the feature-name registry.
# NOTE: this rebinds df and df_test to the function's outputs, so re-running the
# cell would feed already-engineered frames back into feature_engineering.
df, df_test, columns = feature_engineering(df, df_test, verbose=1)
# Reclaim memory held by intermediates created during feature engineering.
gc.collect()

df shape: (5549523, 34)
df shape: (5549523, 114)
df shape: (5549523, 134)
df shape: (5549523, 135)


  iterator = iter(dataframe_groupby)


df shape: (5549523, 145)


  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)
  iterator = iter(dataframe_groupby)


df shape: (5549523, 185)
df shape: (5549523, 189)


0

In [28]:
# Shrink both frames with reduce_mem_usage (defined outside this section); per the
# logged output it downcasts column dtypes (e.g. float64 -> float32, int64 -> int32).
# nalist from the first call is overwritten by the second call.
df, nalist = reduce_mem_usage(df)
df_test, nalist = reduce_mem_usage(df_test)
gc.collect()

Memory usage of properties dataframe is : 5631.670822143555  MB
******************************
Column:  uid
dtype before:  int16
dtype after:  int16
******************************
******************************
Column:  question
dtype before:  int16
dtype after:  int16
******************************
******************************
Column:  response
dtype before:  int8
dtype after:  int8
******************************
******************************
Column:  timestamp
dtype before:  int64
dtype after:  int64
******************************
******************************
Column:  type
dtype before:  int8
dtype after:  int8
******************************
******************************
Column:  concept_cnt
dtype before:  int8
dtype after:  int8
******************************
******************************
Column:  concept_hot_cnt
dtype before:  int8
dtype after:  int8
******************************
******************************
Column:  concept_1
dtype before:  int16
dtype after:  int16
****

dtype after:  float32
******************************
******************************
Column:  concepttype_mean_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_max_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_min_showcnt
dtype before:  float64
dtype after:  int32
******************************
******************************
Column:  concepttype_mean_showcnt
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_max_showcnt
dtype before:  float64
dtype after:  int32
******************************
******************************
Column:  kc_showcnt_1
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  kc_accrate_1
dtype before:  float64
dtype after:  float32
**************************

dtype after:  int32
******************************
******************************
Column:  type_1_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  type_0_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  time_is_workday_1_showcnt
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  time_is_workday_0_showcnt
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  time_is_workday_1_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  time_is_workday_0_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  timestamp_question_cnt
dtype before:  int64
dtype after:  int16
***********************

dtype after:  int8
******************************
******************************
Column:  new_question_days
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  uid_new_question_days
dtype before:  float64
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  2421.4488248825073  MB
This is  42.99698795180723 % of the initial size
nan_list: []
Memory usage of properties dataframe is : 1396.6862487792969  MB
******************************
Column:  uid
dtype before:  int16
dtype after:  int16
******************************
******************************
Column:  question
dtype before:  int16
dtype after:  int16
******************************
******************************
Column:  response
dtype before:  int8
dtype after:  int8
******************************
******************************
Column:  timestamp
dtype before:  int64
dtype after:  int64
**************************

dtype after:  int32
******************************
******************************
Column:  concept_mean_showcnt
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concept_max_showcnt
dtype before:  float64
dtype after:  int32
******************************
******************************
Column:  concepttype_min_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_mean_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_max_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  concepttype_min_showcnt
dtype before:  float64
dtype after:  int32
******************************
******************************
Column:  concepttype_mean_showcnt
dtype before:  float64
dtype after:  float32
*********

dtype after:  int32
******************************
******************************
Column:  time_weekday_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  type_1_showcnt
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  type_0_showcnt
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  type_1_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  type_0_accrate
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  time_is_workday_1_showcnt
dtype before:  int64
dtype after:  int32
******************************
******************************
Column:  time_is_workday_0_showcnt
dtype before:  int64
dtype after:  int32
******************************
********************

dtype after:  float32
******************************
******************************
Column:  uid_new_concept_days_min
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  uid_timestamp_question_meansubmit
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  uid_timestamp_showcnt
dtype before:  int64
dtype after:  int8
******************************
******************************
Column:  new_question_days
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  uid_new_question_days
dtype before:  float64
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  600.5330181121826  MB
This is  42.99698795180723 % of the initial size
nan_list: []


0

### 临时保存

In [29]:
# Checkpoint the engineered frames so later sessions can reload them instead of
# recomputing features. The index is reset to a default RangeIndex before writing.
df.reset_index(drop=True).to_feather('./input/df_features_v1214_3.feather')
df_test.reset_index(drop=True).to_feather('./input/df_test_features_v1214_3.feather')

In [3]:
#df = pd.read_feather('./input/df_features_v1214_3.feather')
#df_test = pd.read_feather('./input/df_test_features_v1214_3.feather')
#columns = {'at': ['uid_record_cumsum', 'uid_record_sum', 'question', 'timestamp', 'type', 'concept_cnt', 'concept_hot_cnt', 'concept_1', 'concept_2', 'concept_3', 'concept_4', 'concept_5', 'concept_6', 'content_cnt', 'kc_group_cnt', 'kc_cnt', 'kc_1', 'kc_2', 'kc_3', 'kc_4', 'kc_5', 'kc_6', 'kc_7', 'kc_8', 'analysis_cnt', 'time_day', 'time_hour', 'time_is_workday', 'time_weekday', 'time_year', 'concept_min_accrate', 'concept_min_showcnt', 'concept_mean_accrate', 'concept_mean_showcnt', 'concept_max_accrate', 'concept_max_showcnt', 'concept_showcnt_1', 'concept_accrate_1', 'concept_type_showcnt_1', 'concept_type_accrate_1', 'concept_showcnt_2', 'concept_accrate_2', 'concept_type_showcnt_2', 'concept_type_accrate_2', 'concept_showcnt_3', 'concept_accrate_3', 'concept_type_showcnt_3', 'concept_type_accrate_3', 'concept_showcnt_4', 'concept_accrate_4', 'concept_type_showcnt_4', 'concept_type_accrate_4', 'concept_showcnt_5', 'concept_accrate_5', 'concept_type_showcnt_5', 'concept_type_accrate_5', 'concept_showcnt_6', 'concept_accrate_6', 'concept_type_showcnt_6', 'concept_type_accrate_6', 'concepttype_min_accrate', 'concepttype_min_showcnt', 'concepttype_mean_accrate', 'concepttype_mean_showcnt', 'concepttype_max_accrate', 'concepttype_max_showcnt', 'kc_showcnt_1', 'kc_accrate_1', 'kc_type_showcnt_1', 'kc_type_accrate_1', 'kc_showcnt_2', 'kc_accrate_2', 'kc_type_showcnt_2', 'kc_type_accrate_2', 'kc_showcnt_3', 'kc_accrate_3', 'kc_type_showcnt_3', 'kc_type_accrate_3', 'kc_showcnt_4', 'kc_accrate_4', 'kc_type_showcnt_4', 'kc_type_accrate_4', 'kc_showcnt_5', 'kc_accrate_5', 'kc_type_showcnt_5', 'kc_type_accrate_5', 'kc_showcnt_6', 'kc_accrate_6', 'kc_type_showcnt_6', 'kc_type_accrate_6', 'kc_showcnt_7', 'kc_accrate_7', 'kc_type_showcnt_7', 'kc_type_accrate_7', 'kc_min_accrate', 'kc_min_showcnt', 'kc_mean_accrate', 'kc_mean_showcnt', 'kc_max_accrate', 'kc_max_showcnt', 'kctype_min_accrate', 'kctype_min_showcnt', 'kctype_mean_accrate', 'kctype_mean_showcnt', 
'kctype_max_accrate', 'kctype_max_showcnt', 'question_showcnt', 'question_accrate', 'uid_showcnt', 'uid_accrate', 'type_showcnt', 'type_accrate', 'time_hour_showcnt', 'time_hour_accrate', 'time_is_workday_showcnt', 'time_is_workday_accrate', 'time_weekday_showcnt', 'time_weekday_accrate', 'timestamp_question_cnt', 'timestamp_question_accrate', 'question_accrate_without_timestamp_question', 'question_showcnt_without_timestamp_question', 'uid_concept_utilnow_21600_asc', 'uid_concept_utilnow_3600_asc', 'uid_concept_utilnow_94608000_asc', 'uid_concept_utilnow_94608000_desc', 'uid_kc_utilnow_21600_asc', 'uid_kc_utilnow_3600_asc', 'uid_kc_utilnow_94608000_asc', 'uid_kc_utilnow_94608000_desc', 'uid_question_submittimes_21600_asc', 'uid_question_submittimes_3600_asc', 'uid_question_submittimes_94608000_asc', 'uid_question_submittimes_94608000_desc', 'uid_question_utilnow_21600_asc', 'uid_question_utilnow_3600_asc', 'uid_question_utilnow_94608000_asc', 'uid_question_utilnow_94608000_desc', 'uid_timedistance_to_last_submit_asc', 'uid_timedistance_to_last_submit_desc', 'uid_timestamp_question_meansubmit', 'uid_timestamp_question_submit', 'uid_unique_submittimes_21600_asc', 'uid_unique_submittimes_3600_asc', 'uid_unique_submittimes_94608000_asc', 'uid_unique_submittimes_94608000_desc', 'uid_timestamp_showcnt'], 'bs': [], 'bt': [], 'bh': []}

In [4]:
kf_or=['uid_question_utilnow_10800_desc', 'uid_question_submittimes_3600_desc', 'uid_concept_utilnow_1209600_asc', 'uid_question_utilnow_31536000_asc', 'concept_showcnt_2', 'uid_question_submittimes_21600_asc', 'kc_type_showcnt_1', 'cnt_finishsimiliar_contentsimstrict_31536000000', 'concepttype_max_showcnt', 'kc_showcnt_1', 'question_accrate', 'kc_type_accrate_4', 'accrate_similiar_strict_contentsim_86400000', 'concept_accrate_1', 'uid_showcnt', 'acctop5max_similiar_contentsim_-21600000', 'concept_type_accrate_1', 'time_day', 'uid_concept_utilnow_31536000_asc', 'kc_type_accrate_5', 'cnt_finishstrictsimiliar_-21600000', 'uid_new_concept_days_min', 'time_weekday_showcnt', 'cnt_accstrictsimiliar_31536000000', 'time_weekday_accrate', 'uid_question_utilnow_259200_asc', 'acctop5_finishstrictsimiliar_contentsimstrict_-1209600000', 'concept_5', 'kc_accrate_2', 'uid_concept_utilnow_21600_asc', 'type_accrate', 'uid_unique_submittimes_21600_asc', 'kc_showcnt_6', 'cnt_accsimiliar_contentsimstrict_1209600000', 'cnt_finishstrictsimiliar_contentsimstrict_-21600000', 'concept_2', 'cnt_accsimiliar_contentsim_1209600000', 'concepttype_min_accrate', 'kc_type_showcnt_5', 'uid_question_utilnow_3600_desc', 'kc_4', 'acctop5_finishsimiliar_contentsim_31536000000', 'kc_6', 'uid_timedistance_to_last_submit_desc', 'time_is_workday_0_accrate', 'uid_accrate', 'acctop1max_similiar_strict_-21600000', 'concept_cnt', 'kc_max_accrate', 'concept_showcnt_3', 'uid_concept_utilnow_21600_desc', 'uid_question_utilnow_1209600_desc', 'kc_type_accrate_7', 'cnt_finishsimiliar_contentsimstrict_-21600000', 'acctop5_finishstrictsimiliar_contentsim_-21600000', 'kc_type_showcnt_4', 'new_concept_days_min', 'acctop5max_similiar_strict_3600000', 'uid_concept_type_accrate_2', 'uid_kc_accrate_8', 'kctype_max_showcnt', 'acctop5_accsimiliar_contentsim_31536000000', 'accrate_similiar_contentsimstrict_3600000', 'kctype_min_showcnt', 'concept_hot_cnt', 'kc_accrate_5', 'accrate_similiar_strict_contentsimstrict_1209600000', 
'uid_concept_type_accrate_4', 'concepttype_min_showcnt', 'acctop5max_similiar_contentsim_31536000000', 'concept_type_showcnt_6', 'concept_type_showcnt_5', 'concept_type_showcnt_1', 'kc_type_accrate_2', 'uidconcepttype_max_accrate', 'acctop5_finishsimiliar_contentsimstrict_86400000', 'concept_1', 'uid_timestamp_question_meansubmit', 'kc_group_cnt', 'acctop5max_similiar_-21600000', 'kc_type_accrate_1', 'concept_4', 'uid_timestamp_showcnt', 'uid_question_submittimes_1209600_desc', 'concepttype_max_accrate', 'concept_mean_showcnt', 'uid_timestamp_question_submit', 'kc_5', 'concepttype_mean_showcnt', 'kctype_min_accrate', 'acctop5max_similiar_strict_-21600000', 'concept_type_showcnt_3', 'uid_question_utilnow_21600_desc', 'uid_concept_type_accrate_5', 'timestamp_question_cnt', 'kc_1', 'concept_showcnt_1', 'kctype_max_accrate', 'uid_kc_accrate_4', 'concept_type_accrate_2', 'concept_type_showcnt_2', 'uid_unique_submittimes_10800_desc', 'timediff1', 'concept_accrate_4', 'question_showcnt', 'uid_timedistance_to_last_submit_asc', 'kc_type_accrate_8', 'kc_type_showcnt_3', 'kc_mean_showcnt', 'acctop5max_similiar_contentsimstrict_-21600000', 'uid_record_cumsum', 'acctop5max_similiar_strict_contentsim_31536000000', 'kc_min_accrate', 'kctype_mean_showcnt', 'question', 'concept_mean_accrate', 'uid_concept_accrate_4', 'uidkc_mean_accrate', 'timediff-1', 'uid_concept_utilnow_259200_desc', 'accrate_similiar_86400000', 'uid_kc_accrate_1', 'kctype_mean_accrate', 'analysis_cnt', 'acctop5max_similiar_strict_contentsim_86400000', 'uid_question_utilnow_21600_asc', 'kc_accrate_4', 'concept_min_showcnt', 'kc_type_showcnt_6', 'concept_showcnt_4', 'acctop1_accstrictsimiliar_86400000', 'uid_concept_type_accrate_1', 'uid_question_utilnow_1209600_asc', 'kc_max_showcnt', 'acctop5_finishstrictsimiliar_contentsimstrict_-21600000', 'concept_max_accrate', 'kc_2', 'concept_type_accrate_4', 'time_hour', 'concept_accrate_2', 'acctop1max_similiar_strict_contentsimstrict_-21600000', 'content_cnt', 
'kc_type_accrate_3', 'concept_type_accrate_3', 'timestamp', 'kc_3', 'new_question_days', 'acctop5_finishstrictsimiliar_-21600000', 'time_year', 'kc_cnt', 'uid_question_utilnow_1314000_desc', 'acctop5max_similiar_strict_contentsim_-21600000', 'uid_concept_utilnow_1209600_desc', 'uid_concept_type_accrate_6', 'acctop1_finishstrictsimiliar_contentsimstrict_3600000', 'uid_kc_accrate_2', 'kc_showcnt_4', 'kc_type_showcnt_2', 'kc_type_showcnt_8', 'uid_question_utilnow_3600_asc', 'uid_unique_submittimes_21600_desc', 'concept_type_accrate_6', 'uid_question_submittimes_3600_asc', 'kc_accrate_1']
kf0=['kc_type_accrate_1', 'concept_showcnt_3', 'cnt_finishstrictsimiliar_-21600000', 'acctop5_finishsimiliar_contentsimstrict_86400000', 'kc_6', 'uid_accrate', 'concept_type_showcnt_5', 'kc_type_showcnt_6', 'time_weekday_accrate', 'acctop5max_similiar_contentsim_-21600000', 'accrate_similiar_strict_contentsimstrict_1209600000', 'concept_cnt', 'kc_type_showcnt_8', 'concept_type_showcnt_1', 'uid_concept_utilnow_21600_asc', 'kc_2', 'kc_accrate_5', 'uid_unique_submittimes_21600_desc', 'uid_kc_accrate_4', 'concepttype_min_showcnt', 'concept_accrate_1', 'accrate_similiar_strict_contentsim_86400000', 'acctop1max_similiar_strict_-21600000', 'uid_concept_utilnow_259200_desc', 'uid_question_utilnow_21600_asc', 'kc_type_showcnt_4', 'uid_kc_accrate_8', 'uid_concept_type_accrate_4', 'acctop5_finishstrictsimiliar_contentsim_-21600000', 'kc_type_accrate_4', 'kc_3', 'uid_unique_submittimes_21600_asc', 'concept_type_accrate_3', 'uid_kc_accrate_2', 'kctype_mean_showcnt', 'uid_question_utilnow_31536000_asc', 'question_accrate', 'acctop5max_similiar_strict_contentsim_86400000', 'concept_accrate_4', 'kc_type_accrate_7', 'uid_timestamp_showcnt', 'kc_type_showcnt_2', 'uid_question_utilnow_3600_desc', 'acctop5_finishstrictsimiliar_contentsimstrict_-1209600000', 'acctop5max_similiar_strict_-21600000', 'uid_concept_accrate_4', 'time_hour', 'kctype_min_accrate', 'concept_max_accrate', 'kc_group_cnt', 'accrate_similiar_contentsimstrict_3600000', 'uid_concept_utilnow_21600_desc', 'uid_timestamp_question_meansubmit', 'acctop5_accsimiliar_contentsim_31536000000', 'concept_mean_showcnt', 'concept_1', 'uid_timedistance_to_last_submit_desc', 'acctop5_finishstrictsimiliar_contentsimstrict_-21600000', 'time_year', 'time_weekday_showcnt', 'cnt_accsimiliar_contentsimstrict_1209600000', 'kc_min_accrate', 'uid_question_submittimes_3600_asc', 'uid_showcnt', 'concepttype_mean_showcnt', 'cnt_finishsimiliar_contentsimstrict_31536000000', 'cnt_finishsimiliar_contentsimstrict_-21600000', 
'acctop1_finishstrictsimiliar_contentsimstrict_3600000', 'uidconcepttype_max_accrate', 'uid_question_utilnow_21600_desc', 'kc_type_accrate_8', 'uid_concept_utilnow_1209600_asc', 'kc_type_showcnt_1', 'timediff1', 'uid_concept_utilnow_1209600_desc', 'concept_type_showcnt_2', 'uid_question_utilnow_3600_asc', 'uid_concept_type_accrate_2', 'analysis_cnt', 'concepttype_max_showcnt', 'uid_question_utilnow_10800_desc', 'concepttype_min_accrate', 'kctype_max_showcnt', 'kctype_min_showcnt', 'timestamp_question_cnt', 'concept_type_accrate_4', 'concept_2', 'kc_type_accrate_3', 'concepttype_max_accrate', 'concept_5', 'kc_cnt', 'kc_max_accrate', 'acctop5_finishstrictsimiliar_-21600000', 'uid_question_utilnow_1209600_desc', 'uid_question_utilnow_1314000_desc', 'type_accrate', 'uid_unique_submittimes_10800_desc', 'concept_hot_cnt', 'cnt_finishstrictsimiliar_contentsimstrict_-21600000', 'uid_question_utilnow_1209600_asc', 'kc_type_accrate_2', 'concept_min_showcnt', 'kc_5', 'kc_accrate_1', 'acctop5max_similiar_-21600000', 'acctop5max_similiar_strict_contentsim_31536000000', 'kc_accrate_4', 'concept_type_accrate_6', 'kc_type_showcnt_3', 'uid_question_submittimes_21600_asc', 'question_showcnt', 'kctype_max_accrate', 'content_cnt', 'uid_new_concept_days_min', 'acctop5max_similiar_contentsim_31536000000', 'acctop1max_similiar_strict_contentsimstrict_-21600000', 'concept_4', 'question', 'kc_type_showcnt_5', 'uidkc_mean_accrate', 'acctop5max_similiar_contentsimstrict_-21600000', 'kc_max_showcnt', 'acctop5_finishsimiliar_contentsim_31536000000', 'acctop1_accstrictsimiliar_86400000', 'acctop5max_similiar_strict_3600000', 'kc_accrate_2', 'uid_concept_utilnow_31536000_asc', 'kc_showcnt_4', 'kc_showcnt_6', 'kc_4', 'uid_question_utilnow_259200_asc', 'kc_type_accrate_5', 'kc_mean_showcnt', 'kc_showcnt_1', 'kc_1', 'new_concept_days_min', 'new_question_days', 'uid_timedistance_to_last_submit_asc', 'uid_question_submittimes_1209600_desc', 'kctype_mean_accrate', 'timestamp', 
'uid_concept_type_accrate_6', 'accrate_similiar_86400000', 'cnt_accsimiliar_contentsim_1209600000', 'uid_kc_accrate_1', 'cnt_accstrictsimiliar_31536000000', 'acctop5max_similiar_strict_contentsim_-21600000', 'uid_question_submittimes_3600_desc']
kf1=['concepttype_min_accrate', 'kc_3', 'concept_cnt', 'uid_question_utilnow_1314000_desc', 'acctop5max_similiar_contentsim_31536000000', 'kc_type_showcnt_4', 'acctop5max_similiar_strict_3600000', 'acctop5max_similiar_-21600000', 'cnt_accsimiliar_contentsimstrict_1209600000', 'kc_accrate_2', 'concept_type_showcnt_6', 'uid_concept_accrate_4', 'kc_showcnt_6', 'uid_question_submittimes_1209600_desc', 'timestamp', 'concept_type_accrate_6', 'kc_mean_showcnt', 'kctype_min_accrate', 'acctop5_finishstrictsimiliar_-21600000', 'uid_timedistance_to_last_submit_asc', 'kc_accrate_4', 'uid_question_utilnow_1209600_asc', 'uid_record_cumsum', 'concept_showcnt_4', 'concept_4', 'kc_type_showcnt_1', 'uid_concept_utilnow_259200_desc', 'timediff-1', 'kc_4', 'uidconcepttype_max_accrate', 'kc_accrate_1', 'time_is_workday_0_accrate', 'kctype_mean_showcnt', 'kc_5', 'uid_question_utilnow_31536000_asc', 'kctype_mean_accrate', 'kc_type_accrate_3', 'acctop5_finishstrictsimiliar_contentsimstrict_-1209600000', 'concept_accrate_4', 'kc_type_showcnt_2', 'concept_5', 'uid_question_utilnow_10800_desc', 'acctop5max_similiar_strict_contentsim_86400000', 'uid_kc_accrate_8', 'concept_max_accrate', 'concept_hot_cnt', 'kc_type_accrate_2', 'uid_kc_accrate_2', 'concept_2', 'uid_question_submittimes_3600_asc', 'uid_question_utilnow_3600_desc', 'concept_accrate_2', 'uid_concept_type_accrate_2', 'concepttype_min_showcnt', 'kc_2', 'uid_accrate', 'uid_question_utilnow_21600_desc', 'time_year', 'concept_type_accrate_4', 'kc_type_accrate_7', 'uid_concept_type_accrate_6', 'cnt_accstrictsimiliar_31536000000', 'new_question_days', 'kc_type_showcnt_5', 'acctop5_finishstrictsimiliar_contentsimstrict_-21600000', 'uid_concept_utilnow_21600_desc', 'kc_type_showcnt_6', 'concept_showcnt_1', 'kc_type_showcnt_8', 'concepttype_max_accrate', 'acctop5_accsimiliar_contentsim_31536000000', 'kc_showcnt_1', 'question_accrate', 'uid_timestamp_question_meansubmit', 'kc_accrate_5', 'kc_cnt', 'uid_concept_utilnow_1209600_asc', 
'new_concept_days_min', 'cnt_finishsimiliar_contentsimstrict_31536000000', 'concept_accrate_1', 'acctop5_finishstrictsimiliar_contentsim_-21600000', 'uid_unique_submittimes_21600_desc', 'time_weekday_accrate', 'kc_type_accrate_4', 'concept_1', 'concept_mean_showcnt', 'uid_timedistance_to_last_submit_desc', 'accrate_similiar_strict_contentsim_86400000', 'time_day', 'concept_showcnt_2', 'uid_timestamp_question_submit', 'uid_concept_type_accrate_5', 'acctop5max_similiar_strict_contentsim_-21600000', 'kc_type_showcnt_3', 'kc_type_accrate_8', 'uid_question_submittimes_21600_asc', 'uid_kc_accrate_4', 'concept_type_showcnt_1', 'content_cnt', 'concept_mean_accrate', 'kc_max_accrate', 'analysis_cnt', 'kc_max_showcnt', 'concepttype_mean_showcnt', 'type_accrate', 'uid_concept_utilnow_31536000_asc', 'question_showcnt', 'timestamp_question_cnt', 'uidkc_mean_accrate', 'kc_showcnt_4', 'kc_type_accrate_1', 'acctop5max_similiar_contentsimstrict_-21600000', 'uid_unique_submittimes_10800_desc', 'kc_6', 'kc_1', 'acctop5max_similiar_strict_contentsim_31536000000', 'uid_unique_submittimes_21600_asc', 'uid_question_submittimes_3600_desc', 'kctype_max_showcnt', 'accrate_similiar_86400000', 'concept_type_accrate_2', 'cnt_finishstrictsimiliar_-21600000', 'acctop5_finishsimiliar_contentsimstrict_86400000', 'kctype_max_accrate', 'concept_showcnt_3', 'uid_timestamp_showcnt', 'uid_concept_type_accrate_1', 'cnt_finishsimiliar_contentsimstrict_-21600000', 'acctop1_finishstrictsimiliar_contentsimstrict_3600000', 'concept_type_showcnt_5', 'question', 'concept_type_accrate_1', 'acctop1_accstrictsimiliar_86400000', 'concept_type_showcnt_3', 'accrate_similiar_contentsimstrict_3600000', 'uid_concept_utilnow_21600_asc', 'acctop1max_similiar_strict_-21600000', 'kc_type_accrate_5', 'kctype_min_showcnt', 'concepttype_max_showcnt', 'uid_concept_type_accrate_4', 'uid_question_utilnow_21600_asc', 'cnt_accsimiliar_contentsim_1209600000', 'uid_question_utilnow_3600_asc', 'uid_new_concept_days_min', 
'acctop5_finishsimiliar_contentsim_31536000000', 'uid_question_utilnow_1209600_desc', 'timediff1', 'uid_question_utilnow_259200_asc', 'concept_type_accrate_3', 'kc_group_cnt', 'cnt_finishstrictsimiliar_contentsimstrict_-21600000', 'kc_min_accrate', 'time_weekday_showcnt', 'uid_concept_utilnow_1209600_desc', 'acctop5max_similiar_strict_-21600000', 'acctop1max_similiar_strict_contentsimstrict_-21600000', 'time_hour']
kf9=['kc_showcnt_6', 'uid_kc_accrate_8', 'acctop5max_similiar_-21600000', 'acctop5max_similiar_contentsim_31536000000', 'uid_timestamp_question_meansubmit', 'type_accrate', 'acctop1max_similiar_strict_-21600000', 'uid_kc_accrate_4', 'kctype_max_showcnt', 'acctop5_finishstrictsimiliar_-21600000', 'concepttype_mean_showcnt', 'content_cnt', 'uid_new_concept_days_min', 'kc_1', 'kc_cnt', 'kc_3', 'acctop5_finishstrictsimiliar_contentsimstrict_-21600000', 'concept_hot_cnt', 'acctop5_finishstrictsimiliar_contentsim_-21600000', 'acctop1_finishstrictsimiliar_contentsimstrict_3600000', 'uid_unique_submittimes_21600_asc', 'uid_question_utilnow_1209600_desc', 'kc_type_accrate_7', 'new_question_days', 'uid_timedistance_to_last_submit_desc', 'cnt_accsimiliar_contentsim_1209600000', 'kc_showcnt_1', 'accrate_similiar_contentsimstrict_3600000', 'uid_question_utilnow_259200_asc', 'timestamp_question_cnt', 'uid_concept_utilnow_1209600_asc', 'kc_accrate_4', 'uid_unique_submittimes_21600_desc', 'uid_concept_accrate_4', 'acctop5max_similiar_contentsimstrict_-21600000', 'uid_concept_utilnow_21600_desc', 'uid_question_utilnow_10800_desc', 'kctype_min_showcnt', 'cnt_finishsimiliar_contentsimstrict_-21600000', 'acctop5max_similiar_strict_3600000', 'acctop1max_similiar_strict_contentsimstrict_-21600000', 'kc_type_showcnt_6', 'concepttype_min_accrate', 'acctop1_accstrictsimiliar_86400000', 'uid_concept_type_accrate_4', 'kc_5', 'kc_type_accrate_4', 'question_showcnt', 'concept_max_accrate', 'acctop5_accsimiliar_contentsim_31536000000', 'timediff1', 'uid_kc_accrate_2', 'kc_2', 'uid_question_utilnow_1209600_asc', 'kc_type_accrate_2', 'kc_mean_showcnt', 'kc_min_accrate', 'analysis_cnt', 'kc_type_showcnt_4', 'uid_concept_utilnow_31536000_asc', 'acctop5_finishsimiliar_contentsim_31536000000', 'uidconcepttype_max_accrate', 'cnt_accsimiliar_contentsimstrict_1209600000', 'kc_accrate_5', 'kc_accrate_1', 'accrate_similiar_86400000', 'concept_2', 'acctop5max_similiar_strict_contentsim_86400000', 
'concept_1', 'kctype_mean_showcnt', 'kc_group_cnt', 'concept_type_accrate_6', 'acctop5_finishsimiliar_contentsimstrict_86400000', 'acctop5max_similiar_strict_contentsim_31536000000', 'cnt_finishstrictsimiliar_contentsimstrict_-21600000', 'concept_type_accrate_4', 'question', 'kctype_max_accrate', 'uid_question_utilnow_1314000_desc', 'concept_mean_showcnt', 'kc_type_showcnt_2', 'acctop5max_similiar_strict_-21600000', 'kc_6', 'kc_showcnt_4', 'kctype_min_accrate', 'uid_concept_utilnow_259200_desc', 'uid_question_utilnow_3600_asc', 'new_concept_days_min', 'concepttype_min_showcnt', 'question_accrate', 'uid_question_utilnow_3600_desc', 'time_weekday_accrate', 'kc_type_showcnt_1', 'uid_question_submittimes_1209600_desc', 'kctype_mean_accrate', 'uid_question_utilnow_21600_asc', 'concept_accrate_4', 'uid_question_utilnow_21600_desc', 'kc_max_accrate', 'kc_type_accrate_3', 'kc_type_showcnt_5', 'uid_question_submittimes_3600_asc', 'concept_min_showcnt', 'uid_concept_type_accrate_6', 'concept_type_showcnt_5', 'time_hour', 'cnt_accstrictsimiliar_31536000000', 'kc_type_accrate_8', 'uid_unique_submittimes_10800_desc', 'uid_concept_type_accrate_2', 'kc_type_accrate_1', 'uid_question_submittimes_21600_asc', 'kc_type_showcnt_8', 'cnt_finishsimiliar_contentsimstrict_31536000000', 'time_weekday_showcnt', 'concept_4', 'uid_concept_utilnow_1209600_desc', 'acctop5max_similiar_contentsim_-21600000', 'concept_type_showcnt_1', 'kc_4', 'time_year', 'uid_accrate', 'uid_question_submittimes_3600_desc', 'uidkc_mean_accrate', 'accrate_similiar_strict_contentsim_86400000', 'concept_cnt', 'uid_question_utilnow_31536000_asc', 'concept_showcnt_3', 'cnt_finishstrictsimiliar_-21600000', 'concepttype_max_showcnt', 'kc_accrate_2', 'uid_timedistance_to_last_submit_asc', 'uid_timestamp_showcnt', 'timestamp', 'concepttype_max_accrate', 'concept_type_accrate_3', 'accrate_similiar_strict_contentsimstrict_1209600000', 'uid_showcnt', 'kc_type_showcnt_3', 'uid_kc_accrate_1', 'concept_5', 'kc_max_showcnt', 
'concept_type_showcnt_2', 'acctop5_finishstrictsimiliar_contentsimstrict_-1209600000']
kf_and=['concepttype_max_accrate', 'concept_mean_showcnt', 'kc_5', 'concepttype_mean_showcnt', 'kctype_min_accrate', 'uid_question_utilnow_10800_desc', 'acctop5max_similiar_strict_-21600000', 'uid_question_submittimes_3600_desc', 'uid_question_utilnow_21600_desc', 'uid_concept_utilnow_1209600_asc', 'uid_question_utilnow_31536000_asc', 'uid_question_submittimes_21600_asc', 'kc_type_showcnt_1', 'cnt_finishsimiliar_contentsimstrict_31536000000', 'concepttype_max_showcnt', 'timestamp_question_cnt', 'kc_1', 'kctype_max_accrate', 'kc_showcnt_1', 'uid_kc_accrate_4', 'question_accrate', 'uid_unique_submittimes_10800_desc', 'kc_type_accrate_4', 'timediff1', 'accrate_similiar_strict_contentsim_86400000', 'concept_accrate_4', 'question_showcnt', 'uid_timedistance_to_last_submit_asc', 'kc_type_accrate_8', 'uid_concept_utilnow_31536000_asc', 'kc_type_showcnt_3', 'cnt_finishstrictsimiliar_-21600000', 'kc_mean_showcnt', 'acctop5max_similiar_contentsimstrict_-21600000', 'uid_new_concept_days_min', 'acctop5max_similiar_strict_contentsim_31536000000', 'kc_min_accrate', 'kctype_mean_showcnt', 'question', 'time_weekday_showcnt', 'uid_concept_accrate_4', 'uidkc_mean_accrate', 'cnt_accstrictsimiliar_31536000000', 'time_weekday_accrate', 'uid_concept_utilnow_259200_desc', 'uid_question_utilnow_259200_asc', 'acctop5_finishstrictsimiliar_contentsimstrict_-1209600000', 'concept_5', 'kc_accrate_2', 'accrate_similiar_86400000', 'type_accrate', 'uid_unique_submittimes_21600_asc', 'kctype_mean_accrate', 'analysis_cnt', 'kc_showcnt_6', 'acctop5max_similiar_strict_contentsim_86400000', 'cnt_accsimiliar_contentsimstrict_1209600000', 'uid_question_utilnow_21600_asc', 'cnt_finishstrictsimiliar_contentsimstrict_-21600000', 'concept_2', 'kc_accrate_4', 'cnt_accsimiliar_contentsim_1209600000', 'kc_type_showcnt_6', 'concepttype_min_accrate', 'kc_type_showcnt_5', 'acctop1_accstrictsimiliar_86400000', 'uid_question_utilnow_3600_desc', 'uid_question_utilnow_1209600_asc', 'kc_4', 
'acctop5_finishsimiliar_contentsim_31536000000', 'kc_max_showcnt', 'acctop5_finishstrictsimiliar_contentsimstrict_-21600000', 'kc_6', 'uid_timedistance_to_last_submit_desc', 'concept_max_accrate', 'uid_accrate', 'kc_2', 'concept_type_accrate_4', 'uid_question_submittimes_1209600_desc', 'acctop1max_similiar_strict_-21600000', 'time_hour', 'acctop1max_similiar_strict_contentsimstrict_-21600000', 'concept_cnt', 'kc_max_accrate', 'concept_showcnt_3', 'uid_concept_utilnow_21600_desc', 'content_cnt', 'kc_type_accrate_3', 'uid_question_utilnow_1209600_desc', 'concept_type_accrate_3', 'timestamp', 'kc_type_accrate_7', 'kc_3', 'cnt_finishsimiliar_contentsimstrict_-21600000', 'new_question_days', 'acctop5_finishstrictsimiliar_contentsim_-21600000', 'acctop5_finishstrictsimiliar_-21600000', 'time_year', 'kc_cnt', 'uid_question_utilnow_1314000_desc', 'kc_type_showcnt_4', 'new_concept_days_min', 'uid_concept_utilnow_1209600_desc', 'uid_concept_type_accrate_2', 'acctop5max_similiar_strict_3600000', 'uid_kc_accrate_8', 'kctype_max_showcnt', 'acctop5_accsimiliar_contentsim_31536000000', 'uid_concept_type_accrate_6', 'accrate_similiar_contentsimstrict_3600000', 'kctype_min_showcnt', 'concept_hot_cnt', 'kc_accrate_5', 'acctop1_finishstrictsimiliar_contentsimstrict_3600000', 'uid_concept_type_accrate_4', 'concepttype_min_showcnt', 'acctop5max_similiar_contentsim_31536000000', 'concept_type_showcnt_5', 'concept_type_showcnt_1', 'uid_kc_accrate_2', 'kc_type_accrate_2', 'kc_showcnt_4', 'kc_type_showcnt_2', 'uidconcepttype_max_accrate', 'acctop5_finishsimiliar_contentsimstrict_86400000', 'kc_type_showcnt_8', 'concept_1', 'uid_question_utilnow_3600_asc', 'uid_unique_submittimes_21600_desc', 'uid_timestamp_question_meansubmit', 'kc_group_cnt', 'concept_4', 'acctop5max_similiar_-21600000', 'kc_type_accrate_1', 'concept_type_accrate_6', 'uid_timestamp_showcnt', 'uid_question_submittimes_3600_asc', 'kc_accrate_1']
# Local features that may be needed: every name in kf_or that is not yet a
# column of df. Names containing '00' go to bucket 1, all others to bucket 0.
columns_sim_res = [[], []]
for col in kf_or:
    if col in df.columns:
        continue
    columns_sim_res[1 if '00' in col else 0].append(col)

# Bucket 0 is ordered alphabetically; bucket 1 by the integer after the last '_'.
columns_sim_res[0].sort()
columns_sim_res[1].sort(key=lambda name: int(name.rpartition('_')[2]))

# Distinct bucket-1 name stems, i.e. each name with its trailing '_<suffix>' removed.
columns_sim_set = {name.rpartition('_')[0] for name in columns_sim_res[1]}

### 相似问题的历史特征

In [5]:
def _reverse_similar_questions(pair_accrate, question_accrate, slack, min_accrate=0.95):
    """For every target question, collect the earlier questions that predict it.

    Parameters
    ----------
    pair_accrate : 2-D numpy array
        ``pair_accrate[j, i]`` is the accuracy on question ``i`` observed after
        question ``j`` appeared in the same user's history (0 where masked out).
    question_accrate : mapping / pandas Series
        Overall accuracy of each question, looked up by question id.
    slack : float
        Fraction of the target's overall error rate that the conditional error
        rate must stay below.
    min_accrate : float
        Minimum conditional accuracy for a pair to be considered.

    Returns
    -------
    list of dict
        Entry ``i`` maps source question id ``j`` -> ``pair_accrate[j, i]`` for
        every ``j`` with ``pair_accrate[j, i] > min_accrate`` and
        ``(1 - question_accrate[i]) * slack > 1 - pair_accrate[j, i]``.
    """
    similar = []
    for target in range(len(pair_accrate)):
        column = pair_accrate[:, target]
        # Only rows above the accuracy threshold can qualify, so the Python-level
        # work is limited to those instead of scanning every matrix entry.
        candidates = np.flatnonzero(column > min_accrate)
        if candidates.size == 0:
            similar.append({})
            continue
        # Looked up only when a candidate exists, so questions that never
        # occur in question_accrate are never indexed.
        allowed_miss = (1 - question_accrate[target]) * slack
        similar.append({int(j): column[j] for j in candidates if allowed_miss > (1 - column[j])})
    return similar


def get_sim_metrix(df, df_test, df_val=None, max_question_id=7651):
    """Rebuild the module-level question co-occurrence matrices and similarity lookups.

    Only rows with ``response != -1`` are used. Rows of ``df``, ``df_test`` and
    (if given) ``df_val`` are concatenated in that order and scanned once; a
    change of ``uid`` between consecutive rows starts a fresh history, so rows
    of one user are expected to be contiguous (presumably already sorted by
    user and time -- confirm against the caller).

    Parameters
    ----------
    df, df_test : pandas.DataFrame
        Must contain the columns ``uid``, ``question`` and ``response``.
    df_val : pandas.DataFrame, optional
        Extra frame with the same columns.
    max_question_id : int, default 7651
        Largest question id; every matrix is ``(max_question_id + 1)`` square.

    Returns
    -------
    None
        Results are written to these module globals:
        ``question_interact`` / ``question_interact_ok`` (pairs seen / pairs
        where the later question was answered correctly, for every earlier
        question in the history), ``question_interact_true`` /
        ``question_interact_true_ok`` (same, restricted to earlier questions
        that were answered correctly), and the four lookups
        ``question_similiar_reverse``, ``question_true_similiar_reverse``,
        ``question_similiar_reverse_strict``,
        ``question_true_similiar_reverse_strict``.
    """
    global question_interact_ok,question_interact,question_interact_true_ok,question_interact_true
    global question_similiar_reverse,question_true_similiar_reverse,question_similiar_reverse_strict,question_true_similiar_reverse_strict

    # Keep just the columns that are read below so the concatenation does not
    # copy every feature column of the input frames.
    useful_cols = ['response','question','uid']
    frames = [df, df_test] if df_val is None else [df, df_test, df_val]
    df_all_response = pd.concat(
        [frame.loc[frame['response'] != -1, useful_cols] for frame in frames],
        axis=0, ignore_index=True)

    question_accrate = df_all_response.groupby('question')['response'].mean()
    matrix_shape = [max_question_id + 1, max_question_id + 1]
    question_interact_ok = np.zeros(matrix_shape)
    question_interact = np.zeros(matrix_shape)
    question_interact_true_ok = np.zeros(matrix_shape)
    question_interact_true = np.zeros(matrix_shape)

    last_uid = -1
    finished_questions = []
    true_questions = []
    # itertuples avoids building one Series per row, which is what makes
    # iterrows slow on frames with millions of rows.
    rows = df_all_response[['uid', 'question', 'response']].itertuples(index=False, name=None)
    for uid, question, response in rows:
        if uid != last_uid:
            # First row of a new user: start a fresh history, nothing to count yet.
            last_uid = uid
            finished_questions = [question]
            true_questions = [question] if response==1 else []
            continue
        # NOTE: the history lists can hold the same question several times, but
        # NumPy's fancy-indexed `+=` is buffered, so each distinct earlier
        # question is incremented once per row.
        question_interact[finished_questions, question] += 1
        question_interact_true[true_questions, question] += 1
        if response == 1:
            question_interact_ok[finished_questions, question] += 1
            question_interact_true_ok[true_questions, question] += 1
            true_questions.append(question)
        finished_questions.append(question)

    # Similar questions (i -> j): after A, is B answered correctly?
    # Pairs need more than 20 observations; the tiny epsilon avoids 0/0.
    question_interact_accrate = question_interact_ok/(question_interact+0.00001)*(question_interact>20)
    question_interact_true_accrate = question_interact_true_ok/(question_interact_true+0.00001)*(question_interact_true>20)
    question_similiar_reverse = _reverse_similar_questions(question_interact_accrate, question_accrate, 0.5)
    question_true_similiar_reverse = _reverse_similar_questions(question_interact_true_accrate, question_accrate, 0.5)

    # Strict variant: more than 40 observations and a tighter error margin.
    question_interact_accrate = question_interact_ok/(question_interact+0.00001)*(question_interact>40)
    question_interact_true_accrate = question_interact_true_ok/(question_interact_true+0.00001)*(question_interact_true>40)
    question_similiar_reverse_strict = _reverse_similar_questions(question_interact_accrate, question_accrate, 0.25)
    question_true_similiar_reverse_strict = _reverse_similar_questions(question_interact_true_accrate, question_accrate, 0.25)

In [6]:
# Helper frames for this cell only (both are deleted at the end):
#   df_all          - every train and test row
#   df_all_response - train rows plus the test rows that have a response (!= -1)
df_all = pd.concat([df,df_test], axis=0)
df_all_response = pd.concat([df,df_test[df_test['response']!=-1]], axis=0)

# Per question: the max of each concept column, and the mean response.
question_info = df_all.groupby('question')[['concept_1','concept_2','concept_3','concept_4','concept_5','concept_6']].max()
question_accrate = df_all_response.groupby('question')['response'].mean()

_n_questions = 7652  # size of the per-question lookup lists below

# question_info_dict[q] = set of concept values of question q; -1 entries are skipped.
question_info_dict = [set() for _ in range(_n_questions)]
for i, concepts in question_info.iterrows():
    question_info_dict[i] = {x for x in concepts if x != -1}

question_info_sim = [set() for _ in range(_n_questions)]
question_info_sim_strict = [set() for _ in range(_n_questions)]  # similar questions of j, the question about to be answered; symmetric
for i in range(_n_questions):
    _concepts_i = question_info_dict[i]
    for j in range(i+1, _n_questions):
        _concepts_j = question_info_dict[j]
        if _concepts_i == _concepts_j:
            # Identical concept sets: similar under both the strict and the loose rule.
            question_info_sim_strict[i].add(j)
            question_info_sim_strict[j].add(i)
            question_info_sim[i].add(j)
            question_info_sim[j].add(i)
        elif (len(_concepts_i)>1 or len(_concepts_j)>1) and len(_concepts_i-_concepts_j)<=1 and len(_concepts_j-_concepts_i)<=1:
            # Loose rule: at least one side has several concepts and each side
            # has at most one concept the other side lacks.
            question_info_sim[i].add(j)
            question_info_sim[j].add(i)

# Zero-initialised question x question matrices, sized by the largest question id seen.
_matrix_size = df_all['question'].max()+1
question_interact_ok = np.zeros([_matrix_size, _matrix_size])
question_interact = np.zeros([_matrix_size, _matrix_size])
question_interact_true_ok = np.zeros([_matrix_size, _matrix_size])
question_interact_true = np.zeros([_matrix_size, _matrix_size])

del df_all,df_all_response #, question_accrate, question_interact_ok, question_interact, question_interact_true_ok, question_interact_true
gc.collect()

0

In [14]:
## Note: questions with a very small value_counts should be removed -- they are too noisy
# Only the three columns read by the loop are concatenated; test rows without
# a response (response == -1) are left out.
_history_cols = ['uid', 'question', 'response']
df_all_response = pd.concat([df[_history_cols], df_test.loc[df_test['response']!=-1, _history_cols]], axis=0)
question_accrate = df_all_response.groupby('question')['response'].mean()

# The count matrices are created in another cell; zero them here so that
# re-running this cell does not add the same counts a second time.
for _matrix in (question_interact, question_interact_true, question_interact_ok, question_interact_true_ok):
    _matrix.fill(0)

last_uid = -1
finished_questions = []
true_questions = []
# itertuples avoids creating one Series per row (the cost that dominates iterrows);
# passing total lets tqdm show a real progress bar.
_rows = df_all_response[_history_cols].itertuples(index=False, name=None)
for uid, question, response in tqdm(_rows, total=len(df_all_response)):
    if uid != last_uid:
        # First row of a new user: start a fresh history, nothing to count yet.
        last_uid = uid
        finished_questions = [question]
        true_questions = [question] if response==1 else []
        continue
    # NOTE: NumPy's fancy-indexed `+=` is buffered, so a question that occurs
    # several times in a history list is incremented once per row.
    question_interact[finished_questions, question] += 1
    question_interact_true[true_questions, question] += 1
    if response == 1:
        question_interact_ok[finished_questions, question] += 1
        question_interact_true_ok[true_questions, question] += 1
        true_questions.append(question)
    finished_questions.append(question)

# Similar questions (i -> j). Pairs need more than 20 observations; the epsilon avoids 0/0.
question_interact_accrate = question_interact_ok/(question_interact+0.00001)*(question_interact>20)
question_interact_true_accrate = question_interact_true_ok/(question_interact_true+0.00001)*(question_interact_true>20)
#question_similiar = [{j:acc for j, acc in enumerate(question_interact_accrate[i,:]) if acc>0.95 and (1-question_accrate[j])*0.5>(1-acc)} for i in range(len(question_interact_accrate))]
question_similiar_reverse = [{j:acc for j, acc in enumerate(question_interact_accrate[:,i]) if acc>0.95 and (1-question_accrate[i])*0.5>(1-acc)} for i in range(len(question_interact_accrate))]
#question_true_similiar = [{j:acc for j, acc in enumerate(question_interact_true_accrate[i,:]) if acc>0.95 and (1-question_accrate[j])*0.5>(1-acc)} for i in range(len(question_interact_true_accrate))]
question_true_similiar_reverse = [{j:acc for j, acc in enumerate(question_interact_true_accrate[:,i]) if acc>0.95 and (1-question_accrate[i])*0.5>(1-acc)} for i in range(len(question_interact_true_accrate))]
# Strict variant: more than 40 observations and a tighter error margin (0.25 instead of 0.5).
question_interact_accrate = question_interact_ok/(question_interact+0.00001)*(question_interact>40)
question_interact_true_accrate = question_interact_true_ok/(question_interact_true+0.00001)*(question_interact_true>40)
question_similiar_reverse_strict = [{j:acc for j, acc in enumerate(question_interact_accrate[:,i]) if acc>0.95 and (1-question_accrate[i])*0.25>(1-acc)} for i in range(len(question_interact_accrate))]
question_true_similiar_reverse_strict = [{j:acc for j, acc in enumerate(question_interact_true_accrate[:,i]) if acc>0.95 and (1-question_accrate[i])*0.25>(1-acc)} for i in range(len(question_interact_true_accrate))]

del df_all_response #, question_accrate, question_interact_ok, question_interact, question_interact_true_ok, question_interact_true
gc.collect()

4997233it [30:52, 2697.83it/s]


0

In [7]:
# Intentionally disabled: the whole cell is a single string literal, so nothing
# is executed or written to disk. Remove the triple quotes to dump the four
# similarity tables to ./input/question_similiar2.pickle.
"""
import pickle
with open('./input/question_similiar2.pickle','wb+') as f:
    tmp = {
        'question_similiar_reverse':question_similiar_reverse,
        'question_true_similiar_reverse':question_true_similiar_reverse,
        'question_similiar_reverse_strict':question_similiar_reverse_strict,
        'question_true_similiar_reverse_strict':question_true_similiar_reverse_strict
    }
    pickle.dump(tmp, f)
"""

"\nimport pickle\nwith open('./input/question_similiar2.pickle','wb+') as f:\n    tmp = {\n        'question_similiar_reverse':question_similiar_reverse,\n        'question_true_similiar_reverse':question_true_similiar_reverse,\n        'question_similiar_reverse_strict':question_similiar_reverse_strict,\n        'question_true_similiar_reverse_strict':question_true_similiar_reverse_strict\n    }\n    pickle.dump(tmp, f)\n"

In [8]:
# import pickle
# with open('./input/question_similiar2.pickle','rb') as f:
#     tmp = pickle.load(f)
# question_similiar_reverse = tmp['question_similiar_reverse']
# question_true_similiar_reverse = tmp['question_true_similiar_reverse']
# question_similiar_reverse_strict = tmp['question_similiar_reverse_strict']
# question_true_similiar_reverse_strict = tmp['question_true_similiar_reverse_strict']

### 局部特征-sim

In [7]:
# Also bring in the information already available in the test set
# (for feature engineering and for the training set).

# Column name -> numpy dtype for every feature column.
# NOTE(review): presumably used to down-cast the feature frame to save memory --
# confirm at the call site.
useful_features_dtypes = {
    # --- raw record columns ---
    'uid': np.dtype('int16'), 'question': np.dtype('int16'), 'response': np.dtype('int8'),
    'timestamp': np.dtype('int64'), 'type': np.dtype('int8'),
    'concept_cnt': np.dtype('int8'), 'concept_hot_cnt': np.dtype('int8'),
    'concept_1': np.dtype('int16'), 'concept_2': np.dtype('int16'), 'concept_3': np.dtype('int16'),
    'concept_4': np.dtype('int16'), 'concept_5': np.dtype('int16'), 'concept_6': np.dtype('int16'),
    'content_cnt': np.dtype('int16'), 'kc_group_cnt': np.dtype('int8'), 'kc_cnt': np.dtype('int8'),
    'kc_1': np.dtype('int16'), 'kc_2': np.dtype('int16'), 'kc_3': np.dtype('int16'),
    'kc_4': np.dtype('int16'), 'kc_5': np.dtype('int16'), 'kc_6': np.dtype('int16'),
    'kc_7': np.dtype('int16'), 'kc_8': np.dtype('int8'),
    'analysis_cnt': np.dtype('int16'),
    'time_day': np.dtype('int16'), 'time_hour': np.dtype('int8'), 'time_is_workday': np.dtype('int8'),
    'time_weekday': np.dtype('int8'), 'time_year': np.dtype('int16'),
    'data_type': np.dtype('O'),
    'uid_record_cumsum': np.dtype('int16'), 'uid_record_sum': np.dtype('int16'),
    # --- per-concept show counts / accuracy rates ---
    'concept_showcnt_1': np.dtype('int32'), 'concept_accrate_1': np.dtype('float32'),
    'concept_showcnt_2': np.dtype('int32'), 'concept_accrate_2': np.dtype('float32'),
    'concept_showcnt_3': np.dtype('int32'), 'concept_accrate_3': np.dtype('float32'),
    'concept_showcnt_4': np.dtype('int16'), 'concept_accrate_4': np.dtype('float32'),
    'concept_showcnt_5': np.dtype('int16'), 'concept_accrate_5': np.dtype('float32'),
    'concept_showcnt_6': np.dtype('int16'), 'concept_accrate_6': np.dtype('float32'),
    'concept_type_showcnt_1': np.dtype('int32'), 'concept_type_accrate_1': np.dtype('float32'),
    'concept_type_showcnt_2': np.dtype('int32'), 'concept_type_accrate_2': np.dtype('float32'),
    'concept_type_showcnt_3': np.dtype('int16'), 'concept_type_accrate_3': np.dtype('float32'),
    'concept_type_showcnt_4': np.dtype('int16'), 'concept_type_accrate_4': np.dtype('float32'),
    'concept_type_showcnt_5': np.dtype('int16'), 'concept_type_accrate_5': np.dtype('float32'),
    'concept_type_showcnt_6': np.dtype('int16'), 'concept_type_accrate_6': np.dtype('float32'),
    'concept_min_accrate': np.dtype('float32'), 'concept_mean_accrate': np.dtype('float32'),
    'concept_max_accrate': np.dtype('float32'),
    'concept_min_showcnt': np.dtype('int32'), 'concept_mean_showcnt': np.dtype('float32'),
    'concept_max_showcnt': np.dtype('int32'),
    'concepttype_min_accrate': np.dtype('float32'), 'concepttype_mean_accrate': np.dtype('float32'),
    'concepttype_max_accrate': np.dtype('float32'),
    'concepttype_min_showcnt': np.dtype('int32'), 'concepttype_mean_showcnt': np.dtype('float32'),
    'concepttype_max_showcnt': np.dtype('int32'),
    # --- per-kc show counts / accuracy rates ---
    'kc_showcnt_1': np.dtype('int32'), 'kc_accrate_1': np.dtype('float32'),
    'kc_showcnt_2': np.dtype('int32'), 'kc_accrate_2': np.dtype('float32'),
    'kc_showcnt_3': np.dtype('int32'), 'kc_accrate_3': np.dtype('float32'),
    'kc_showcnt_4': np.dtype('int32'), 'kc_accrate_4': np.dtype('float32'),
    'kc_showcnt_5': np.dtype('int32'), 'kc_accrate_5': np.dtype('float32'),
    'kc_showcnt_6': np.dtype('int32'), 'kc_accrate_6': np.dtype('float32'),
    'kc_showcnt_7': np.dtype('int32'), 'kc_accrate_7': np.dtype('float32'),
    'kc_showcnt_8': np.dtype('int32'), 'kc_accrate_8': np.dtype('float32'),
    'kc_type_showcnt_1': np.dtype('int32'), 'kc_type_accrate_1': np.dtype('float32'),
    'kc_type_showcnt_2': np.dtype('int32'), 'kc_type_accrate_2': np.dtype('float32'),
    'kc_type_showcnt_3': np.dtype('int32'), 'kc_type_accrate_3': np.dtype('float32'),
    'kc_type_showcnt_4': np.dtype('int32'), 'kc_type_accrate_4': np.dtype('float32'),
    'kc_type_showcnt_5': np.dtype('int32'), 'kc_type_accrate_5': np.dtype('float32'),
    'kc_type_showcnt_6': np.dtype('int32'), 'kc_type_accrate_6': np.dtype('float32'),
    'kc_type_showcnt_7': np.dtype('int32'), 'kc_type_accrate_7': np.dtype('float32'),
    'kc_type_showcnt_8': np.dtype('int32'), 'kc_type_accrate_8': np.dtype('float32'),
    'kc_min_accrate': np.dtype('float32'), 'kc_mean_accrate': np.dtype('float32'),
    'kc_max_accrate': np.dtype('float32'),
    'kc_min_showcnt': np.dtype('int32'), 'kc_mean_showcnt': np.dtype('float32'),
    'kc_max_showcnt': np.dtype('int32'),
    'kctype_min_accrate': np.dtype('float32'), 'kctype_mean_accrate': np.dtype('float32'),
    'kctype_max_accrate': np.dtype('float32'),
    'kctype_min_showcnt': np.dtype('int32'), 'kctype_mean_showcnt': np.dtype('float32'),
    'kctype_max_showcnt': np.dtype('int32'),
    # --- question / type / time statistics ---
    'question_showcnt': np.dtype('int16'), 'question_accrate': np.dtype('float32'),
    'type_showcnt': np.dtype('int32'), 'type_accrate': np.dtype('float32'),
    'time_hour_showcnt': np.dtype('int32'), 'time_hour_accrate': np.dtype('float32'),
    'time_is_workday_showcnt': np.dtype('int32'), 'time_is_workday_accrate': np.dtype('float32'),
    'time_weekday_showcnt': np.dtype('int32'), 'time_weekday_accrate': np.dtype('float32'),
    'timestamp_question_cnt': np.dtype('int16'), 'timestamp_question_accrate': np.dtype('float32'),
    'question_showcnt_without_timestamp_question': np.dtype('int16'),
    'question_accrate_without_timestamp_question': np.dtype('float32'),
    # --- per-user windowed history ---
    'uid_question_utilnow_3600_asc': np.dtype('int8'),
    'uid_question_submittimes_3600_asc': np.dtype('int16'),
    'uid_unique_submittimes_3600_asc': np.dtype('int8'),
    'uid_timedistance_to_last_submit_asc': np.dtype('int32'),
    'uid_timestamp_question_submit': np.dtype('int8'),
    'uid_question_utilnow_21600_asc': np.dtype('int8'),
    'uid_question_submittimes_21600_asc': np.dtype('int16'),
    'uid_unique_submittimes_21600_asc': np.dtype('int8'),
    'uid_question_utilnow_94608000_asc': np.dtype('int8'),
    'uid_question_submittimes_94608000_asc': np.dtype('int16'),
    'uid_unique_submittimes_94608000_asc': np.dtype('int16'),
    'uid_question_utilnow_94608000_desc': np.dtype('int8'),
    'uid_question_submittimes_94608000_desc': np.dtype('int16'),
    'uid_unique_submittimes_94608000_desc': np.dtype('int16'),
    'uid_timedistance_to_last_submit_desc': np.dtype('int32'),
    'uid_kc_utilnow_3600_asc': np.dtype('int16'), 'uid_kc_utilnow_21600_asc': np.dtype('int16'),
    'uid_kc_utilnow_94608000_asc': np.dtype('int16'), 'uid_kc_utilnow_94608000_desc': np.dtype('int16'),
    'new_kc_days_min': np.dtype('float32'), 'uid_new_kc_days_min': np.dtype('float32'),
    'uid_concept_utilnow_3600_asc': np.dtype('int8'), 'uid_concept_utilnow_21600_asc': np.dtype('int8'),
    'uid_concept_utilnow_94608000_asc': np.dtype('int8'), 'uid_concept_utilnow_94608000_desc': np.dtype('int8'),
    'new_concept_days_min': np.dtype('float32'), 'uid_new_concept_days_min': np.dtype('float32'),
    'uid_timestamp_question_meansubmit': np.dtype('float32'), 'uid_timestamp_showcnt': np.dtype('int8'),
    'new_question_days': np.dtype('float32'), 'uid_new_question_days': np.dtype('float32'),
    'real_response': np.dtype('int8'), 'is_enhance': np.dtype('int8'),
    # --- per-user accuracy rates ---
    'uid_showcnt': np.dtype('int16'), 'uid_accrate': np.dtype('float32'),
    'uid_question_accrate': np.dtype('float32'),
    'uid_concept_accrate_1': np.dtype('float32'), 'uid_concept_accrate_2': np.dtype('float32'),
    'uid_concept_accrate_3': np.dtype('float32'), 'uid_concept_accrate_4': np.dtype('float32'),
    'uid_concept_accrate_5': np.dtype('float32'), 'uid_concept_accrate_6': np.dtype('float32'),
    'uid_concept_type_accrate_1': np.dtype('float32'), 'uid_concept_type_accrate_2': np.dtype('float32'),
    'uid_concept_type_accrate_3': np.dtype('float32'), 'uid_concept_type_accrate_4': np.dtype('float32'),
    'uid_concept_type_accrate_5': np.dtype('float32'),
    # Was int8, unlike its five siblings: the column holds a mean response rate
    # (or -1 when missing), so an integer dtype would truncate real values.
    'uid_concept_type_accrate_6': np.dtype('float32'),
    'uidconcept_min_accrate': np.dtype('float32'), 'uidconcept_mean_accrate': np.dtype('float32'),
    'uidconcept_max_accrate': np.dtype('float32'),
    'uidconcepttype_min_accrate': np.dtype('float32'), 'uidconcepttype_mean_accrate': np.dtype('float32'),
    'uidconcepttype_max_accrate': np.dtype('float32'),
    'uid_kc_accrate_1': np.dtype('float32'), 'uid_kc_accrate_2': np.dtype('float32'),
    'uid_kc_accrate_3': np.dtype('float32'), 'uid_kc_accrate_4': np.dtype('float32'),
    'uid_kc_accrate_5': np.dtype('float32'), 'uid_kc_accrate_6': np.dtype('float32'),
    'uid_kc_accrate_7': np.dtype('float32'), 'uid_kc_accrate_8': np.dtype('float32'),
    'uid_kc_type_accrate_1': np.dtype('float32'), 'uid_kc_type_accrate_2': np.dtype('float32'),
    'uid_kc_type_accrate_3': np.dtype('float32'), 'uid_kc_type_accrate_4': np.dtype('float32'),
    'uid_kc_type_accrate_5': np.dtype('float32'), 'uid_kc_type_accrate_6': np.dtype('float32'),
    'uid_kc_type_accrate_7': np.dtype('float32'), 'uid_kc_type_accrate_8': np.dtype('float32'),
    'uidkc_min_accrate': np.dtype('float32'), 'uidkc_mean_accrate': np.dtype('float32'),
    'uidkc_max_accrate': np.dtype('float32'),
    'uidkctype_min_accrate': np.dtype('float32'), 'uidkctype_mean_accrate': np.dtype('float32'),
    'uidkctype_max_accrate': np.dtype('float32'),
    'hours_to_half_split_timestamp': np.dtype('int32'),
    'records_to_half_split_uid_record_cumsum': np.dtype('int16'),
    # --- similar-question features ---
    'cnt_finishsimiliar': np.dtype('float32'), 'acctop5_finishsimiliar': np.dtype('float32'),
    'acctop1_finishsimiliar': np.dtype('float32'),
    'cnt_accsimiliar': np.dtype('float32'), 'acctop5_accsimiliar': np.dtype('float32'),
    'acctop1_accsimiliar': np.dtype('float32'),
    'cnt_finishstrictsimiliar': np.dtype('float32'), 'acctop5_finishstrictsimiliar': np.dtype('float32'),
    'acctop1_finishstrictsimiliar': np.dtype('float32'),
    'cnt_accstrictsimiliar': np.dtype('float32'), 'acctop5_accstrictsimiliar': np.dtype('float32'),
    'acctop1_accstrictsimiliar': np.dtype('float32'),
    'accrate_similiar': np.dtype('float32'), 'accrate_similiar_strict': np.dtype('float32'),
    'acctop5max_similiar': np.dtype('float32'), 'acctop1max_similiar': np.dtype('float32'),
    'acctop5max_similiar_strict': np.dtype('float32'), 'acctop1max_similiar_strict': np.dtype('float32'),
}

# Columns kept downstream: four concatenated lists -- two feature selections
# (which overlap), the id/label bookkeeping columns, and the similar-question
# features.
useful_features = ['uid_concept_utilnow_94608000_desc', 'time_hour_accrate', 'concepttype_mean_showcnt', 'time_weekday',
'uid_timestamp_showcnt', 'concept_min_accrate', 'uid_accrate', 'question_accrate', 'kc_showcnt_4',
'question_showcnt', 'uid_concept_utilnow_3600_asc', 'concept_max_accrate', 'kc_type_showcnt_1', 'kc_3',
'concept_type_accrate_1', 'uid_showcnt', 'time_day', 'question', 'timestamp_question_cnt', 'concept_mean_accrate',
'uid_concept_utilnow_21600_asc', 'uid_concept_accrate_4', 'kc_2', 'uidconcepttype_max_accrate', 'type_accrate',
'analysis_cnt', 'kc_max_accrate', 'concept_min_showcnt', 'uidkc_mean_accrate'] + \
    ['uid_accrate', 'question_accrate', 'timestamp_question_cnt', 'question', 'timestamp', 'time_day',
 'uid_new_question_days', 'uid_kc_utilnow_3600_asc', 'uid_timestamp_showcnt', 'uid_concept_utilnow_21600_asc',
 'analysis_cnt', 'new_concept_days_min', 'concepttype_min_accrate', 'concept_mean_accrate', 'question_showcnt',
 'time_hour_accrate', 'uidkctype_min_accrate', 'uid_concept_accrate_5'] + \
    ['uid','question','timestamp' ,'response','real_response','is_enhance'] + \
    ['cnt_finishsimiliar', 'acctop5_finishsimiliar',
       'acctop1_finishsimiliar', 'cnt_accsimiliar', 'acctop5_accsimiliar',
       'acctop1_accsimiliar', 'cnt_finishstrictsimiliar',
       'acctop5_finishstrictsimiliar', 'acctop1_finishstrictsimiliar',
       'cnt_accstrictsimiliar', 'acctop5_accstrictsimiliar',
       'acctop1_accstrictsimiliar', 'accrate_similiar',
       'accrate_similiar_strict', 'acctop5max_similiar',
       'acctop1max_similiar', 'acctop5max_similiar_strict',
       'acctop1max_similiar_strict']
# Drop duplicates but keep first-seen order. list(set(...)) produced an order
# that depends on per-process string hash randomisation, so the feature column
# order (and anything sensitive to it) changed between kernel restarts.
useful_features = list(dict.fromkeys(useful_features))

def feature_engineering2(df_tr, df_test, df_val=None, tr_data_enhance=1, val_data_enhance=1, seed=1, save_files=None):
    # 不同统计方式得到的特征，用于不同类型的模型
    columns = {
        'at':[], # 可以直接使用的特征
        'bs':[], # 统计这次提交以前的label
        'bt':[], # 统计今天以前的label
        'bh':[]  # 只使用一半的label进行统计
    }
    gc.collect()
    
    # 开始统计每个人的个人信息（数据增强）,append_columns:是否把列名加入columns字典
    def calc_uid_features(df, append_columns=False, data_type='tr'):
        
        df_response = df[df['response']!=-1]
        
        if data_type in ('tr','val'):
            col = 'uid'
            df = df.drop([col+'_showcnt',col+'_accrate'], axis=1)
            tmp = df_response.groupby(col)['response'].agg(['count','mean']).rename(columns={'count':col+'_showcnt', 'mean':col+'_accrate'})
            df=df.merge(tmp, how='left', left_on=col, right_index=True, suffixes=(None,'_1'))
            columns['bh'].extend([col+'_showcnt',col+'_accrate'])
            
                
        #""" 不work
        # concept : df_response 的累计正确率
        col = 'uid'
        df_tmp = pd.concat([
            df_response.loc[df_response['concept_1']!=-1,['uid','response','concept_1','type']].rename(columns={'concept_1':'concept'}),
            df_response.loc[df_response['concept_2']!=-1,['uid','response','concept_2','type']].rename(columns={'concept_2':'concept'}),
            df_response.loc[df_response['concept_3']!=-1,['uid','response','concept_3','type']].rename(columns={'concept_3':'concept'}),
            df_response.loc[df_response['concept_4']!=-1,['uid','response','concept_4','type']].rename(columns={'concept_4':'concept'}),
            df_response.loc[df_response['concept_5']!=-1,['uid','response','concept_5','type']].rename(columns={'concept_5':'concept'}),
            df_response.loc[df_response['concept_6']!=-1,['uid','response','concept_6','type']].rename(columns={'concept_6':'concept'}),
        ])
        col2 = 'concept'
        tmp = df_tmp.groupby([col, col2])['response'].agg('mean').rename(col+'_'+col2+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_4'], right_index=True, suffixes=(None,'_4')).\
                merge(tmp, how='left', left_on=[col,col2+'_5'], right_index=True, suffixes=(None,'_5')).\
                rename(columns={col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_4'}).fillna(-1)
        col3 = 'type'  # {'count':len,'mean':np.mean}
        tmp = df_tmp.groupby([col,col2,col3])['response'].agg('mean').rename(col+'_'+col2+'_'+col3+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_1',col3], right_index=True, suffixes=(None,'_1'))\
            .merge(tmp, how='left', left_on=[col,col2+'_2',col3], right_index=True, suffixes=(None,'_2'))\
            .merge(tmp, how='left', left_on=[col,col2+'_3',col3], right_index=True, suffixes=(None,'_3'))\
            .merge(tmp, how='left', left_on=[col,col2+'_4',col3], right_index=True, suffixes=(None,'_4'))\
            .merge(tmp, how='left', left_on=[col,col2+'_5',col3], right_index=True, suffixes=(None,'_5'))\
            .merge(tmp, how='left', left_on=[col,col2+'_6',col3], right_index=True, suffixes=(None,'_6'))\
            .rename(columns={col+'_'+col2+'_'+col3+'_accrate':col+'_'+col2+'_'+col3+'_accrate'+'_1'}).fillna(-1)
        for i in range(1,7):
            columns['bh'].extend([col+'_'+col2+'_'+col3+'_accrate_'+str(i),col+'_'+col2+'_accrate_'+str(i)])
        df[col+col2+col3+'_max_accrate'] = df[['uid_concept_type_accrate_1','uid_concept_type_accrate_2','uid_concept_type_accrate_3','uid_concept_type_accrate_4','uid_concept_type_accrate_5','uid_concept_type_accrate_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
        columns['at'].extend([col+col2+col3+'_max_accrate'])
        
    
        # kc : df_response 的累计正确率
        df_tmp = pd.concat([
            df_response.loc[df_response['kc_1']!=-1,['uid','response','kc_1','timestamp','type']].rename(columns={'kc_1':'kc'}),
            df_response.loc[df_response['kc_2']!=-1,['uid','response','kc_2','timestamp','type']].rename(columns={'kc_2':'kc'}),
            df_response.loc[df_response['kc_3']!=-1,['uid','response','kc_3','timestamp','type']].rename(columns={'kc_3':'kc'}),
            df_response.loc[df_response['kc_4']!=-1,['uid','response','kc_4','timestamp','type']].rename(columns={'kc_4':'kc'}),
            df_response.loc[df_response['kc_5']!=-1,['uid','response','kc_5','timestamp','type']].rename(columns={'kc_5':'kc'}),
            df_response.loc[df_response['kc_6']!=-1,['uid','response','kc_6','timestamp','type']].rename(columns={'kc_6':'kc'}),
            df_response.loc[df_response['kc_7']!=-1,['uid','response','kc_7','timestamp','type']].rename(columns={'kc_7':'kc'}),
            df_response.loc[df_response['kc_8']!=-1,['uid','response','kc_8','timestamp','type']].rename(columns={'kc_8':'kc'}),
        ])
        gc.collect()
        col2 = 'kc'
        tmp = df_tmp.groupby([col, col2])['response'].agg('mean').rename(col+'_'+col2+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_1'], right_index=True, suffixes=(None,'_1'))\
            .merge(tmp, how='left', left_on=[col,col2+'_2'], right_index=True, suffixes=(None,'_2'))\
            .merge(tmp, how='left', left_on=[col,col2+'_3'], right_index=True, suffixes=(None,'_3'))\
            .merge(tmp, how='left', left_on=[col,col2+'_4'], right_index=True, suffixes=(None,'_4'))\
            .merge(tmp, how='left', left_on=[col,col2+'_5'], right_index=True, suffixes=(None,'_5'))\
            .merge(tmp, how='left', left_on=[col,col2+'_6'], right_index=True, suffixes=(None,'_6'))\
            .merge(tmp, how='left', left_on=[col,col2+'_7'], right_index=True, suffixes=(None,'_7'))\
            .merge(tmp, how='left', left_on=[col,col2+'_8'], right_index=True, suffixes=(None,'_8'))\
            .rename(columns={col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_1'})
        for i in range(1,9):
            columns['bh'].extend([col+'_'+col2+'_'+col3+'_accrate_'+str(i), col+'_'+col2+'_accrate_'+str(i)])
        df[col+col2+'_mean_accrate'] = df[['uid_kc_accrate_1','uid_kc_accrate_2','uid_kc_accrate_3','uid_kc_accrate_4','uid_kc_accrate_5','uid_kc_accrate_6','uid_kc_accrate_7','uid_kc_accrate_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
        columns['at'].extend([col+col2+'_min_accrate',col+col2+'_mean_accrate',col+col2+'_max_accrate'])
        
        del df_tmp
        gc.collect()
        
        # 找到相似题目，映射到原特征中，计算相似题目做过/会做多少；top1、top3 accrate
        def tmp_func(df, timelimit_value=3600000, columns_need_set=None):
            if timelimit_value<0:
                idx_raw = df.index
                df = df.iloc[::-1].reset_index(drop=True)
            timelimit = str(timelimit_value)
            from collections import defaultdict
            global question_interact_ok,question_interact,question_interact_true_ok,question_interact_true
            global question_similiar_reverse,question_true_similiar_reverse,question_similiar_reverse_strict,question_true_similiar_reverse_strict
            global question_info_sim,question_info_sim_strict
            finished_questions = defaultdict(int)
            true_questions = defaultdict(int)
            start = -1
            res = {}
            # 注册要使用的列名
            res['accrate_similiar_strict'] = []
            for similiar_name in ('finishsimiliar','accsimiliar','finishstrictsimiliar','accstrictsimiliar'):
                res['cnt_'+similiar_name] = []
                res['acctop5_'+similiar_name] = []
                res['acctop1_'+similiar_name] = []
                for contentsim_name, contentsim in (('contentsim',question_info_sim),('contentsimstrict',question_info_sim_strict)):
                    res['cnt_'+similiar_name+'_'+contentsim_name] = []
                    res['acctop5_'+similiar_name+'_'+contentsim_name] = []
                    res['acctop1_'+similiar_name+'_'+contentsim_name] = []

            for idx, (question, response, timestamp) in df[['question', 'response', 'timestamp']].iterrows():
                # 超时的数据排除出去
                if start == -1:
                    start = 0 if timelimit_value>0 else len(df)-1
                else:
                    while (timelimit_value>0 and timestamp - df['timestamp'].iloc[start] > timelimit_value) or (timelimit_value<0 and df['timestamp'].iloc[start]-timestamp > -timelimit_value):
                        if df['response'].iloc[start] == 1:
                            true_questions[df['question'].iloc[start]] -= 1
                        finished_questions[df['question'].iloc[start]] -= 1
                        start += (1 if timelimit_value>0 else -1)

                cols_here = ['cnt_finishsimiliar','acctop5_finishsimiliar','acctop1_finishsimiliar','acctop1max_similiar','acctop5max_similiar_strict','accrate_similiar','acctop5max_similiar']
                if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                    question_similiar_list = sorted([acc for i,acc in question_similiar_reverse[question].items() for _ in range(finished_questions[i])], reverse=True)
                    res['cnt_finishsimiliar'].append(len(question_similiar_list))
                    res['acctop5_finishsimiliar'].append(sum(question_similiar_list[:5])/(len(question_similiar_list[:5])+0.000001))
                    res['acctop1_finishsimiliar'].append(sum(question_similiar_list[:1])/(len(question_similiar_list[:1])+0.000001))
                
                cols_here = ['cnt_accsimiliar','acctop5_accsimiliar','acctop1_accsimiliar','accrate_similiar','acctop5max_similiar','acctop1max_similiar','acctop5max_similiar_strict']
                if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                    question_true_similiar_list = sorted([acc for i,acc in question_true_similiar_reverse[question].items() for _ in range(true_questions[i])], reverse=True)
                    res['cnt_accsimiliar'].append(len(question_true_similiar_list))
                    res['acctop5_accsimiliar'].append(sum(question_true_similiar_list[:5])/(len(question_true_similiar_list[:5])+0.000001))
                    res['acctop1_accsimiliar'].append(sum(question_true_similiar_list[:1])/(len(question_true_similiar_list[:1])+0.000001))
                
                cols_here = ['cnt_finishstrictsimiliar','acctop5_finishstrictsimiliar','acctop1_finishstrictsimiliar',
                             'accrate_similiar_strict','acctop1max_similiar_strict']
                if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                    question_similiar_strict_list = sorted([acc for i,acc in question_similiar_reverse_strict[question].items() for _ in range(finished_questions[i])], reverse=True)
                    res['cnt_finishstrictsimiliar'].append(len(question_similiar_strict_list))
                    res['acctop5_finishstrictsimiliar'].append(sum(question_similiar_strict_list[:5])/(len(question_similiar_strict_list[:5])+0.000001))
                    res['acctop1_finishstrictsimiliar'].append(sum(question_similiar_strict_list[:1])/(len(question_similiar_strict_list[:1])+0.000001))
                
                cols_here = ['cnt_accstrictsimiliar','acctop5_accstrictsimiliar','acctop1_accstrictsimiliar',
                             'accrate_similiar_strict','acctop1max_similiar_strict']
                if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                    question_true_similiar_strict_list = sorted([acc for i,acc in question_true_similiar_reverse_strict[question].items() for _ in range(true_questions[i])], reverse=True)
                    res['cnt_accstrictsimiliar'].append(len(question_true_similiar_strict_list))
                    res['acctop5_accstrictsimiliar'].append(sum(question_true_similiar_strict_list[:5])/(len(question_true_similiar_strict_list[:5])+0.000001))
                    res['acctop1_accstrictsimiliar'].append(sum(question_true_similiar_strict_list[:1])/(len(question_true_similiar_strict_list[:1])+0.000001))

                for contentsim_name, contentsim in (('contentsim',question_info_sim),('contentsimstrict',question_info_sim_strict)):
                    cols_here = ['cnt_finishsimiliar'+'_'+contentsim_name,'acctop5_finishsimiliar'+'_'+contentsim_name,'acctop1_finishsimiliar'+'_'+contentsim_name,
                                 'acctop1max_similiar'+'_'+contentsim_name,'acctop5max_similiar_strict'+'_'+contentsim_name,'accrate_similiar'+'_'+contentsim_name,'acctop5max_similiar'+'_'+contentsim_name]
                    if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                        question_similiar_list = sorted([acc for i,acc in question_similiar_reverse[question].items() if i in contentsim[question] for _ in range(finished_questions[i])], reverse=True)
                        res['cnt_finishsimiliar'+'_'+contentsim_name].append(len(question_similiar_list))
                        res['acctop5_finishsimiliar'+'_'+contentsim_name].append(sum(question_similiar_list[:5])/(len(question_similiar_list[:5])+0.000001))
                        res['acctop1_finishsimiliar'+'_'+contentsim_name].append(sum(question_similiar_list[:1])/(len(question_similiar_list[:1])+0.000001))

                    cols_here = ['cnt_accsimiliar'+'_'+contentsim_name,'acctop5_accsimiliar'+'_'+contentsim_name,'acctop1_accsimiliar'+'_'+contentsim_name,
                                 'accrate_similiar'+'_'+contentsim_name,'acctop5max_similiar'+'_'+contentsim_name,'acctop1max_similiar'+'_'+contentsim_name,'acctop5max_similiar_strict'+'_'+contentsim_name]
                    if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                        question_true_similiar_list = sorted([acc for i,acc in question_true_similiar_reverse[question].items() if i in contentsim[question] for _ in range(true_questions[i])], reverse=True)
                        res['cnt_accsimiliar'+'_'+contentsim_name].append(len(question_true_similiar_list))
                        res['acctop5_accsimiliar'+'_'+contentsim_name].append(sum(question_true_similiar_list[:5])/(len(question_true_similiar_list[:5])+0.000001))
                        res['acctop1_accsimiliar'+'_'+contentsim_name].append(sum(question_true_similiar_list[:1])/(len(question_true_similiar_list[:1])+0.000001))

                    cols_here = ['cnt_finishstrictsimiliar'+'_'+contentsim_name,'acctop5_finishstrictsimiliar'+'_'+contentsim_name,'acctop1_finishstrictsimiliar'+'_'+contentsim_name,
                                 'accrate_similiar_strict'+'_'+contentsim_name,'acctop1max_similiar_strict'+'_'+contentsim_name]
                    if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                        question_similiar_strict_list = sorted([acc for i,acc in question_similiar_reverse_strict[question].items() if i in contentsim[question] for _ in range(finished_questions[i])], reverse=True)
                        res['cnt_finishstrictsimiliar'+'_'+contentsim_name].append(len(question_similiar_strict_list))
                        res['acctop5_finishstrictsimiliar'+'_'+contentsim_name].append(sum(question_similiar_strict_list[:5])/(len(question_similiar_strict_list[:5])+0.000001))
                        res['acctop1_finishstrictsimiliar'+'_'+contentsim_name].append(sum(question_similiar_strict_list[:1])/(len(question_similiar_strict_list[:1])+0.000001))

                    cols_here = ['cnt_accstrictsimiliar'+'_'+contentsim_name,'acctop5_accstrictsimiliar'+'_'+contentsim_name,'acctop1_accstrictsimiliar'+'_'+contentsim_name,
                                 'accrate_similiar_strict'+'_'+contentsim_name,'acctop1max_similiar_strict'+'_'+contentsim_name]
                    if columns_need_set is None or np.sum([1 if x in columns_need_set else 0 for x in cols_here])>0:
                        question_true_similiar_strict_list = sorted([acc for i,acc in question_true_similiar_reverse_strict[question].items() if i in contentsim[question] for _ in range(true_questions[i])], reverse=True)
                        res['cnt_accstrictsimiliar'+'_'+contentsim_name].append(len(question_true_similiar_strict_list))
                        res['acctop5_accstrictsimiliar'+'_'+contentsim_name].append(sum(question_true_similiar_strict_list[:5])/(len(question_true_similiar_strict_list[:5])+0.000001))
                        res['acctop1_accstrictsimiliar'+'_'+contentsim_name].append(sum(question_true_similiar_strict_list[:1])/(len(question_true_similiar_strict_list[:1])+0.000001))
                    
                if response == 1:
                    true_questions[question] += 1
                finished_questions[question] += 1

            for similiar_name in ('finishsimiliar','accsimiliar','finishstrictsimiliar','accstrictsimiliar'):
                if 'cnt_'+similiar_name in columns_need_set or \
                    ('accrate_similiar' in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                    ('accrate_similiar_strict' in columns_need_set and similiar_name in ('accstrictsimiliar','finishstrictsimiliar')): 
                    df['cnt_'+similiar_name+'_'+timelimit] = res['cnt_'+similiar_name]
                if 'acctop5_'+similiar_name in columns_need_set or \
                    ('acctop5max_similiar' in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')): 
                    df['acctop5_'+similiar_name+'_'+timelimit] = res['acctop5_'+similiar_name]
                if 'acctop1_'+similiar_name in columns_need_set or \
                    ('acctop1max_similiar' in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                    ('acctop5max_similiar_strict' in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                    ('acctop1max_similiar_strict' in columns_need_set and similiar_name in ('accstrictsimiliar','finishstrictsimiliar')): 
                    df['acctop1_'+similiar_name+'_'+timelimit] = res['acctop1_'+similiar_name]
            if 'accrate_similiar' in columns_need_set: df['accrate_similiar'+'_'+timelimit] = df['cnt_accsimiliar'+'_'+timelimit]/(df['cnt_finishsimiliar'+'_'+timelimit]+0.000001)    
            if 'accrate_similiar_strict' in columns_need_set: df['accrate_similiar_strict'+'_'+timelimit] = df['cnt_accstrictsimiliar'+'_'+timelimit]/(df['cnt_finishstrictsimiliar'+'_'+timelimit]+0.000001)
            if 'acctop5max_similiar' in columns_need_set: df['acctop5max_similiar'+'_'+timelimit] = df[['acctop5_finishsimiliar'+'_'+timelimit,'acctop5_accsimiliar'+'_'+timelimit]].max(axis=1)    
            if 'acctop1max_similiar' in columns_need_set: df['acctop1max_similiar'+'_'+timelimit] = df[['acctop1_finishsimiliar'+'_'+timelimit,'acctop1_accsimiliar'+'_'+timelimit]].max(axis=1)    
            if 'acctop5max_similiar_strict' in columns_need_set: df['acctop5max_similiar_strict'+'_'+timelimit] = df[['acctop1_finishsimiliar'+'_'+timelimit,'acctop1_accsimiliar'+'_'+timelimit]].max(axis=1)    
            if 'acctop1max_similiar_strict' in columns_need_set: df['acctop1max_similiar_strict'+'_'+timelimit] = df[['acctop1_finishstrictsimiliar'+'_'+timelimit,'acctop1_accstrictsimiliar'+'_'+timelimit]].max(axis=1)

            for contentsim_name, contentsim in (('contentsim',question_info_sim),('contentsimstrict',question_info_sim_strict)):
                for similiar_name in ('finishsimiliar','accsimiliar','finishstrictsimiliar','accstrictsimiliar'):
                    if 'cnt_'+similiar_name+'_'+contentsim_name in columns_need_set or \
                        ('accrate_similiar'+'_'+contentsim_name in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                        ('accrate_similiar_strict'+'_'+contentsim_name in columns_need_set and similiar_name in ('accstrictsimiliar','finishstrictsimiliar')): 
                        df['cnt_'+similiar_name+'_'+contentsim_name+'_'+timelimit] = res['cnt_'+similiar_name+'_'+contentsim_name]
                    if 'acctop5_'+similiar_name+'_'+contentsim_name in columns_need_set or \
                        ('acctop5max_similiar'+'_'+contentsim_name in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')): 
                        df['acctop5_'+similiar_name+'_'+contentsim_name+'_'+timelimit] = res['acctop5_'+similiar_name+'_'+contentsim_name]
                    if 'acctop1_'+similiar_name+'_'+contentsim_name in columns_need_set or \
                        ('acctop1max_similiar'+'_'+contentsim_name in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                        ('acctop5max_similiar_strict'+'_'+contentsim_name in columns_need_set and similiar_name in ('accsimiliar','finishsimiliar')) or \
                        ('acctop1max_similiar_strict'+'_'+contentsim_name in columns_need_set and similiar_name in ('accstrictsimiliar','finishstrictsimiliar')): 
                        df['acctop1_'+similiar_name+'_'+contentsim_name+'_'+timelimit] = res['acctop1_'+similiar_name+'_'+contentsim_name]
                if 'accrate_similiar'+'_'+contentsim_name in columns_need_set: df['accrate_similiar'+'_'+contentsim_name+'_'+timelimit] = df['cnt_accsimiliar'+'_'+contentsim_name+'_'+timelimit]/(df['cnt_finishsimiliar'+'_'+contentsim_name+'_'+timelimit]+0.000001)    
                if 'accrate_similiar_strict'+'_'+contentsim_name in columns_need_set: df['accrate_similiar_strict'+'_'+contentsim_name+'_'+timelimit] = df['cnt_accstrictsimiliar'+'_'+contentsim_name+'_'+timelimit]/(df['cnt_finishstrictsimiliar'+'_'+contentsim_name+'_'+timelimit]+0.000001)
                if 'acctop5max_similiar'+'_'+contentsim_name in columns_need_set: df['acctop5max_similiar'+'_'+contentsim_name+'_'+timelimit] = df[['acctop5_finishsimiliar'+'_'+contentsim_name+'_'+timelimit,'acctop5_accsimiliar'+'_'+contentsim_name+'_'+timelimit]].max(axis=1)    
                if 'acctop1max_similiar'+'_'+contentsim_name in columns_need_set: df['acctop1max_similiar'+'_'+contentsim_name+'_'+timelimit] = df[['acctop1_finishsimiliar'+'_'+contentsim_name+'_'+timelimit,'acctop1_accsimiliar'+'_'+contentsim_name+'_'+timelimit]].max(axis=1)    
                if 'acctop5max_similiar_strict'+'_'+contentsim_name in columns_need_set: df['acctop5max_similiar_strict'+'_'+contentsim_name+'_'+timelimit] = df[['acctop1_finishsimiliar'+'_'+contentsim_name+'_'+timelimit,'acctop1_accsimiliar'+'_'+contentsim_name+'_'+timelimit]].max(axis=1)    
                if 'acctop1max_similiar_strict'+'_'+contentsim_name in columns_need_set: df['acctop1max_similiar_strict'+'_'+contentsim_name+'_'+timelimit] = df[['acctop1_finishstrictsimiliar'+'_'+contentsim_name+'_'+timelimit,'acctop1_accstrictsimiliar'+'_'+contentsim_name+'_'+timelimit]].max(axis=1)
            if timelimit_value<0:
                df = df.iloc[::-1].set_index(idx_raw)
            for contentsim_name in ('_contentsim','_contentsimstrict',''):
                for similiar_name in ('finishsimiliar','accsimiliar','finishstrictsimiliar','accstrictsimiliar'):
                    if 'cnt_'+similiar_name+'_'+contentsim_name in columns_need_set: df['cnt_'+similiar_name+contentsim_name+'_'+timelimit] = df['cnt_'+similiar_name+contentsim_name+'_'+timelimit].astype('float32')
                    if 'acctop5_'+similiar_name+'_'+contentsim_name in columns_need_set: df['acctop5_'+similiar_name+contentsim_name+'_'+timelimit] = df['acctop5_'+similiar_name+contentsim_name+'_'+timelimit].astype('float32')
                    if 'acctop1_'+similiar_name+'_'+contentsim_name in columns_need_set: df['acctop1_'+similiar_name+contentsim_name+'_'+timelimit] = df['acctop1_'+similiar_name+contentsim_name+'_'+timelimit].astype('float32')
                if 'accrate_similiar'+'_'+contentsim_name in columns_need_set: df['accrate_similiar'+contentsim_name+'_'+timelimit] = df['accrate_similiar'+contentsim_name+'_'+timelimit].astype('float32')
                if 'accrate_similiar_strict'+'_'+contentsim_name in columns_need_set: df['accrate_similiar_strict'+contentsim_name+'_'+timelimit] = df['accrate_similiar_strict'+contentsim_name+'_'+timelimit].astype('float32')
                if 'acctop5max_similiar'+'_'+contentsim_name in columns_need_set: df['acctop5max_similiar'+contentsim_name+'_'+timelimit] = df['acctop5max_similiar'+contentsim_name+'_'+timelimit].astype('float32')
                if 'acctop1max_similiar'+'_'+contentsim_name in columns_need_set: df['acctop1max_similiar'+contentsim_name+'_'+timelimit] = df['acctop1max_similiar'+contentsim_name+'_'+timelimit].astype('float32')
                if 'acctop5max_similiar_strict'+'_'+contentsim_name in columns_need_set: df['acctop5max_similiar_strict'+contentsim_name+'_'+timelimit] = df['acctop5max_similiar_strict'+contentsim_name+'_'+timelimit].astype('float32')
                if 'acctop1max_similiar_strict'+'_'+contentsim_name in columns_need_set: df['acctop1max_similiar_strict'+contentsim_name+'_'+timelimit] = df['acctop1max_similiar_strict'+contentsim_name+'_'+timelimit].astype('float32')
                    
            return df

        
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '31536000000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=365*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '1209600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=14*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '86400000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '3600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-1209600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=-14*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-21600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=-6*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        
        # 训练、验证集：只需要保存需要预测的就行了
        if data_type in ['tr','val']:
        #if data_type in ['val']:
            df = df[df['response']==-1]

        df.fillna(-1, inplace=True)
        
        # 压缩数据，压缩为指定格式
        # df = df[useful_features]
        df=df[list(set(kf_or+['uid','timestamp','question','real_response','response','is_enhance','uid_record_cumsum','uid_record_sum']))]
        for col,dtype in df.dtypes.items():
            if col in useful_features_dtypes:
                df[col] = df[col].astype(useful_features_dtypes[col])
            elif dtype == 'int64':
                if col not in ['timestamp','is_enhance']:
                    if df[col].max()<=32767 and df[col].min()>=-32768:
                        df[col] = df[col].astype('int16')
                    else:
                        df[col] = df[col].astype('int32')
            elif dtype == 'float64':
                df[col] = df[col].astype('float32')
        
        return df
    
    df_tr['real_response'] = df_tr['response']
    df_tr['is_enhance'] = 0
    df_test['real_response'] = df_test['response']
    df_test['is_enhance'] = 0
    
    # 计算sim矩阵
    df_tr_tmp = df_tr.copy()
    df_tr_tmp.loc[(df_tr_tmp['uid_record_cumsum']>=df_tr_tmp['uid_record_sum']*0.5)&(df_tr_tmp['uid_record_cumsum']>=90), 'response'] = -1
    df_val_tmp = df_val.copy()
    df_val_tmp.loc[(df_val_tmp['uid_record_cumsum']>=df_val_tmp['uid_record_sum']*0.5)&(df_val_tmp['uid_record_cumsum']>=90), 'response'] = -1
    get_sim_metrix(df_tr_tmp, df_test, df_val=df_val_tmp)
    del df_tr_tmp,df_val_tmp
    gc.collect()
    
    df_test = calc_uid_features(df_test, append_columns=True, data_type='test')
    if save_files is not None:
        df_test.reset_index(drop=True).to_feather(save_files[2])
    gc.collect()

    data_type, data_enhance = ('val',val_data_enhance)
    if df_val is not None and data_enhance>0:
        df_val['real_response'] = df_val['response']
        df_val['is_enhance'] = 0
        dfs = []
        rng = np.random.default_rng(seed)
        for _ in range(0, data_enhance):
            print('val enhance', _)
            # 只选取范围内（尝试截断，比如k不能大于3/4）
            df_tmp = df_val.copy()
            k = rng.random()*0.5+0.25 if _ != 0 else 0.5
            df_tmp.loc[(df_tmp['uid_record_cumsum']>=df_tmp['uid_record_sum']*k)&(df_tmp['uid_record_cumsum']>=90), 'response'] = -1
            # 超过k*2范围的数据就不要了
            tmp = calc_uid_features(df_tmp[(df_tmp['uid_record_cumsum']<=df_tmp['uid_record_sum']*k*2)], data_type=data_type)
            tmp['is_enhance'] = _
            dfs.append(tmp)
            gc.collect()
        df_val = pd.concat(dfs, axis=0)
        if save_files is not None:
            df_val.reset_index(drop=True).to_feather(save_files[1])
        gc.collect()
    
    data_type, data_enhance = ('tr',tr_data_enhance)
    if data_enhance>0:
        dfs = []
        rng = np.random.default_rng(seed)
        for _ in range(0, data_enhance):
            print('tr enhance', _)
            # 只选取范围内（尝试截断，比如k不能大于3/4）
            df_tmp = df_tr.copy()
            k = rng.random()*0.5+0.25 if _ != 0 else 0.5
            df_tmp.loc[(df_tmp['uid_record_cumsum']>=df_tmp['uid_record_sum']*k)&(df_tmp['uid_record_cumsum']>=90), 'response'] = -1
            # 更新sim矩阵
            get_sim_metrix(df_tmp, df_test, df_val=df_val)
            # 超过k*2范围的数据就不要了
            tmp = calc_uid_features(df_tmp[(df_tmp['uid_record_cumsum']<=df_tmp['uid_record_sum']*k*2)], data_type=data_type)
            tmp = tmp.copy()
            tmp.reset_index(drop=True, inplace=True)
            tmp['is_enhance'] = _
            dfs.append(tmp)
            gc.collect()
        print('merging df_tr')
        df_tr = pd.concat(dfs, axis=0)
        del dfs
        if save_files is not None:
            print('resetindex df_tr')
            df_tr.reset_index(drop=True, inplace=True)
            print('tofeather df_tr')
            df_tr.to_feather(save_files[0])
        gc.collect()
    gc.collect()
    
    if save_files is not None:
        return 
    if df_val is not None:
        return df_tr, df_val, df_test, columns
    else:
        return df_tr, df_test, columns

"""
df_tmp,df_tmp2, df_tmp3, columns_tmp = feature_engineering2(df.iloc[:3000].copy(), df_test.iloc[:3000].copy(), df_val=df.iloc[3000:6000].copy(), 
                                         tr_data_enhance=2, val_data_enhance=1)
print(columns_tmp)
df_tmp
"""

'\ndf_tmp,df_tmp2, df_tmp3, columns_tmp = feature_engineering2(df.iloc[:3000].copy(), df_test.iloc[:3000].copy(), df_val=df.iloc[3000:6000].copy(), \n                                         tr_data_enhance=2, val_data_enhance=1)\nprint(columns_tmp)\ndf_tmp\n'

In [8]:
# Also feed the information already available in the test set into the pipeline
# (feature engineering and training set).

# Target storage dtype per column name. The dtype-compression loop of the
# feature-engineering code casts every column whose name appears here with
# `df[col].astype(useful_features_dtypes[col])`, so an entry that is too narrow
# silently truncates or wraps the data.
useful_features_dtypes = {
    # --- raw record / question attributes ---
    'uid': np.dtype('int16'), 'question': np.dtype('int16'), 'response': np.dtype('int8'),
    'timestamp': np.dtype('int64'), 'type': np.dtype('int8'),
    'concept_cnt': np.dtype('int8'), 'concept_hot_cnt': np.dtype('int8'),
    'concept_1': np.dtype('int16'), 'concept_2': np.dtype('int16'), 'concept_3': np.dtype('int16'),
    'concept_4': np.dtype('int16'), 'concept_5': np.dtype('int16'), 'concept_6': np.dtype('int16'),
    'content_cnt': np.dtype('int16'), 'kc_group_cnt': np.dtype('int8'), 'kc_cnt': np.dtype('int8'),
    'kc_1': np.dtype('int16'), 'kc_2': np.dtype('int16'), 'kc_3': np.dtype('int16'),
    'kc_4': np.dtype('int16'), 'kc_5': np.dtype('int16'), 'kc_6': np.dtype('int16'),
    'kc_7': np.dtype('int16'),
    # NOTE(review): kc_8 is int8 while kc_1..kc_7 are int16 -- confirm that ids
    # stored in slot 8 never exceed 127, otherwise astype wraps them.
    'kc_8': np.dtype('int8'),
    'analysis_cnt': np.dtype('int16'),
    'time_day': np.dtype('int16'), 'time_hour': np.dtype('int8'), 'time_is_workday': np.dtype('int8'),
    'time_weekday': np.dtype('int8'), 'time_year': np.dtype('int16'),
    'data_type': np.dtype('O'),
    'uid_record_cumsum': np.dtype('int16'), 'uid_record_sum': np.dtype('int16'),

    # --- per-concept statistics ---
    'concept_showcnt_1': np.dtype('int32'), 'concept_accrate_1': np.dtype('float32'),
    'concept_showcnt_2': np.dtype('int32'), 'concept_accrate_2': np.dtype('float32'),
    'concept_showcnt_3': np.dtype('int32'), 'concept_accrate_3': np.dtype('float32'),
    'concept_showcnt_4': np.dtype('int16'), 'concept_accrate_4': np.dtype('float32'),
    'concept_showcnt_5': np.dtype('int16'), 'concept_accrate_5': np.dtype('float32'),
    'concept_showcnt_6': np.dtype('int16'), 'concept_accrate_6': np.dtype('float32'),
    'concept_type_showcnt_1': np.dtype('int32'), 'concept_type_accrate_1': np.dtype('float32'),
    'concept_type_showcnt_2': np.dtype('int32'), 'concept_type_accrate_2': np.dtype('float32'),
    'concept_type_showcnt_3': np.dtype('int16'), 'concept_type_accrate_3': np.dtype('float32'),
    'concept_type_showcnt_4': np.dtype('int16'), 'concept_type_accrate_4': np.dtype('float32'),
    'concept_type_showcnt_5': np.dtype('int16'), 'concept_type_accrate_5': np.dtype('float32'),
    'concept_type_showcnt_6': np.dtype('int16'), 'concept_type_accrate_6': np.dtype('float32'),
    'concept_min_accrate': np.dtype('float32'), 'concept_mean_accrate': np.dtype('float32'),
    'concept_max_accrate': np.dtype('float32'),
    'concept_min_showcnt': np.dtype('int32'), 'concept_mean_showcnt': np.dtype('float32'),
    'concept_max_showcnt': np.dtype('int32'),
    'concepttype_min_accrate': np.dtype('float32'), 'concepttype_mean_accrate': np.dtype('float32'),
    'concepttype_max_accrate': np.dtype('float32'),
    'concepttype_min_showcnt': np.dtype('int32'), 'concepttype_mean_showcnt': np.dtype('float32'),
    'concepttype_max_showcnt': np.dtype('int32'),

    # --- per-kc statistics ---
    'kc_showcnt_1': np.dtype('int32'), 'kc_accrate_1': np.dtype('float32'),
    'kc_showcnt_2': np.dtype('int32'), 'kc_accrate_2': np.dtype('float32'),
    'kc_showcnt_3': np.dtype('int32'), 'kc_accrate_3': np.dtype('float32'),
    'kc_showcnt_4': np.dtype('int32'), 'kc_accrate_4': np.dtype('float32'),
    'kc_showcnt_5': np.dtype('int32'), 'kc_accrate_5': np.dtype('float32'),
    'kc_showcnt_6': np.dtype('int32'), 'kc_accrate_6': np.dtype('float32'),
    'kc_showcnt_7': np.dtype('int32'), 'kc_accrate_7': np.dtype('float32'),
    'kc_showcnt_8': np.dtype('int32'), 'kc_accrate_8': np.dtype('float32'),
    'kc_type_showcnt_1': np.dtype('int32'), 'kc_type_accrate_1': np.dtype('float32'),
    'kc_type_showcnt_2': np.dtype('int32'), 'kc_type_accrate_2': np.dtype('float32'),
    'kc_type_showcnt_3': np.dtype('int32'), 'kc_type_accrate_3': np.dtype('float32'),
    'kc_type_showcnt_4': np.dtype('int32'), 'kc_type_accrate_4': np.dtype('float32'),
    'kc_type_showcnt_5': np.dtype('int32'), 'kc_type_accrate_5': np.dtype('float32'),
    'kc_type_showcnt_6': np.dtype('int32'), 'kc_type_accrate_6': np.dtype('float32'),
    'kc_type_showcnt_7': np.dtype('int32'), 'kc_type_accrate_7': np.dtype('float32'),
    'kc_type_showcnt_8': np.dtype('int32'), 'kc_type_accrate_8': np.dtype('float32'),
    'kc_min_accrate': np.dtype('float32'), 'kc_mean_accrate': np.dtype('float32'),
    'kc_max_accrate': np.dtype('float32'),
    'kc_min_showcnt': np.dtype('int32'), 'kc_mean_showcnt': np.dtype('float32'),
    'kc_max_showcnt': np.dtype('int32'),
    'kctype_min_accrate': np.dtype('float32'), 'kctype_mean_accrate': np.dtype('float32'),
    'kctype_max_accrate': np.dtype('float32'),
    'kctype_min_showcnt': np.dtype('int32'), 'kctype_mean_showcnt': np.dtype('float32'),
    'kctype_max_showcnt': np.dtype('int32'),

    # --- question / type / time statistics ---
    'question_showcnt': np.dtype('int16'), 'question_accrate': np.dtype('float32'),
    'type_showcnt': np.dtype('int32'), 'type_accrate': np.dtype('float32'),
    'time_hour_showcnt': np.dtype('int32'), 'time_hour_accrate': np.dtype('float32'),
    'time_is_workday_showcnt': np.dtype('int32'), 'time_is_workday_accrate': np.dtype('float32'),
    'time_weekday_showcnt': np.dtype('int32'), 'time_weekday_accrate': np.dtype('float32'),
    'timestamp_question_cnt': np.dtype('int16'), 'timestamp_question_accrate': np.dtype('float32'),
    'question_showcnt_without_timestamp_question': np.dtype('int16'),
    'question_accrate_without_timestamp_question': np.dtype('float32'),

    # --- per-uid windowed counters ---
    'uid_question_utilnow_3600_asc': np.dtype('int8'),
    'uid_question_submittimes_3600_asc': np.dtype('int16'),
    'uid_unique_submittimes_3600_asc': np.dtype('int8'),
    'uid_timedistance_to_last_submit_asc': np.dtype('int32'),
    'uid_timestamp_question_submit': np.dtype('int8'),
    'uid_question_utilnow_21600_asc': np.dtype('int8'),
    'uid_question_submittimes_21600_asc': np.dtype('int16'),
    'uid_unique_submittimes_21600_asc': np.dtype('int8'),
    'uid_question_utilnow_94608000_asc': np.dtype('int8'),
    'uid_question_submittimes_94608000_asc': np.dtype('int16'),
    'uid_unique_submittimes_94608000_asc': np.dtype('int16'),
    'uid_question_utilnow_94608000_desc': np.dtype('int8'),
    'uid_question_submittimes_94608000_desc': np.dtype('int16'),
    'uid_unique_submittimes_94608000_desc': np.dtype('int16'),
    'uid_timedistance_to_last_submit_desc': np.dtype('int32'),
    'uid_kc_utilnow_3600_asc': np.dtype('int16'), 'uid_kc_utilnow_21600_asc': np.dtype('int16'),
    'uid_kc_utilnow_94608000_asc': np.dtype('int16'), 'uid_kc_utilnow_94608000_desc': np.dtype('int16'),
    'new_kc_days_min': np.dtype('float32'), 'uid_new_kc_days_min': np.dtype('float32'),
    'uid_concept_utilnow_3600_asc': np.dtype('int8'), 'uid_concept_utilnow_21600_asc': np.dtype('int8'),
    'uid_concept_utilnow_94608000_asc': np.dtype('int8'), 'uid_concept_utilnow_94608000_desc': np.dtype('int8'),
    'new_concept_days_min': np.dtype('float32'), 'uid_new_concept_days_min': np.dtype('float32'),
    'uid_timestamp_question_meansubmit': np.dtype('float32'),
    'uid_timestamp_showcnt': np.dtype('int8'),
    'new_question_days': np.dtype('float32'), 'uid_new_question_days': np.dtype('float32'),
    'real_response': np.dtype('int8'), 'is_enhance': np.dtype('int8'),

    # --- per-uid accuracy statistics ---
    'uid_showcnt': np.dtype('int16'), 'uid_accrate': np.dtype('float32'),
    'uid_question_accrate': np.dtype('float32'),
    'uid_concept_accrate_1': np.dtype('float32'), 'uid_concept_accrate_2': np.dtype('float32'),
    'uid_concept_accrate_3': np.dtype('float32'), 'uid_concept_accrate_4': np.dtype('float32'),
    'uid_concept_accrate_5': np.dtype('float32'), 'uid_concept_accrate_6': np.dtype('float32'),
    'uid_concept_type_accrate_1': np.dtype('float32'), 'uid_concept_type_accrate_2': np.dtype('float32'),
    'uid_concept_type_accrate_3': np.dtype('float32'), 'uid_concept_type_accrate_4': np.dtype('float32'),
    'uid_concept_type_accrate_5': np.dtype('float32'),
    # float32, not int8: this column is a mean of `response` like its five
    # siblings, so an integer cast would truncate every fractional rate.
    'uid_concept_type_accrate_6': np.dtype('float32'),
    'uidconcept_min_accrate': np.dtype('float32'), 'uidconcept_mean_accrate': np.dtype('float32'),
    'uidconcept_max_accrate': np.dtype('float32'),
    'uidconcepttype_min_accrate': np.dtype('float32'), 'uidconcepttype_mean_accrate': np.dtype('float32'),
    'uidconcepttype_max_accrate': np.dtype('float32'),
    'uid_kc_accrate_1': np.dtype('float32'), 'uid_kc_accrate_2': np.dtype('float32'),
    'uid_kc_accrate_3': np.dtype('float32'), 'uid_kc_accrate_4': np.dtype('float32'),
    'uid_kc_accrate_5': np.dtype('float32'), 'uid_kc_accrate_6': np.dtype('float32'),
    'uid_kc_accrate_7': np.dtype('float32'), 'uid_kc_accrate_8': np.dtype('float32'),
    'uid_kc_type_accrate_1': np.dtype('float32'), 'uid_kc_type_accrate_2': np.dtype('float32'),
    'uid_kc_type_accrate_3': np.dtype('float32'), 'uid_kc_type_accrate_4': np.dtype('float32'),
    'uid_kc_type_accrate_5': np.dtype('float32'), 'uid_kc_type_accrate_6': np.dtype('float32'),
    'uid_kc_type_accrate_7': np.dtype('float32'), 'uid_kc_type_accrate_8': np.dtype('float32'),
    'uidkc_min_accrate': np.dtype('float32'), 'uidkc_mean_accrate': np.dtype('float32'),
    'uidkc_max_accrate': np.dtype('float32'),
    'uidkctype_min_accrate': np.dtype('float32'), 'uidkctype_mean_accrate': np.dtype('float32'),
    'uidkctype_max_accrate': np.dtype('float32'),
    'hours_to_half_split_timestamp': np.dtype('int32'),
    'records_to_half_split_uid_record_cumsum': np.dtype('int16'),

    # --- similar-question features ---
    'cnt_finishsimiliar': np.dtype('float32'), 'acctop5_finishsimiliar': np.dtype('float32'),
    'acctop1_finishsimiliar': np.dtype('float32'),
    'cnt_accsimiliar': np.dtype('float32'), 'acctop5_accsimiliar': np.dtype('float32'),
    'acctop1_accsimiliar': np.dtype('float32'),
    'cnt_finishstrictsimiliar': np.dtype('float32'), 'acctop5_finishstrictsimiliar': np.dtype('float32'),
    'acctop1_finishstrictsimiliar': np.dtype('float32'),
    'cnt_accstrictsimiliar': np.dtype('float32'), 'acctop5_accstrictsimiliar': np.dtype('float32'),
    'acctop1_accstrictsimiliar': np.dtype('float32'),
    'accrate_similiar': np.dtype('float32'), 'accrate_similiar_strict': np.dtype('float32'),
    'acctop5max_similiar': np.dtype('float32'), 'acctop1max_similiar': np.dtype('float32'),
    'acctop5max_similiar_strict': np.dtype('float32'), 'acctop1max_similiar_strict': np.dtype('float32'),
}

# Column names to keep, concatenated from four hand-maintained lists.
# The lists overlap (e.g. 'question', 'uid_accrate' appear more than once),
# so duplicates are removed right after the concatenation.
useful_features = ['uid_concept_utilnow_94608000_desc', 'time_hour_accrate', 'concepttype_mean_showcnt', 'time_weekday', 
'uid_timestamp_showcnt', 'concept_min_accrate', 'uid_accrate', 'question_accrate', 'kc_showcnt_4', 
'question_showcnt', 'uid_concept_utilnow_3600_asc', 'concept_max_accrate', 'kc_type_showcnt_1', 'kc_3', 
'concept_type_accrate_1', 'uid_showcnt', 'time_day', 'question', 'timestamp_question_cnt', 'concept_mean_accrate', 
'uid_concept_utilnow_21600_asc', 'uid_concept_accrate_4', 'kc_2', 'uidconcepttype_max_accrate', 'type_accrate', 
'analysis_cnt', 'kc_max_accrate', 'concept_min_showcnt', 'uidkc_mean_accrate'] + \
    ['uid_accrate', 'question_accrate', 'timestamp_question_cnt', 'question', 'timestamp', 'time_day', 
 'uid_new_question_days', 'uid_kc_utilnow_3600_asc', 'uid_timestamp_showcnt', 'uid_concept_utilnow_21600_asc', 
 'analysis_cnt', 'new_concept_days_min', 'concepttype_min_accrate', 'concept_mean_accrate', 'question_showcnt', 
 'time_hour_accrate', 'uidkctype_min_accrate', 'uid_concept_accrate_5'] + \
    ['uid','question','timestamp' ,'response','real_response','is_enhance'] + \
    ['cnt_finishsimiliar', 'acctop5_finishsimiliar',
       'acctop1_finishsimiliar', 'cnt_accsimiliar', 'acctop5_accsimiliar',
       'acctop1_accsimiliar', 'cnt_finishstrictsimiliar',
       'acctop5_finishstrictsimiliar', 'acctop1_finishstrictsimiliar',
       'cnt_accstrictsimiliar', 'acctop5_accstrictsimiliar',
       'acctop1_accstrictsimiliar', 'accrate_similiar',
       'accrate_similiar_strict', 'acctop5max_similiar',
       'acctop1max_similiar', 'acctop5max_similiar_strict',
       'acctop1max_similiar_strict']
# De-duplicate while keeping first-occurrence order. `list(set(...))` would
# yield the same elements, but in an order that depends on per-process string
# hash randomization, so the column order would differ between kernel restarts.
useful_features = list(dict.fromkeys(useful_features))

def feature_engineering3(df_tr, df_test, df_val=None, tr_data_enhance=1, val_data_enhance=1, seed=1, save_files=None):
    # 不同统计方式得到的特征，用于不同类型的模型
    columns = {
        'at':[], # 可以直接使用的特征
        'bs':[], # 统计这次提交以前的label
        'bt':[], # 统计今天以前的label
        'bh':[]  # 只使用一半的label进行统计
    }
    gc.collect()
    
    # 开始统计每个人的个人信息（数据增强）,append_columns:是否把列名加入columns字典
    def calc_uid_features(df, append_columns=False, data_type='tr'):
        
        df_response = df[df['response']!=-1]
        
        if data_type in ('tr','val'):
            col = 'uid'
            df = df.drop([col+'_showcnt',col+'_accrate'], axis=1)
            tmp = df_response.groupby(col)['response'].agg(['count','mean']).rename(columns={'count':col+'_showcnt', 'mean':col+'_accrate'})
            df=df.merge(tmp, how='left', left_on=col, right_index=True, suffixes=(None,'_1'))
            columns['bh'].extend([col+'_showcnt',col+'_accrate'])
            
                
        #""" 不work
        # concept : df_response 的累计正确率
        col = 'uid'
        df_tmp = pd.concat([
            df_response.loc[df_response['concept_1']!=-1,['uid','response','concept_1','type']].rename(columns={'concept_1':'concept'}),
            df_response.loc[df_response['concept_2']!=-1,['uid','response','concept_2','type']].rename(columns={'concept_2':'concept'}),
            df_response.loc[df_response['concept_3']!=-1,['uid','response','concept_3','type']].rename(columns={'concept_3':'concept'}),
            df_response.loc[df_response['concept_4']!=-1,['uid','response','concept_4','type']].rename(columns={'concept_4':'concept'}),
            df_response.loc[df_response['concept_5']!=-1,['uid','response','concept_5','type']].rename(columns={'concept_5':'concept'}),
            df_response.loc[df_response['concept_6']!=-1,['uid','response','concept_6','type']].rename(columns={'concept_6':'concept'}),
        ])
        col2 = 'concept'
        tmp = df_tmp.groupby([col, col2])['response'].agg('mean').rename(col+'_'+col2+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_4'], right_index=True, suffixes=(None,'_4')).\
                merge(tmp, how='left', left_on=[col,col2+'_5'], right_index=True, suffixes=(None,'_5')).\
                rename(columns={col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_4'}).fillna(-1)
        col3 = 'type'  # {'count':len,'mean':np.mean}
        tmp = df_tmp.groupby([col,col2,col3])['response'].agg('mean').rename(col+'_'+col2+'_'+col3+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_1',col3], right_index=True, suffixes=(None,'_1'))\
            .merge(tmp, how='left', left_on=[col,col2+'_2',col3], right_index=True, suffixes=(None,'_2'))\
            .merge(tmp, how='left', left_on=[col,col2+'_3',col3], right_index=True, suffixes=(None,'_3'))\
            .merge(tmp, how='left', left_on=[col,col2+'_4',col3], right_index=True, suffixes=(None,'_4'))\
            .merge(tmp, how='left', left_on=[col,col2+'_5',col3], right_index=True, suffixes=(None,'_5'))\
            .merge(tmp, how='left', left_on=[col,col2+'_6',col3], right_index=True, suffixes=(None,'_6'))\
            .rename(columns={col+'_'+col2+'_'+col3+'_accrate':col+'_'+col2+'_'+col3+'_accrate'+'_1'}).fillna(-1)
        for i in range(1,7):
            columns['bh'].extend([col+'_'+col2+'_'+col3+'_accrate_'+str(i),col+'_'+col2+'_accrate_'+str(i)])
        df[col+col2+col3+'_max_accrate'] = df[['uid_concept_type_accrate_1','uid_concept_type_accrate_2','uid_concept_type_accrate_3','uid_concept_type_accrate_4','uid_concept_type_accrate_5','uid_concept_type_accrate_6']].parallel_apply(lambda x: max([xx if xx!=-1 else -9 for xx in x]), axis=1)
        columns['at'].extend([col+col2+col3+'_max_accrate'])
        
    
        # kc : df_response 的累计正确率
        df_tmp = pd.concat([
            df_response.loc[df_response['kc_1']!=-1,['uid','response','kc_1','timestamp','type']].rename(columns={'kc_1':'kc'}),
            df_response.loc[df_response['kc_2']!=-1,['uid','response','kc_2','timestamp','type']].rename(columns={'kc_2':'kc'}),
            df_response.loc[df_response['kc_3']!=-1,['uid','response','kc_3','timestamp','type']].rename(columns={'kc_3':'kc'}),
            df_response.loc[df_response['kc_4']!=-1,['uid','response','kc_4','timestamp','type']].rename(columns={'kc_4':'kc'}),
            df_response.loc[df_response['kc_5']!=-1,['uid','response','kc_5','timestamp','type']].rename(columns={'kc_5':'kc'}),
            df_response.loc[df_response['kc_6']!=-1,['uid','response','kc_6','timestamp','type']].rename(columns={'kc_6':'kc'}),
            df_response.loc[df_response['kc_7']!=-1,['uid','response','kc_7','timestamp','type']].rename(columns={'kc_7':'kc'}),
            df_response.loc[df_response['kc_8']!=-1,['uid','response','kc_8','timestamp','type']].rename(columns={'kc_8':'kc'}),
        ])
        gc.collect()
        col2 = 'kc'
        tmp = df_tmp.groupby([col, col2])['response'].agg('mean').rename(col+'_'+col2+'_accrate')
        df=df.merge(tmp, how='left', left_on=[col,col2+'_1'], right_index=True, suffixes=(None,'_1'))\
            .merge(tmp, how='left', left_on=[col,col2+'_2'], right_index=True, suffixes=(None,'_2'))\
            .merge(tmp, how='left', left_on=[col,col2+'_3'], right_index=True, suffixes=(None,'_3'))\
            .merge(tmp, how='left', left_on=[col,col2+'_4'], right_index=True, suffixes=(None,'_4'))\
            .merge(tmp, how='left', left_on=[col,col2+'_5'], right_index=True, suffixes=(None,'_5'))\
            .merge(tmp, how='left', left_on=[col,col2+'_6'], right_index=True, suffixes=(None,'_6'))\
            .merge(tmp, how='left', left_on=[col,col2+'_7'], right_index=True, suffixes=(None,'_7'))\
            .merge(tmp, how='left', left_on=[col,col2+'_8'], right_index=True, suffixes=(None,'_8'))\
            .rename(columns={col+'_'+col2+'_accrate':col+'_'+col2+'_accrate'+'_1'})
        for i in range(1,9):
            columns['bh'].extend([col+'_'+col2+'_'+col3+'_accrate_'+str(i), col+'_'+col2+'_accrate_'+str(i)])
        df[col+col2+'_mean_accrate'] = df[['uid_kc_accrate_1','uid_kc_accrate_2','uid_kc_accrate_3','uid_kc_accrate_4','uid_kc_accrate_5','uid_kc_accrate_6','uid_kc_accrate_7','uid_kc_accrate_8']].parallel_apply(lambda x: sum([xx for xx in x if xx!=-1])/(len([xx for xx in x if xx!=-1])+0.0001), axis=1)
        columns['at'].extend([col+col2+'_min_accrate',col+col2+'_mean_accrate',col+col2+'_max_accrate'])
        
        del df_tmp
        gc.collect()
        
        # 找到相似题目，映射到原特征中，计算相似题目做过/会做多少；top1、top3 accrate
        def tmp_func(df, timelimit_value=3600000, columns_need_set=None):
            """Append time-windowed "similar question" features to one group of records.

            The caller applies this per ``uid`` group. Rows are walked in order while two
            counters are kept over the rows already visited that are still inside the time
            window: how often each question occurred (``finished_questions``) and how often
            it occurred with ``response == 1`` (``true_questions``). For the current row's
            question, every neighbour found in a module-level similarity map contributes its
            mapped value once per counted occurrence; the values are sorted descending and
            summarised as ``cnt`` (how many), ``acctop5`` and ``acctop1`` (mean of the top
            5 / top 1, with a 1e-6 smoothing term in the denominator).

            Four families are produced (``finishsimiliar``, ``accsimiliar`` and their
            ``strict`` counterparts), each in three variants: unfiltered, and restricted to
            neighbours contained in ``question_info_sim[question]`` (``_contentsim``) or
            ``question_info_sim_strict[question]`` (``_contentsimstrict``). Derived ratio/max
            columns are built from them. Every output column name ends in
            ``'_' + str(timelimit_value)``.

            Args:
                df: records of one group; must contain ``question``, ``response`` and
                    ``timestamp``. Assumed to be ordered by ascending ``timestamp``
                    (the window logic relies on it) -- TODO confirm against the caller.
                timelimit_value: window length in the unit of ``timestamp``. A positive
                    value looks back over earlier rows; a negative value looks forward
                    over later rows (the frame is reversed internally and restored).
                columns_need_set: feature names *without* the time-limit suffix to
                    produce, e.g. ``{'cnt_finishsimiliar', 'accrate_similiar_contentsim'}``.
                    ``None`` means "produce everything".

            Returns:
                ``df`` (original row order and index) with the requested columns added.
                Helper columns needed by a requested derived column are added as well.

            Behaviour changes versus the previous implementation -- cached feature files
            built with the old code should be regenerated:
              * ``columns_need_set=None`` no longer raises ``TypeError``.
              * negative windows really slide (previously nothing was ever evicted, so
                every negative limit produced the same "all later rows" features).
              * requested columns are really down-cast to float32 (the old membership
                keys contained a doubled/trailing underscore and never matched).
            """
            from collections import defaultdict
            # Keep the explicit global bindings: the enclosing scope is not fully visible
            # here and must not be allowed to shadow these module-level maps.
            global question_interact_ok,question_interact,question_interact_true_ok,question_interact_true
            global question_similiar_reverse,question_true_similiar_reverse,question_similiar_reverse_strict,question_true_similiar_reverse_strict
            global question_info_sim,question_info_sim_strict

            backward = timelimit_value < 0
            if backward:
                # Walk newest-first so that "already visited" means "later in time".
                idx_raw = df.index
                df = df.iloc[::-1].reset_index(drop=True)
            timelimit = str(timelimit_value)

            def _wanted(name):
                # None selects every feature.
                return columns_need_set is None or name in columns_need_set

            finished_questions = defaultdict(int)
            true_questions = defaultdict(int)

            # (family name, similarity map, occurrence counter, is_strict)
            families = (
                ('finishsimiliar', question_similiar_reverse, finished_questions, False),
                ('accsimiliar', question_true_similiar_reverse, true_questions, False),
                ('finishstrictsimiliar', question_similiar_reverse_strict, finished_questions, True),
                ('accstrictsimiliar', question_true_similiar_reverse_strict, true_questions, True),
            )
            # (column-name suffix, optional neighbour filter map)
            variants = (
                ('', None),
                ('_contentsim', question_info_sim),
                ('_contentsimstrict', question_info_sim_strict),
            )
            # Derived features that force a family to be computed.
            derived_nonstrict = ('acctop1max_similiar', 'acctop5max_similiar_strict',
                                 'accrate_similiar', 'acctop5max_similiar')
            derived_strict = ('accrate_similiar_strict', 'acctop1max_similiar_strict')
            stats = ('cnt_', 'acctop5_', 'acctop1_')

            # Decide once (not per row) which family/variant combinations are needed and
            # register their result lists.
            res = {}
            active = []
            for suffix, content_map in variants:
                for name, sim_map, counts, is_strict in families:
                    triggers = tuple(stat + name for stat in stats)
                    triggers += derived_strict if is_strict else derived_nonstrict
                    if any(_wanted(trigger + suffix) for trigger in triggers):
                        key = name + suffix
                        for stat in stats:
                            res[stat + key] = []
                        active.append((key, sim_map, counts, content_map))

            ts_values = df['timestamp'].to_numpy()
            question_values = df['question'].to_numpy()
            response_values = df['response'].to_numpy()
            start = 0  # position of the oldest visited row still inside the window

            for idx, (question, response, timestamp) in df[['question', 'response', 'timestamp']].iterrows():
                # Drop visited rows that have fallen out of the time window. The pointer
                # cannot overtake the current row: there the difference is 0.
                while (timelimit_value > 0 and timestamp - ts_values[start] > timelimit_value) or \
                        (timelimit_value < 0 and ts_values[start] - timestamp > -timelimit_value):
                    if response_values[start] == 1:
                        true_questions[question_values[start]] -= 1
                    finished_questions[question_values[start]] -= 1
                    start += 1

                for key, sim_map, counts, content_map in active:
                    neighbours = sim_map[question]
                    if content_map is None:
                        accs = [acc for other, acc in neighbours.items() for _ in range(counts[other])]
                    else:
                        # content_map[question] is looked up lazily, exactly as before,
                        # so a question without neighbours never touches content_map.
                        accs = [acc for other, acc in neighbours.items()
                                if other in content_map[question] for _ in range(counts[other])]
                    accs.sort(reverse=True)
                    top5, top1 = accs[:5], accs[:1]
                    res['cnt_' + key].append(len(accs))
                    res['acctop5_' + key].append(sum(top5) / (len(top5) + 0.000001))
                    res['acctop1_' + key].append(sum(top1) / (len(top1) + 0.000001))

                # The current row only becomes visible to the rows after it.
                if response == 1:
                    true_questions[question] += 1
                finished_questions[question] += 1

            for suffix, _ in variants:
                tail = suffix + '_' + timelimit
                want_rate = _wanted('accrate_similiar' + suffix)
                want_rate_strict = _wanted('accrate_similiar_strict' + suffix)
                want_top5max = _wanted('acctop5max_similiar' + suffix)
                want_top1max = _wanted('acctop1max_similiar' + suffix)
                want_top5max_strict = _wanted('acctop5max_similiar_strict' + suffix)
                want_top1max_strict = _wanted('acctop1max_similiar_strict' + suffix)

                # Base columns: requested directly, or needed by a derived column.
                for name, _sim_map, _counts, is_strict in families:
                    key = name + suffix
                    if _wanted('cnt_' + key) or (want_rate_strict if is_strict else want_rate):
                        df['cnt_' + name + tail] = res['cnt_' + key]
                    if _wanted('acctop5_' + key) or (want_top5max and not is_strict):
                        df['acctop5_' + name + tail] = res['acctop5_' + key]
                    if _wanted('acctop1_' + key) or \
                            (want_top1max_strict if is_strict else (want_top1max or want_top5max_strict)):
                        df['acctop1_' + name + tail] = res['acctop1_' + key]

                if want_rate:
                    df['accrate_similiar' + tail] = df['cnt_accsimiliar' + tail] / (df['cnt_finishsimiliar' + tail] + 0.000001)
                if want_rate_strict:
                    df['accrate_similiar_strict' + tail] = df['cnt_accstrictsimiliar' + tail] / (df['cnt_finishstrictsimiliar' + tail] + 0.000001)
                if want_top5max:
                    df['acctop5max_similiar' + tail] = df[['acctop5_finishsimiliar' + tail, 'acctop5_accsimiliar' + tail]].max(axis=1)
                if want_top1max:
                    df['acctop1max_similiar' + tail] = df[['acctop1_finishsimiliar' + tail, 'acctop1_accsimiliar' + tail]].max(axis=1)
                if want_top5max_strict:
                    # NOTE(review): despite its name this uses the NON-strict acctop1
                    # columns; kept unchanged because trained models depend on it --
                    # confirm whether that was intended.
                    df['acctop5max_similiar_strict' + tail] = df[['acctop1_finishsimiliar' + tail, 'acctop1_accsimiliar' + tail]].max(axis=1)
                if want_top1max_strict:
                    df['acctop1max_similiar_strict' + tail] = df[['acctop1_finishstrictsimiliar' + tail, 'acctop1_accstrictsimiliar' + tail]].max(axis=1)

            if backward:
                # Restore the caller's row order and index.
                df = df.iloc[::-1].set_index(idx_raw)

            # Down-cast the requested columns (done last so that the derived columns
            # above are still computed from the full-precision values).
            castable = [stat + name for name, _sim_map, _counts, _strict in families for stat in stats]
            castable += ['accrate_similiar', 'accrate_similiar_strict', 'acctop5max_similiar',
                         'acctop1max_similiar', 'acctop5max_similiar_strict', 'acctop1max_similiar_strict']
            for suffix, _ in variants:
                for base in castable:
                    if _wanted(base + suffix):
                        column = base + suffix + '_' + timelimit
                        df[column] = df[column].astype('float32')

            return df

        
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '31536000000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=365*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '1209600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=14*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '86400000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-' not in x and '3600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-1209600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=-14*24*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        columns_need_set = set(['_'.join(x.split('_')[:-1]) for x in columns_sim_res[1] if '-21600000' in x])
        df = df.groupby('uid').parallel_apply(partial(tmp_func, timelimit_value=-6*3600*1000,columns_need_set=columns_need_set)).reset_index(drop=True)
        
        # 训练、验证集：只需要保存需要预测的就行了
        if data_type in ['tr','val']:
        #if data_type in ['val']:
            df = df[df['response']==-1]

        df.fillna(-1, inplace=True)
        
        # 压缩数据，压缩为指定格式
        # df = df[useful_features]
        df=df[list(set(kf_or+['uid','timestamp','question','real_response','response','is_enhance','uid_record_cumsum','uid_record_sum']))]
        for col,dtype in df.dtypes.items():
            if col in useful_features_dtypes:
                df[col] = df[col].astype(useful_features_dtypes[col])
            elif dtype == 'int64':
                if col not in ['timestamp','is_enhance']:
                    if df[col].max()<=32767 and df[col].min()>=-32768:
                        df[col] = df[col].astype('int16')
                    else:
                        df[col] = df[col].astype('int32')
            elif dtype == 'float64':
                df[col] = df[col].astype('float32')
        
        return df
    
    df_tr['real_response'] = df_tr['response']
    df_tr['is_enhance'] = 0
    df_test['real_response'] = df_test['response']
    df_test['is_enhance'] = 0
    
    # 计算sim矩阵
    df_tr_tmp = df_tr.copy()
    df_tr_tmp.loc[(df_tr_tmp['uid_record_cumsum']>=df_tr_tmp['uid_record_sum']*0.5)&(df_tr_tmp['uid_record_cumsum']>=90), 'response'] = -1
    get_sim_metrix(df_tr_tmp, df_test)
    del df_tr_tmp
    gc.collect()
    
    df_test = calc_uid_features(df_test, append_columns=True, data_type='test')
    if save_files is not None:
        df_test.reset_index(drop=True).to_feather(save_files[2])
    gc.collect()

    data_type, data_enhance = ('val',val_data_enhance)
    if df_val is not None and data_enhance>0:
        df_val['real_response'] = df_val['response']
        df_val['is_enhance'] = 0
        dfs = []
        rng = np.random.default_rng(seed)
        for _ in range(0, data_enhance):
            print('val enhance', _)
            # 只选取范围内（尝试截断，比如k不能大于3/4）
            df_tmp = df_val.copy()
            k = rng.random()*0.5+0.25 if _ != 0 else 0.5
            df_tmp.loc[(df_tmp['uid_record_cumsum']>=df_tmp['uid_record_sum']*k)&(df_tmp['uid_record_cumsum']>=90), 'response'] = -1
            # 超过k*2范围的数据就不要了
            tmp = calc_uid_features(df_tmp[(df_tmp['uid_record_cumsum']<=df_tmp['uid_record_sum']*k*2)], data_type=data_type)
            tmp['is_enhance'] = _
            dfs.append(tmp)
            gc.collect()
        df_val = pd.concat(dfs, axis=0)
        if save_files is not None:
            df_val.reset_index(drop=True).to_feather(save_files[1])
        gc.collect()
    
    data_type, data_enhance = ('tr',tr_data_enhance)
    if data_enhance>0:
        dfs = []
        rng = np.random.default_rng(seed)
        for _ in range(0, data_enhance):
            print('tr enhance', _)
            # 只选取范围内（尝试截断，比如k不能大于3/4）
            df_tmp = df_tr.copy()
            k = rng.random()*0.5+0.25 if _ != 0 else 0.5
            df_tmp.loc[(df_tmp['uid_record_cumsum']>=df_tmp['uid_record_sum']*k)&(df_tmp['uid_record_cumsum']>=90), 'response'] = -1
            # 更新sim矩阵
            get_sim_metrix(df_tmp, df_test)
            # 超过k*2范围的数据就不要了
            tmp = calc_uid_features(df_tmp[(df_tmp['uid_record_cumsum']<=df_tmp['uid_record_sum']*k*2)], data_type=data_type)
            tmp = tmp.copy()
            tmp.reset_index(drop=True, inplace=True)
            tmp['is_enhance'] = _
            dfs.append(tmp)
            gc.collect()
        print('merging df_tr')
        df_tr = pd.concat(dfs, axis=0)
        del dfs
        if save_files is not None:
            print('resetindex df_tr')
            df_tr.reset_index(drop=True, inplace=True)
            print('tofeather df_tr')
            df_tr.to_feather(save_files[0])
        gc.collect()
    gc.collect()
    
    if save_files is not None:
        return 
    if df_val is not None:
        return df_tr, df_val, df_test, columns
    else:
        return df_tr, df_test, columns

"""
df_tmp,df_tmp2, df_tmp3, columns_tmp = feature_engineering2(df.iloc[:3000].copy(), df_test.iloc[:3000].copy(), df_val=df.iloc[3000:6000].copy(), 
                                         tr_data_enhance=2, val_data_enhance=1)
print(columns_tmp)
df_tmp
"""

'\ndf_tmp,df_tmp2, df_tmp3, columns_tmp = feature_engineering2(df.iloc[:3000].copy(), df_test.iloc[:3000].copy(), df_val=df.iloc[3000:6000].copy(), \n                                         tr_data_enhance=2, val_data_enhance=1)\nprint(columns_tmp)\ndf_tmp\n'

In [21]:
#df_tr2,df_test2, columns_tmp = feature_engineering2(df.copy(), df_test.copy(), df_val=None, tr_data_enhance=0, val_data_enhance=0)
#df_test2.reset_index(drop=True).to_feather(f'./input/cache/cache_df_test_vj-sim2.101_1_0_tmp.feather')

## 模型

### lgb

In [19]:
# Question embedding table loaded from CSV. Columns are 'item' and 'embedding'; 'embedding'
# is a comma-separated string such as '18.2343,13.352396,-4.3354278,27.098478,25.468166'.
emb_p = pd.read_csv('input/question_w2v_emb_all_test_window256_iter10.csv')
def split_emb_func(row):
    """Split ``row['embedding']`` into five float fields ``emb1`` .. ``emb5``.

    ``row`` is any mapping-like record (e.g. a DataFrame row) whose ``'embedding'``
    entry is a comma-separated string of exactly five numbers. The record is
    modified in place and returned. A ``ValueError`` is raised, before anything is
    written, when the string does not hold exactly five parseable numbers.
    """
    first, second, third, fourth, fifth = [float(token) for token in row['embedding'].split(',')]
    row['emb1'] = first
    row['emb2'] = second
    row['emb3'] = third
    row['emb4'] = fourth
    row['emb5'] = fifth
    return row
# Row-wise expansion of the 'embedding' string into the numeric columns emb1..emb5.
emb_p = emb_p.apply(split_emb_func,axis=1)

In [10]:
def try_predict(df, columns, y_col, lgb_params, kf, metrics, df_test=None, 
                debug=0, is_try=0, verbose=1, without_columns=None, with_columns=None, tr_data_enhance=3, val_data_enhance=1, 
                seed=1, version='v1', use_focal_loss=False, weight_func=None, only_save=False):
    """Build (or load cached) fold features, train LightGBM per fold and score it.

    For every fold the engineered train/val/test frames are read from
    ``./input/cache/cache_df_{tr,val,test}_{version}_{seed}_{fold}.feather``. When the
    train cache file is missing, ``feature_engineering2`` is run first to create the
    three files. The module-level ``emb_p`` embedding columns (emb1..emb5) are
    left-joined on ``question`` before training.

    Args:
        df: full labelled frame; folds are taken from it with ``iloc``.
        columns: not used by this function (kept for interface compatibility).
        y_col: name of the target column.
        lgb_params: parameter dict handed to ``lgb.train``.
        kf: iterable of ``(train_idx, val_idx)`` pairs; a list or a generator.
        metrics: sequence whose first element is called as
            ``metric(y_true, y_pred[, sample_weight=...])``.
        df_test: raw test frame. Needed whenever a fold cache has to be built; when
            ``None`` no averaged test prediction is returned.
        debug: maximum number of indices used per fold; 0 means no limit.
        is_try: if > 0, only the fold with this 1-based number is processed.
        verbose: print progress messages when truthy.
        without_columns: columns excluded from the features when ``with_columns`` is
            ``None``; ``None`` is treated as an empty list.
        with_columns: explicit feature list (emb1..emb5 are appended), or ``None`` to
            use every column except ``y_col`` and ``without_columns``.
        tr_data_enhance, val_data_enhance, seed: forwarded to ``feature_engineering2``.
        version: tag that is part of the cache file names.
        use_focal_loss: train with the module-level ``focal_loss`` objective.
        weight_func: optional callable ``frame -> sample weights``.
        only_save: only make sure the caches exist/load; skip training.

    Returns:
        ``res`` when ``df_test`` is ``None``, otherwise ``(res, y_test)``. ``res`` holds
        one list per trained fold: metric on all validation rows, the same with sample
        weights, metric on rows with ``response == -1`` and ``is_enhance == 0``, and
        metric on rows with ``response == -1``. ``y_test`` is the test prediction
        averaged over the folds that were actually predicted.

    Raises:
        ValueError: a fold cache must be built but ``df_test`` is ``None``.
    """
    gc.collect()
    if debug==0:
        debug = 9999999999  # large enough that index[:debug] keeps everything
    # The declared default (None) used to crash in `[y_col] + without_columns`.
    without_columns = [] if without_columns is None else list(without_columns)
    emb_cols = ['emb1','emb2','emb3','emb4','emb5']
    y_test = np.zeros(df_test.shape[0]) if df_test is not None else None
    n_test_folds = 0  # folds that contributed to y_test
    res = []
    for kf_i, (tr_idx,val_idx) in enumerate(kf):
        if is_try>0 and kf_i+1 != is_try:
            continue

        # Feature engineering (cached on disk per version/seed/fold)
        if verbose: print(datetime.datetime.now(), 'feature_engineering...')
        os.makedirs('./input/cache', exist_ok=True)
        cache_tag = f'{str(version)}_{str(seed)}_{str(kf_i)}'
        cache_files = [f'./input/cache/cache_df_tr_{cache_tag}.feather',
                       f'./input/cache/cache_df_val_{cache_tag}.feather',
                       f'./input/cache/cache_df_test_{cache_tag}.feather']
        if not os.path.exists(cache_files[0]):
            if df_test is None:
                raise ValueError('df_test is required to build the feature cache for fold %d' % kf_i)
            # With save_files set, feature_engineering2 writes the three feather files
            # and returns nothing, so the frames are read back from disk below.
            feature_engineering2(df.iloc[tr_idx[:debug]].copy(), df_test.copy(), df_val=df.iloc[val_idx[:debug]].copy(),
                                 tr_data_enhance=tr_data_enhance, val_data_enhance=val_data_enhance,
                                 seed=seed, save_files=cache_files)
            gc.collect()
        # Always load from the cache: previously a cache miss left df_tr/df_val as raw
        # slices and df_test2 undefined.
        df_tr = pd.read_feather(cache_files[0]).reset_index(drop=True)
        df_val = pd.read_feather(cache_files[1]).reset_index(drop=True)
        df_test2 = pd.read_feather(cache_files[2]).reset_index(drop=True)

        emb_frame = emb_p[['item']+emb_cols]
        df_tr = df_tr.merge(emb_frame,how='left',left_on='question',right_on='item')
        df_val = df_val.merge(emb_frame,how='left',left_on='question',right_on='item')
        df_test2 = df_test2.merge(emb_frame,how='left',left_on='question',right_on='item')

        if with_columns is None:
            excluded = [y_col]+without_columns
            x_cols = [x for x in df_tr.columns if x not in excluded]
        else:
            x_cols = with_columns+emb_cols

        if only_save:
            continue 
        if weight_func is not None:
            # NOTE(review): weights are computed on the whole frame while the Datasets
            # below keep only response == -1 rows; this lines up only if the cached
            # frames contain nothing else -- confirm.
            weight_tr, weight_val = weight_func(df_tr), weight_func(df_val)
        else:
            weight_tr, weight_val = None, None
        
        # -1 is the fill value for "unknown"; let LightGBM treat it as missing on test.
        df_test2.loc[df_test2['question_accrate']==-1, 'question_accrate'] = np.nan
        
        # lgb
        tr_mask = df_tr['response']==-1
        lgb_train = lgb.Dataset(df_tr.loc[tr_mask,x_cols].to_numpy(dtype='float32'), df_tr.loc[tr_mask,y_col], weight=weight_tr)
        df_tr = None  # release the training frame before fitting
        gc.collect()
        val_mask = df_val['response']==-1
        lgb_val = lgb.Dataset(df_val.loc[val_mask,x_cols].to_numpy(dtype='float32'), df_val.loc[val_mask,y_col], weight=weight_val)
        gc.collect()
        if verbose: print(datetime.datetime.now(), 'training...')
        # NOTE(review): early_stopping_rounds / verbose_eval / fobj are keyword arguments
        # of older LightGBM releases -- confirm against the installed version.
        if use_focal_loss:
            model = lgb.train(lgb_params,lgb_train,num_boost_round=100000,valid_sets=[lgb_val],early_stopping_rounds=30,verbose_eval=100,fobj=focal_loss) #,feval=eval_error)
        else:
            model = lgb.train(lgb_params,lgb_train,num_boost_round=100000,valid_sets=[lgb_val],early_stopping_rounds=30,verbose_eval=100)
        if verbose: print(datetime.datetime.now(), 'evaluating...')
        y_val_pred = model.predict(df_val[x_cols])
        score = metrics[0](df_val.loc[val_mask, y_col], y_val_pred[np.where(val_mask)[0]])
        plain_mask = val_mask&(df_val['is_enhance']==0)
        res.append([metrics[0](df_val[y_col], y_val_pred),
                    metrics[0](df_val[y_col], y_val_pred, sample_weight=weight_val),
                    metrics[0](df_val.loc[plain_mask, y_col], y_val_pred[np.where(plain_mask)[0]]),
                    score])
        if verbose: print(datetime.datetime.now(), 'now score:', res)

        # Per-fold test prediction: stored on disk and, when the caller supplied
        # df_test, accumulated for the fold average.
        y_pred_test = model.predict(df_test2[x_cols])
        if y_test is not None:
            y_test += y_pred_test
            n_test_folds += 1
        y_pred_test = pd.DataFrame(np.array(y_pred_test).reshape(-1,1), columns=['pred'])
        os.makedirs('./output', exist_ok=True)
        y_pred_test.to_feather(f'./output/pred_lgbms_{str(round(score,6))}_fold{kf_i}_{datetime.datetime.now().strftime("%y%m%d-%H%M%S")}_.feather')

        if is_try:
            break
    if df_test is not None:
        # Average over the folds actually predicted; unlike len(kf) this also works
        # when kf is a generator and never divides by zero.
        return res, y_test/max(n_test_folds, 1)
    else:
        return res

In [15]:
# Default LightGBM parameter set (gbdt booster, binary objective, AUC metric).
# NOTE(review): 'subsample' is documented by LightGBM as an alias of 'bagging_fraction'
# and both are given here with different values -- confirm which one is intended.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=['auc'],
    num_leaves=474,
    max_depth=12,
    learning_rate=0.1407968542304061,
    feature_fraction=0.4,
    bagging_fraction=0.9,
    bagging_freq=1,
    lambda_l2=0.03796153986418542,
    lambda_l1=3.8922663169769365,
    max_bin=152,
    min_data_in_bin=110,
    min_data_in_leaf=241,
    min_gain_to_split=0.35000000000000003,
    subsample=0.7402345943066192,
    nthread=-1,
    seed=1,
    verbosity=1,
)

# Alternative parameter set: goss booster, cross_entropy objective, binary_logloss metric.
lgb_params_jiangxing1227 = dict(
    bagging_fraction=0.5,
    boosting='goss',
    feature_fraction=0.7000000000000001,
    lambda_l1=1.2023047772488058e-05,
    lambda_l2=0.24166153734311607,
    learning_rate=0.010847402166847181,
    max_bin=248,
    max_depth=11,
    metric='binary_logloss',
    min_data_in_bin=256,
    min_data_in_leaf=165,
    min_gain_to_split=3.6,
    nthread=-1,
    num_leaves=379,
    objective='cross_entropy',
    seed=1,
    verbose=-1,
    verbosity=-1,
    top_rate=0.08755113350652709,
    other_rate=0.40917745426269403,
    subsample=1.0,
)

# Alternative parameter set: gbdt booster, cross_entropy objective, AUC metric.
lgb_params_jiangxing1230gbdt = dict(
    bagging_fraction=0.45,
    boosting='gbdt',
    feature_fraction=0.35000000000000003,
    lambda_l1=4.5816175014092746e-05,
    lambda_l2=5.8867950364954655,
    learning_rate=0.03707980348379979,
    max_bin=152,
    max_depth=12,
    metric='auc',
    min_data_in_bin=231,
    min_data_in_leaf=47,
    min_gain_to_split=2.95,
    nthread=-1,
    num_leaves=369,
    objective='cross_entropy',
    seed=1,
    verbose=-1,
    verbosity=-1,
    subsample=0.7576548599070091,
)

# No columns are excluded by default.
without_columns = []

In [14]:
# Earlier feature selection. It is rebound further down in this cell, so only
# the second `with_columns` list is visible once the cell has finished running.
with_columns = [
    'uid_concept_utilnow_94608000_desc', 'time_hour_accrate', 'concepttype_mean_showcnt',
    'time_weekday', 'uid_timestamp_showcnt', 'concept_min_accrate', 'uid_accrate',
    'question_accrate', 'kc_showcnt_4', 'question_showcnt', 'uid_concept_utilnow_3600_asc',
    'concept_max_accrate', 'kc_type_showcnt_1', 'kc_3', 'concept_type_accrate_1',
    'uid_showcnt', 'time_day', 'question', 'timestamp_question_cnt', 'concept_mean_accrate',
    'uid_concept_utilnow_21600_asc', 'uid_concept_accrate_4', 'kc_2',
    'uidconcepttype_max_accrate', 'type_accrate', 'analysis_cnt', 'kc_max_accrate',
    'concept_min_showcnt', 'uidkc_mean_accrate',
    # similarity-based features
    'cnt_finishsimiliar', 'acctop5_finishsimiliar', 'acctop1_finishsimiliar',
    'cnt_accsimiliar', 'acctop5_accsimiliar', 'acctop1_accsimiliar',
    'cnt_finishstrictsimiliar', 'acctop5_finishstrictsimiliar', 'acctop1_finishstrictsimiliar',
    'cnt_accstrictsimiliar', 'acctop5_accstrictsimiliar', 'acctop1_accstrictsimiliar',
    'accrate_similiar', 'accrate_similiar_strict',
    'acctop5max_similiar', 'acctop1max_similiar',
    'acctop5max_similiar_strict', 'acctop1max_similiar_strict',
]

# Columns that are passed to try_predict as `without_columns`.
without_columns = [
    'uid',
    'response',
    'timestamp',
    'data_type',
    'real_response',
    'is_enhance',
]

# Feature selection left in `with_columns` after this cell runs.
with_columns = [
    'acctop5_finishstrictsimiliar_contentsim_-21600000',
    'content_cnt',
    'kc_type_accrate_5',
    'accrate_similiar_31536000000',
    'cnt_finishstrictsimiliar_31536000000',
    'timestamp',
    'uid_unique_submittimes_3600_asc',
    'acctop1_accstrictsimiliar_31536000000',
    'acctop1_finishstrictsimiliar_31536000000',
    'uid_concept_type_accrate_6',
    'uid_kc_accrate_6',
    'concepttype_min_showcnt',
    'kc_6',
    'concept_min_showcnt',
    'kc_type_showcnt_5',
    'uid_question_utilnow_21600_asc',
    'concept_mean_accrate',
    'uid_question_utilnow_3600_asc',
    'acctop5max_similiar_contentsim_-21600000',
    'time_hour_accrate',
    'uid_concept_utilnow_21600_asc',
    'cnt_finishstrictsimiliar_contentsim_-21600000',
    'acctop1_finishsimiliar_31536000000',
    'acctop5_finishstrictsimiliar_31536000000',
    'kc_showcnt_4',
    'question_accrate',
    'accrate_similiar_strict_31536000000',
    'acctop5_finishstrictsimiliar_contentsimstrict_-21600000',
    'acctop5_finishstrictsimiliar_-21600000',
    'uid_question_utilnow_21600_desc',
    'timestamp_question_cnt',
    'new_question_days',
    'uid_concept_utilnow_3600_asc',
    'cnt_finishsimiliar_contentsim_-21600000',
    'type_accrate',
    'concept_type_accrate_1',
    'uid_continue_learning_mins',
    'concept_accrate_1',
    'acctop1max_similiar_strict_contentsimstrict_-21600000',
    'uid_timedistance_to_last_submit_desc',
    'question_showcnt',
    'acctop5max_similiar_strict_31536000000',
    'uid_question_utilnow_1314000_desc',
    'kc_type_showcnt_6',
    'acctop5max_similiar_strict_-21600000',
    'uid_concept_utilnow_259200_asc',
    'acctop1_accsimiliar_31536000000',
    'acctop5max_similiar_strict_contentsim_-21600000',
    'cnt_finishstrictsimiliar_-21600000',
    'acctop1max_similiar_strict_contentsim_-21600000',
    'timediff1',
    'uid_accrate',
    'uid_concept_type_accrate_2',
    'acctop5max_similiar_strict_contentsimstrict_-21600000',
    'cnt_accstrictsimiliar_31536000000',
    'uid_timestamp_question_meansubmit',
    'kc_group_cnt',
    'kc_type_accrate_7',
    'cnt_accsimiliar_31536000000',
    'uid_question_utilnow_259200_asc',
    'uid_question_utilnow_31536000_asc',
    'uid_timestamp_showcnt',
    'acctop5_finishsimiliar_31536000000',
    'cnt_finishsimiliar_contentsimstrict_-21600000',
    'concept_type_showcnt_1',
    'time_weekday_accrate',
    'acctop5_accstrictsimiliar_31536000000',
    'cnt_finishsimiliar_-21600000',
    'acctop5max_similiar_contentsimstrict_-21600000',
    'question',
    'acctop1max_similiar_strict_-21600000',
]

In [None]:
def kfold_by_uid(df, n_splits=3, weight='uid-length'):
    """Build stratified K-fold splits in which all rows of one user share a fold.

    Every distinct ``uid`` is assigned to a bucket, ``StratifiedKFold`` is run
    over the users with the bucket as the stratification label, and the
    resulting user folds are mapped back to positional row indices of ``df``.

    Args:
        df: DataFrame with a ``uid`` column (and a ``response`` column when
            ``weight='uid-accrate'``).
        n_splits: number of folds.
        weight: bucketing strategy.
            ``'uid-length'``  -- bucket users by their number of rows, using
            the edges (0, 200], (200, 350], (350, 450], (450, inf).
            ``'uid-accrate'`` -- bucket users by their mean ``response`` in
            20 steps of 0.05. Assumes ``response`` is a 0/1 label so that the
            mean lies in [0, 1] -- TODO confirm against the data schema.

    Returns:
        A list of ``n_splits`` tuples ``(train_positions, valid_positions)``;
        both are integer numpy arrays of row positions in ``df``.

    Raises:
        ValueError: if ``weight`` is not one of the supported strategies.
    """
    if weight == 'uid-length':
        per_uid = df['uid'].value_counts()
        # Open-ended last bucket: a fixed upper edge would turn very active
        # users into NaN labels, which StratifiedKFold cannot handle.
        buckets = pd.cut(per_uid, [0, 200, 350, 450, np.inf], labels=False)
    elif weight == 'uid-accrate':
        per_uid = df.groupby('uid')['response'].mean()
        # Edges cover [0, 1]; include_lowest keeps users with a mean of exactly 0.
        buckets = pd.cut(per_uid, np.linspace(0.0, 1.0, 21), labels=False, include_lowest=True)
    else:
        raise ValueError(
            "weight must be 'uid-length' or 'uid-accrate', got %r" % (weight,))

    # Build the (uid, bucket) table explicitly instead of relying on the column
    # names produced by reset_index(), which differ between pandas versions.
    strata = pd.DataFrame({
        'uid': per_uid.index.to_numpy(),
        'weight_col': buckets.to_numpy(),
    })

    # Split the users, stratified by bucket.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    uid_values = strata['uid'].to_numpy()
    row_uids = df['uid']

    res = []
    for train_uid_pos, _ in splitter.split(strata, strata['weight_col']):
        # One membership test per fold; validation rows are the complement.
        in_train = row_uids.isin(uid_values[train_uid_pos]).to_numpy()
        res.append((np.flatnonzero(in_train), np.flatnonzero(~in_train)))
    return res

# Distribution of test-set rows over uid_accrate buckets: 20 buckets of width
# 0.05. Each uid contributes its row count to the bucket of its maximum
# uid_accrate; the result is normalised to sum to 1.
test_uid_stats = df_test.groupby(['uid'])['uid_accrate'].agg(['max', 'count'])
df_test_uid_acc = [0] * 20
for uid_max_acc, uid_rows in zip(test_uid_stats['max'], test_uid_stats['count']):
    # Anything at or above the top edge is folded into the last bucket.
    bucket = min(int(uid_max_acc // 0.05), 19)
    df_test_uid_acc[bucket] += uid_rows
df_test_uid_acc = df_test_uid_acc / np.sum(df_test_uid_acc)
def weight_func(df):
    """Per-row sample weights that re-balance ``df`` towards the test set.

    The rows of ``df`` are histogrammed over 20 ``uid_accrate`` buckets of
    width 0.05 (each uid contributes its row count to the bucket of its
    maximum ``uid_accrate``). The weight of a bucket is the ratio between the
    module-level test distribution ``df_test_uid_acc`` and that histogram, and
    each row receives the weight of the bucket its own ``uid_accrate`` falls in.

    Buckets whose ratio exceeds 5, and buckets that hold no rows of ``df``
    (ratio undefined), get weight 0, so the result never contains NaN or inf.

    Args:
        df: DataFrame with ``uid`` and ``uid_accrate`` columns.

    Returns:
        1-D float numpy array with one weight per row of ``df``.
    """
    n_bins = 20
    bin_width = 0.05

    def to_bin(values):
        # Same bucketing for the per-uid histogram and the per-row lookup;
        # out-of-range values are folded into the first / last bucket.
        return np.clip((values // bin_width).astype(int), 0, n_bins - 1).to_numpy()

    per_uid = df.groupby('uid')['uid_accrate'].agg(['max', 'count'])
    train_hist = np.bincount(
        to_bin(per_uid['max']),
        weights=per_uid['count'].to_numpy(),
        minlength=n_bins,
    )
    total = train_hist.sum()
    if total > 0:
        train_hist = train_hist / total

    test_hist = np.asarray(df_test_uid_acc, dtype=float)

    # Divide only where df has mass; empty buckets keep weight 0 instead of
    # producing inf (x/0) or NaN (0/0).
    weight_bin = np.zeros(n_bins)
    populated = train_hist > 0
    weight_bin[populated] = test_hist[populated] / train_hist[populated]
    # Extreme up-weighting is disabled rather than clipped (original behaviour).
    weight_bin[weight_bin > 5] = 0

    return weight_bin[to_bin(df['uid_accrate'])]

In [None]:
kf_n_splits = 10
kf = kfold_by_uid(df, n_splits=kf_n_splits, weight='uid-length')
# AUC wrapper that ignores extra arguments (not referenced further in this cell).
metric = lambda y,y_hat,*args,**argvs: roc_auc_score(y,y_hat)

# Keyword arguments shared by the data-generation call and every training run
# below, kept in one place so the calls cannot drift apart.
predict_kwargs = dict(
    df_test=df_test, debug=0, is_try=0, verbose=1, without_columns=without_columns,
    tr_data_enhance=10, val_data_enhance=1, seed=3, version='vj-sim3_10.101',
    use_focal_loss=False, weight_func=weight_func,
)

# Generate the data (only_save=True -- presumably this only materialises the
# fold data without training; confirm in try_predict).
score, y_test1 = try_predict(df, columns, 'real_response', lgb_params, kf, [roc_auc_score],
                             with_columns=kf0, only_save=True, **predict_kwargs)

# Train every (feature set, parameter set) combination.
# The loop variables deliberately do NOT reuse the names `with_columns` /
# `lgb_params`: rebinding those notebook-level settings here would silently
# change what later cells (or a re-run of this one) pass to try_predict.
for feature_set in [kf0, kf1, kf9]:
    for params in [lgb_params_jiangxing1227, lgb_params_jiangxing1230gbdt]:
        gc.collect()
        # Model training
        score, y_test1 = try_predict(df, columns, 'real_response', params, kf, [roc_auc_score],
                                     with_columns=feature_set, only_save=False, **predict_kwargs)
        gc.collect()
        print(score)  # original note: "drop the first data chunk"
        print(np.mean(score, axis=0))  # original note: score, weighted, enhance=0, score
    #my_final_sub(y_test1, output_name=f'./submission/baseline_simv2_k5_101_kf_all_tune.zip')

In [None]:
kf_n_splits = 20
kf = kfold_by_uid(df, n_splits=kf_n_splits, weight='uid-length')
# AUC wrapper that ignores extra arguments (not referenced further in this cell).
metric = lambda y,y_hat,*args,**argvs: roc_auc_score(y,y_hat)

# Keyword arguments shared by the data-generation call and every training run
# below, kept in one place so the calls cannot drift apart.
predict_kwargs = dict(
    df_test=df_test, debug=0, is_try=0, verbose=1, without_columns=without_columns,
    tr_data_enhance=20, val_data_enhance=1, seed=3, version='vj-sim3_20.201',
    use_focal_loss=False, weight_func=weight_func,
)

# Generate the data (only_save=True -- presumably this only materialises the
# fold data without training; confirm in try_predict).
score, y_test1 = try_predict(df, columns, 'real_response', lgb_params, kf, [roc_auc_score],
                             with_columns=kf0, only_save=True, **predict_kwargs)

# Train every (feature set, parameter set) combination.
# The loop variables deliberately do NOT reuse the names `with_columns` /
# `lgb_params`: rebinding those notebook-level settings here would silently
# change what later cells (or a re-run of this one) pass to try_predict.
for feature_set in [kf0, kf1, kf9]:
    for params in [lgb_params_jiangxing1227, lgb_params_jiangxing1230gbdt]:
        gc.collect()
        # Model training
        score, y_test1 = try_predict(df, columns, 'real_response', params, kf, [roc_auc_score],
                                     with_columns=feature_set, only_save=False, **predict_kwargs)
        gc.collect()
        print(score)  # original note: "drop the first data chunk"
        print(np.mean(score, axis=0))  # original note: score, weighted, enhance=0, score
        #my_final_sub(y_test1, output_name=f'./submission/baseline_simv2_k5_101_kf_all_tune.zip')

## 融合

In [None]:
# Blend the saved test predictions: every .feather file under ./output holds
# one 'pred' column; the blend is their element-wise mean.
pred_dir = 'output'
# Only feather files, in sorted order so the floating-point sum is reproducible.
pred_files = sorted(name for name in os.listdir(pred_dir) if name.endswith('.feather'))
if not pred_files:
    raise FileNotFoundError("no .feather prediction files found in '%s'" % pred_dir)

y_test = None
for pred_name in pred_files:
    fold_pred = pd.read_feather(os.path.join(pred_dir, pred_name))['pred'].to_numpy(dtype=float)
    if y_test is None:
        y_test = np.zeros_like(fold_pred)
    y_test += fold_pred
# Average instead of summing so the blend stays on the same scale as a single
# prediction file (try_predict likewise averages its per-fold test predictions).
y_test /= len(pred_files)
my_final_sub(y_test, output_name='./submission/ensemble_final.zip')