In [1]:
import pandas as pd
import numpy as np
import datetime 

### 读取数据

In [2]:
User = pd.read_csv('./Data/JData_User.csv',encoding='GBK',na_values=-1,parse_dates=['user_reg_dt'])

In [3]:
Action_201604 = pd.read_csv('./Data/JData_Action_201604.csv',\
                            encoding='GBK',\
                            na_values=[-1,'NA'],\
                            parse_dates=['time'],dtype={'user_id':'int','sku_id':'int','type':'int','model_id':'float'})

In [4]:
Action_201604 = Action_201604.sort_values(['user_id','sku_id','time'])\
    .drop_duplicates(['user_id','sku_id','time'])

In [5]:
Action_clean = Action_201604.merge(pd.get_dummies(Action_201604.type,prefix='type'),\
                                    left_index=True,
                                    right_index=True,
                                    how='left')

In [6]:
Comments = pd.read_csv('./Data/JData_Comment.csv',encoding='GBK',na_values=[-1,'NA'],\
                       parse_dates=['dt'],dtype={'sku_id':'int'})

In [7]:
Product = pd.read_csv('./Data/JData_Product.csv',encoding='GBK',na_values=-1,\
                     dtype={'sku_id':'int'})

### 构建Train_X

In [8]:
date0 = pd.to_datetime('2016-04-01')
date1 = pd.to_datetime('2016-04-06')
date2 = pd.to_datetime('2016-04-11')
date3 = pd.to_datetime('2016-04-16')

In [9]:
Action_X = Action_clean.groupby(['user_id','sku_id'])\
.agg({'type_1':sum,'type_2':sum,'type_3':sum,\
                    'type_4':sum,'type_5':sum,'type_6':sum,\
                    'time':min,'model_id':len
                   }).reset_index()

In [10]:
Action_X['day'] = pd.to_datetime(Action_X['time'].dt.date)

In [11]:
def Train_X(date_a,date_b):
    
    Action_section = Action_X[(Action_X['time'] >=  date_a) & (Action_X['time'] < date_b)]
    User_section =  User[User['user_reg_dt'] < date_b]
    UA_section = Action_section.merge(User_section,on='user_id',how='left')\
    .sort_values(['user_id','sku_id','time'])
    
    Comments_section =  Comments[Comments['dt'] < date_b]
    Comments_section = Comments_section.sort_values(['sku_id','dt'],ascending=False)\
    .drop_duplicates(['sku_id'])
    
    PC_section = Product.merge(Comments_section,how='left',on='sku_id')\
    .sort_values('bad_comment_rate',ascending=False)

    UAPC_section = PC_section.merge(UA_section,on='sku_id',how='left')
    UAPC_section = UAPC_section.dropna(axis=0,subset=['user_id'])
    
    UAPC_section[['user_id','sku_id']] = UAPC_section[['user_id','sku_id']]\
    .apply(lambda x:x.astype(int))
    
    return UAPC_section

### 构建Label

In [12]:
def Train_y(date_a,date_b):
    Action_section = Action_clean[(Action_clean['time'] >= date_a) & (Action_clean['time'] < date_b)]
    Action_y = Action_section[Action_section['type_4']==1][['user_id','sku_id','type_4','time']]\
    .rename({'type_4':'label'})
    train_y =Action_y.merge(Product,on='sku_id',how='right')[['user_id','sku_id','type_4']]
    train_y = train_y.groupby(['user_id','sku_id']).apply(len).to_frame().reset_index()
    train_y.columns = ['user_id','sku_id','label']
    return train_y

### 构造Train数据集

In [13]:
def Train_create(date_a,date_b,date_c):
    train_X = Train_X(date_a,date_b)
    train_y = Train_y(date_b,date_c)
    train = train_X.merge(train_y,on=['user_id','sku_id'],how='left')
    train['label']= train['label'].fillna(0)
    train['duration'] = (date3 - train['day'])/np.timedelta64(1, 'D')
    return train

In [14]:
train = Train_create(date0,date1,date2)

In [15]:
train['label'].value_counts()

0.0    237672
1.0       450
Name: label, dtype: int64

### 导出数据

In [16]:
train = train.fillna(-1)

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238122 entries, 0 to 238121
Data columns (total 26 columns):
sku_id              238122 non-null int64
attr1               238122 non-null float64
attr2               238122 non-null float64
attr3               238122 non-null float64
cate                238122 non-null int64
brand               238122 non-null int64
dt                  238122 non-null datetime64[ns]
comment_num         238122 non-null float64
has_bad_comment     238122 non-null float64
bad_comment_rate    238122 non-null float64
user_id             238122 non-null int64
type_1              238122 non-null float64
type_2              238122 non-null float64
type_3              238122 non-null float64
type_4              238122 non-null float64
type_5              238122 non-null float64
type_6              238122 non-null float64
time                238122 non-null datetime64[ns]
model_id            238122 non-null float64
day                 238122 non-null datetime64[

In [18]:
train.to_csv('./Data/jdata_train.csv',na_rep=-1,index=False)