## 工具导入

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import gc
from collections import Counter
import copy

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## 数据读取

In [3]:
# 读取数据集
test_data = pd.read_csv("../data/test_format1.csv")
train_data = pd.read_csv("../data/train_format1.csv")
user_info = pd.read_csv("../data/user_info_format1.csv")
user_log = pd.read_csv("../data/user_log_format1.csv")

## 数据压缩

### 定义内存压缩方法

In [4]:
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max > np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max > np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:1f}%'.format(100*(start_mem - end_mem)/start_mem))

    return df


### 内存压缩

In [5]:
num_rows = None
train_file = '../data/train_format1.csv'
test_file = '../data/test_format1.csv'
user_info_file = '../data/user_info_format1.csv'
user_log_file = '../data/user_log_format.csv'

train_data = reduce_mem_usage(train_data)
test_data = reduce_mem_usage(test_data)

user_info = reduce_mem_usage(user_info)
user_log = reduce_mem_usage(user_log)

Memory usage after optimization is: 2.74 MB
Decreased by 54.165525%
Memory usage after optimization is: 4.49 MB
Decreased by 24.999474%
Memory usage after optimization is: 4.85 MB
Decreased by 49.999352%
Memory usage after optimization is: 1309.52 MB
Decreased by 55.357140%


### 查看压缩后的数据信息

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int64
 1   merchant_id  260864 non-null  int16
 2   label        260864 non-null  int8 
dtypes: int16(1), int64(1), int8(1)
memory usage: 2.7 MB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      261477 non-null  int64  
 1   merchant_id  261477 non-null  int16  
 2   prob         0 non-null       float64
dtypes: float64(1), int16(1), int64(1)
memory usage: 4.5 MB


In [8]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int64  
 1   age_range  421953 non-null  float16
 2   gender     417734 non-null  float16
dtypes: float16(2), int64(1)
memory usage: 4.9 MB


In [9]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int64(2), int8(1)
memory usage: 1.3 GB


## 数据处理

### 合并用户信息

In [10]:
all_data = train_data._append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')

del train_data, test_data, user_info
gc.collect()

78

### 将用户行为日志信息按照时间排序

In [11]:
"""
按照userid和时间戳排序
"""
user_log = user_log.sort_values(['user_id', 'time_stamp'])

In [12]:
user_log.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54925330 entries, 23288890 to 13710715
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int16  
 3   seller_id    int16  
 4   brand_id     float16
 5   time_stamp   int16  
 6   action_type  int8   
dtypes: float16(1), int16(3), int64(2), int8(1)
memory usage: 1.7 GB


In [13]:
"""
对每个用户逐个合并所有字段，合并字段为item_id,cat_id,seller_id,brand_id,time_stamp,action_type
"""
# 将列表中的元素转换为字符串，并以空格分隔后连接起来。
list_join_fun = lambda x: " ".join([str(i) for i in x])

agg_dict = {
    'item_id': list_join_fun,
    'cat_id': list_join_fun,
    'seller_id': list_join_fun,
    'brand_id': list_join_fun,
    'time_stamp': list_join_fun,
    'action_type': list_join_fun
}

rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}

def merge_list(df_ID, join_columns, df_data, agg_dict, rename_dict):
    df_data = df_data.\
        groupby(join_columns).\
        agg(agg_dict).\
        reset_index().\
        rename(columns=rename_dict)
    df_ID = df_ID.merge(df_data, on=join_columns, how='left')
    return df_ID

all_data = merge_list(all_data, 'user_id', user_log, agg_dict, rename_dict)

### 删除数据回收内存

In [14]:
del user_log
gc.collect()

0

## 定义特征统计函数

### 定义统计函数

In [15]:
"""
定义统计数据总数的函数
"""
def cnt_(x):
    try:
        return len(x.split(' '))
    except:
        return -1

In [16]:
"""
定义统计数据唯一值总数的函数
"""
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1

In [17]:
"""
定义统计数据最大值的函数
"""
def max_(x):
    try:
        return np.max([float(i) for i in x.split(' ')])
    except:
        return -1

In [18]:
"""
定义统计数据最小值的函数
"""
def min_(x):
    try:
        return np.min([float(i) for i in x.split(' ')])
    except:
        return -1

In [19]:
"""
定义统计数据标准差的函数
"""
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1

In [36]:
"""
定义统计数据中topN的数据的函数
"""
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1

In [21]:
"""
定义统计数据中topN数据总数的函数
"""
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1

### 调用定义的统计函数

In [37]:
def user_cnt(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data

def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data

def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    func = lambda x : most_n(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    func = lambda x : most_n_cnt(x, n)
    df_data[name] = df_data[single_col].apply(func)
    return df_data

## 提取统计特征

### 特征统计

店铺特征统计

In [27]:
"""
提取基本统计特征
"""
all_data_test = all_data.head(2000)
# 统计用户点击、浏览、加购、购买行为
# 总次数
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')

In [28]:
# 不同店铺的次数
all_data_test = user_nunique(all_data_test, 'seller_path', 'seller_nunique')
# 不同品类的个数
all_data_test = user_nunique(all_data_test, 'cat_path', 'cat_nunique')
# 不同品牌的个数
all_data_test = user_nunique(all_data_test, 'brand_path', 'brand_nunique')
# 不同商品的个数
all_data_test = user_nunique(all_data_test, 'item_path', 'item_nunique')
# 活跃天数
all_data_test = user_nunique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')
# 不同用户行为种数
all_data_test = user_nunique(all_data_test, 'action_type_path', 'action_type_nunique')

In [31]:
# 最晚时间
all_data_test = user_max(all_data_test, 'action_type_path', 'time_stamp_max')
# 最早时间
all_data_test = user_min(all_data_test, 'action_type_path', 'time_stamp_min')
# 活跃天数方差
all_data_test = user_std(all_data_test, 'action_type_path', 'time_stamp_std')
# 最晚和最早相差天数
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']

In [38]:
# 用户最喜欢的店铺
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)
# 最喜欢的类目
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)
# 最喜欢的品牌
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
# 最常见的行为动作
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)

In [39]:
# 用户最喜欢的店铺的行为次数
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
# 用户最喜欢的类目的行为次数
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
# 用户最喜欢的品牌的行为次数
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
# 用户最常见的行为动作的行为次数
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', n=1)

用户特征统计

In [40]:
# 对点击、加购、购买、收藏分开统计
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        
        path_len = len(data_dict[col])
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in col_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        
        return len(data_out)
    except:
        return -1

In [41]:
def col_nunique_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        
        col_list = copy.deepcopy(columns_list)
        if action_type is not None:
            col_list += ['action_type_path']
            
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        
        path_len = len(data_dict[col])
        
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
            
        return len(set(data_out))
    except:
        return -1

In [42]:
def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

In [43]:
def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nunique_(x, columns_list, action_type), axis=1)
    return df_data

统计用户和店铺的关系

In [45]:
# 点击次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')
# 架构次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')
# 购买次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')
# 收藏次数
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')

### 特征组合

In [46]:
# 点击次数
all_data_test = user_col_cnt(all_data_test, ['seller_path', 'item_path'], '0', 'user_cnt_0')
# 不同店铺数量
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nunique_0')

In [48]:
# 查看提取的特征
all_data_test.columns

Index(['user_id', 'merchant_id', 'label', 'prob', 'age_range', 'gender',
       'item_path', 'cat_path', 'seller_path', 'brand_path', 'time_stamp_path',
       'action_type_path', 'user_cnt', 'seller_nunique', 'cat_nunique',
       'brand_nunique', 'item_nunique', 'time_stamp_nunique',
       'action_type_nunique', 'time_stamp_max', 'time_stamp_min',
       'time_stamp_std', 'time_stamp_range', 'seller_most_1', 'cat_most_1',
       'brand_most_1', 'action_type_1', 'seller_most_1_cnt', 'cat_most_1_cnt',
       'brand_most_1_cnt', 'action_type_1_cnt', 'user_cnt_0', 'user_cnt_1',
       'user_cnt_2', 'user_cnt_3', 'seller_nunique_0'],
      dtype='object')

## 利用CountVector和TF-IDF提取特征

In [51]:
"""
利用CountVector和TF-IDF提取特征
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 书上这个位置过时了
# from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy import sparse

In [54]:
tfidfVec = TfidfVectorizer(stop_words='english',
                           ngram_range=(1, 1),
                           max_features=100)
column_list = ['seller_path']
for i, col in enumerate(column_list):
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))

In [55]:
"""
特征重命名和特征合并
"""
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf], axis=1)

## 嵌入特征

In [56]:
import gensim

In [57]:
# 训练Word2Vec模型
model = gensim.models.Word2Vec(
    all_data_test['seller_path'].apply(lambda x: x.split(' ')),
    vector_size=100,
    window=5,
    min_count=5,
    workers=4
)

# model.save("product2vec.model")
# model = gensim .models.Word2Vec.load("product2vec.model")

In [58]:
def mean_w2v_(x, model, size=100):
    try:
        i=0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i
    except:
        return np.zeros(size)

In [61]:
def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)

In [62]:
df_embeeding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embeeding.columns = ['embeeding_' + str(i) for i in df_embeeding.columns]

In [63]:
# 嵌入特征和原始特征合并
all_data_test = pd.concat([all_data_test, df_embeeding], axis=1)

## Stacking分类特征

### Stacking特征工具包

In [67]:
from sklearn.model_selection import KFold

import xgboost
import lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB, GaussianNB

### 定义Stacking分类特征相关函数

In [1]:
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None, folds=5):
    
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        
        if clf_name in ['rf', 'ada', 'gb', 'et', 'lr', 'knn', 'gnb']:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            
            train[test_index] = pre[:, 0].reshape(-1, 1)
            test_pre[i, :] = clf.predict_proba(test_x)[:, 0].reshape(-1, 1)
            
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
            
        elif clf_name in ['xgb']:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {
                'booster': 'gbtree',
                'objective': 'multi:softprob',
                'eval_metric': 'mlogloss',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.03,
                'tree_method': 'extract',
                'seed': 2017,
                'num_class': 2
            }
            
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            if test_matrix:
                model = clf.train(params, 
                                  num_boost_round=num_round,
                                  evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(test_matrix,
                                    ntree_limit=model.best_ntree_limit)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(
                    z,
                    ntree_limit=model.best_ntree_limit)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
                
        elif clf_name in ['lgb']:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type':'gbdt',
                # 'boosting_type': 'dart',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_12':10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'num_class': 2,
                'silent': True
            }
            # 下面的模型说不能使用early_stopping_rounds，因此我把num_round改成了10000，然后删除了early_stopping_rounds
            num_round = 1000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params,
                                  train_matrix,
                                  num_boost_round=num_round,
                                  early_stopping_rounds=early_stopping_rounds,
                                  valid_sets=test_matrix)
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre[:,0].reshape(-1,1)
                test_pre[i, :] = model.predict(
                    test_x,
                    num_iteraion=model.best_iteration)[:, 0].reshape(-1, 1)
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        else:
            raise IOError("Please add new clf.")
        print('%s now score is :' % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print('%s_score_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    
    return train.reshape(-1, 1), test.reshape(-1, 1)

In [70]:
"""
随机森林分类器
"""
def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200,
                                          max_depth=20,
                                          n_jobs=-1,
                                          random_state=2017,
                                          max_features='auto',
                                          verbose=1)
    
    rf_train, rf_test = stacking_clf(randomforest,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'rf',
                                     kf,
                                     label_split=label_split)
    return rf_train, rf_test, 'rf'

In [71]:
"""
ADABoost分类器
"""
def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    
    adaboost = AdaBoostClassifier(n_estimators=50,
                                  random_state=2017,
                                  learning_rate=0.01)
    
    ada_train, ada_test = stacking_clf(adaboost,
                                       x_train,
                                       y_train,
                                       x_valid,
                                       'ada',
                                       kf,
                                       label_split=label_split)
    return ada_train, ada_test, 'ada'

In [72]:
"""
gbdt分类器
"""
def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04,
                                      n_estimators=100,
                                      subsample=0.8,
                                      random_state=2017,
                                      max_depth=5,
                                      verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt,
                                         x_train,
                                         y_train,
                                         x_valid,
                                         'gb',
                                         kf,
                                         label_split=label_split)
    return gbdt_train, gbdt_test, 'gb'

In [73]:
"""
extraTree分类器
"""
def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesClassifier(n_estimators=1200,
                                     max_depth=35,
                                     max_features='auto',
                                     n_jobs=-1,
                                     random_state=2017,
                                     verbose=1)
    et_train, et_test = stacking_clf(extratree,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'et',
                                     kf,
                                     label_split=label_split)
    return et_train, et_test, 'et'

In [74]:
"""
xgb分类器
"""
def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost,
                                       x_train,
                                       y_train,
                                       x_valid,
                                       'xgb',
                                       kf,
                                       label_split=label_split)
    return xgb_train, xgb_test, 'xgb'

In [75]:
"""
lgb分类器
"""
def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_clf(lightgbm,
                                       x_train,
                                       y_train,
                                       x_valid,
                                       'lgb',
                                       kf,
                                       label_split=label_split)
    return lgb_train, lgb_test, 'lgb'

In [76]:
"""
高斯朴素贝叶斯分类器
"""
def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb = GaussianNB()
    gnb_train, gnb_test =stacking_clf(gnb,
                                      x_train,
                                      y_train,
                                      x_valid,
                                      'gnb',
                                      kf,
                                      label_split=label_split)
    return gnb_train, gnb_test, 'gnb'

In [77]:
"""
线性回归
"""
def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    logisticRegression = LogisticRegression(n_jobs=-1,
                                            random_state=2017,
                                            C=0.01,
                                            max_iter=200)
    lr_train, lr_test = stacking_clf(logisticRegression,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     'lr',
                                     kf,
                                     label_split=label_split)
    return lr_train, lr_test, 'lr'

In [78]:
"""
KNN分类器
"""
def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    kneighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(kneighbors,
                                       x_train,
                                       y_train,
                                       x_valid,
                                       'lr',
                                       kf,
                                       label_split=label_split)
    return knn_train, knn_test, 'knn'

### 读取训练数据和验证数据

In [79]:
feature_columns = [
    c for c in all_data_test.columns if c not in [
        'label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path'
    ]
]

x_train = all_data_test[
    ~all_data_test['label'].isna()][feature_columns].values
y_train = all_data_test[
    ~all_data_test['label'].isna()]['label'].values
x_valid = all_data_test[
    all_data_test['label'].isna()][feature_columns].values

In [80]:
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_inf] = 0
    data[where_are_nan] = 0
    return data

x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = x_train

### 使用lgb和xgb分类模型构造Stacking特征

In [81]:
# 使用5折交叉验证
from sklearn.model_selection import StratifiedKFold, KFold

folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [82]:
# 选择lgb和xgb分类模型作为基本模型

clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']

In [2]:
# 获取stacking特征

column_list = []
train_data_list = []
test_data_list = []

for clf in clf_list:
    train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)

train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)

NameError: name 'clf_list' is not defined

### 原始特征和Stacking特征合并

In [98]:
# 合并特征
train = pd.DataFrame(np.concatenate([x_train, train_stacking], axis=1))
test = np.concatenate([x_valid, test_stacking], axis=1)

NameError: name 'train_stacking' is not defined

In [ ]:
# 特征重命名
df_train_all = pd.DataFrame(train)
df_train_all.columns = features_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = feature_columns + clf_list_col

In [85]:
# 获取数据ID及特征标签label
df_train_all['user_id'] = all_data_test[~all_data_test['label'].isna()]['user_id']
df_test_all['user_id'] = all_data_test[all_data_test['label'].isna()]['user_id'] 
df_train_all['label'] = all_data_test[~all_data_test['label'].isna()]['label']

NameError: name 'df_train_all' is not defined

### 保存特征

In [ ]:
df_train_all.to_csv('train_all.csv', header=True, index=False)
df_test_all.to_csv('test_all.csv', header=True, index=False)