In [1]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc  
from datetime import datetime

warnings.filterwarnings('ignore')
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# Random seed reused as `random_state` by the shuffled KFold split and the LightGBM model further down.
seed = 2020

In [3]:
# Read the train, test and sample-submission files from ./data; all three are space-separated.
df_train = pd.read_csv('./data/used_car_train_20200313.csv', sep=' ')
df_test = pd.read_csv('./data/used_car_testB_20200421.csv', sep=' ')
df_sub = pd.read_csv('./data/used_car_sample_submit.csv', sep=' ')

In [4]:
# Stack train and test so every feature below is computed on both at once.
# The index is not reset here, so test rows keep their own 0-based labels.
df_feature = pd.concat([df_train, df_test], sort=False)

In [5]:
# Preview the first rows of the combined frame.
df_feature.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [6]:
# '-' is the placeholder handled here: give it its own numeric code (2),
# then cast the whole column to float.
df_feature['notRepairedDamage'] = (
    df_feature['notRepairedDamage']
    .replace('-', 2)
    .astype('float')
)

In [7]:
# Preview the last rows of the combined frame (these come from the test file).
df_feature.tail()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
49995,249995,111443,20041005,4.0,4,0.0,,1.0,150,15.0,...,0.263668,0.000292,0.141804,0.076393,0.039272,2.072901,-2.531869,1.716978,-1.063437,0.326587
49996,249996,152834,20130409,65.0,1,0.0,0.0,0.0,179,4.0,...,0.25531,0.000991,0.155868,0.108425,0.067841,1.358504,-3.290295,4.269809,0.140524,0.556221
49997,249997,132531,20041211,4.0,4,0.0,0.0,1.0,147,12.5,...,0.262933,0.000318,0.141872,0.071968,0.042966,2.165658,-2.417885,1.370612,-1.073133,0.270602
49998,249998,143405,20020702,40.0,1,4.0,0.0,1.0,176,15.0,...,0.282106,2.3e-05,0.067483,0.067526,0.009006,2.030114,-2.939244,0.569078,-1.718245,0.316379
49999,249999,78202,20090708,32.0,8,1.0,0.0,0.0,0,3.0,...,0.231449,0.103947,0.096027,0.062328,0.11018,-3.68909,2.032376,0.109157,2.202828,0.847469


# Feature engineering

In [8]:
# Remove the two columns that are not used as features.
# drop(..., errors='ignore') keeps the cell idempotent: re-running it after the
# columns are already gone is a no-op, whereas `del df[...]` raised KeyError.
df_feature = df_feature.drop(columns=['seller', 'offerType'], errors='ignore')

In [9]:
# Train on log1p(price); predictions are mapped back with expm1 at the end.
# NOTE: this overwrites `price` in place, so re-running the cell applies log1p a second time.
df_feature['price'] = np.log1p(df_feature['price'])

In [10]:
# Number of rows sharing the same `name`, counted over the combined train + test frame.
df_feature['name_count'] = df_feature.groupby(
    ['name'])['SaleID'].transform('count')

In [11]:
def date_parse(x):
    """Convert a YYYYMMDD value (int or str) into a ``datetime``.

    The first four characters of ``str(x)`` are the year, the next two the
    month and the following two the day. A month below 1 (e.g. ``00``) is
    replaced by 1; any other invalid component raises ``ValueError``.
    """
    digits = str(x)
    year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:8])

    # Clamp only the lower bound of the month.
    return datetime(year, max(month, 1), day)


# Replace the raw YYYYMMDD columns with datetimes and keep the registration year as a feature.
# The raw columns are overwritten, so this cell is meant to run once per kernel session.
df_feature['regDate'] = df_feature['regDate'].apply(date_parse)
df_feature['creatDate'] = df_feature['creatDate'].apply(date_parse)
df_feature['regDate_year'] = df_feature['regDate'].dt.year

In [12]:
# Car age = days between registration and creatDate, plus the same value in
# years (365-day years, one decimal).
df_feature['car_age_day'] = df_feature['creatDate'].sub(df_feature['regDate']).dt.days
df_feature['car_age_year'] = df_feature['car_age_day'].div(365).round(1)

In [13]:
# Simple group statistics
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` by ``group_by`` and left-join the result onto ``df_merge``.

    Parameters
    ----------
    df : DataFrame the statistics are computed from.
    df_merge : DataFrame that receives the new columns.
    group_by : list of column names used both as group keys and join keys.
    agg : dict mapping a column name to a list of aggregation method names.

    Returns
    -------
    A new DataFrame: ``df_merge`` plus one column per (column, method) pair,
    named ``<group keys joined by '_'>_<column>_<method>``. Keys absent from
    ``df`` yield NaN because the join is a left join.
    """
    group = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) result columns into single-level names.
    prefix = '_'.join(group_by)
    group.columns = [
        '{}_{}_{}'.format(prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]
    group = group.reset_index()

    merged = df_merge.merge(group, on=group_by, how='left')

    # Release the intermediate table before returning.
    del group
    gc.collect()
    return merged

In [14]:
def statis_feat(df_know, df_unknow):
    """Attach mean-price statistics computed on ``df_know`` to ``df_unknow``.

    For each of ``model``, ``regionCode`` and ``name`` (in that order) a
    ``<key>_price_mean`` column is added via ``stat``. Returns the enriched
    copy of ``df_unknow``.
    """
    for key in ('model', 'regionCode', 'name'):
        df_unknow = stat(df_know, df_unknow, [key], {'price': ['mean']})

    return df_unknow

In [15]:
# 5-fold out-of-fold price statistics
# Rows with a known price form the training part; rows whose price is NaN are the test part.
df_train = df_feature[~df_feature['price'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['price'].isnull()]

# Each validation fold receives price means computed from the other four folds only,
# so a row's own price never feeds its own feature.
df_stas_feat = None
kf = KFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, val_index in kf.split(df_train):
    print(train_index, val_index)
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    # pd.concat drops the initial None; folds are appended in split order,
    # so the rebuilt training rows end up stored fold by fold.
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

# Test rows use statistics computed from the full training part.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

# Free the intermediates; df_train / df_test are rebuilt from df_feature later.
del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()

[     0      1      2 ... 149994 149995 149998] [     3      7     12 ... 149996 149997 149999]
[     1      3      4 ... 149997 149998 149999] [     0      2      5 ... 149973 149974 149991]
[     0      1      2 ... 149996 149997 149999] [     4     14     15 ... 149978 149982 149998]
[     0      1      2 ... 149997 149998 149999] [    11     13     39 ... 149992 149994 149995]
[     0      2      3 ... 149997 149998 149999] [     1      6      9 ... 149986 149987 149988]


13

In [16]:
# Display the frame to check the newly added *_price_mean columns.
df_feature

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_12,v_13,v_14,name_count,regDate_year,car_age_day,car_age_year,model_price_mean,regionCode_price_mean,name_price_mean
0,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,...,-0.501868,-2.438353,-0.478699,2,1996,7125,19.5,9.063339,8.013673,8.682877
1,7,165346,1999-07-06,26.0,14,1.0,0.0,0.0,101,15.0,...,-2.192810,0.236728,0.195567,1,1999,6108,16.7,7.566696,8.281427,
2,12,120103,2001-03-07,48.0,14,1.0,0.0,0.0,58,6.0,...,-1.442835,0.659255,1.199350,1,2001,5493,15.0,7.092135,8.661780,
3,16,10036,2011-09-11,105.0,1,0.0,1.0,1.0,239,12.5,...,3.086576,0.165461,-2.192635,16,2011,1638,4.5,9.912501,9.273880,9.242547
4,23,8949,1994-04-01,78.0,7,5.0,0.0,0.0,105,15.0,...,-3.580304,0.157992,-1.133201,13,1994,8021,22.0,7.744373,8.087276,6.680958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,249995,111443,2004-10-05,4.0,4,0.0,,1.0,150,15.0,...,1.716978,-1.063437,0.326587,1,2004,4173,11.4,8.243772,9.055739,
49996,249996,152834,2013-04-09,65.0,1,0.0,0.0,0.0,179,4.0,...,4.269809,0.140524,0.556221,1,2013,1079,3.0,8.528968,9.321724,
49997,249997,132531,2004-12-11,4.0,4,0.0,0.0,1.0,147,12.5,...,1.370612,-1.073133,0.270602,1,2004,4113,11.3,8.243772,8.520300,
49998,249998,143405,2002-07-02,40.0,1,4.0,0.0,1.0,176,15.0,...,0.569078,-1.718245,0.316379,1,2002,5017,13.7,8.364875,7.844220,


In [17]:
# Pairwise correlations of the numeric columns only.
# Selecting them explicitly keeps the datetime columns (regDate, creatDate) out
# on every pandas version; since pandas 2.0 `corr()` no longer drops
# non-numeric columns silently.
df_feature.select_dtypes(include='number').corr()

Unnamed: 0,SaleID,name,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,v_12,v_13,v_14,name_count,regDate_year,car_age_day,car_age_year,model_price_mean,regionCode_price_mean,name_price_mean
SaleID,1.0,-0.000308,0.003596,0.000407,-0.00218,-0.000364,0.002636,0.001235,-0.000341,-0.00107,...,-0.000389,0.000566,-0.002554,0.000627,-0.002133,0.002253,0.002262,-0.000688,-0.002323,-0.002688
name,-0.000308,1.0,0.016435,0.040027,0.035365,0.015224,0.02425,0.000694,-0.0089,0.124767,...,0.09119,0.008303,-0.013102,-0.340026,-0.036279,0.03751,0.037504,-0.027106,-0.027026,0.027547
model,0.003596,0.016435,1.0,0.356408,0.206247,0.047526,0.020083,0.029609,-0.09076,-0.056718,...,0.110729,0.401318,-0.514669,-0.107819,0.149896,-0.150477,-0.150481,0.222911,0.023601,0.126036
brand,0.000407,0.040027,0.356408,1.0,0.114821,-0.082136,0.012426,-0.02174,-0.098754,-0.001863,...,-0.071192,0.320553,-0.206508,-0.091953,0.032865,-0.032337,-0.032338,-0.123694,-0.014845,-0.122389
bodyType,-0.00218,0.035365,0.206247,0.114821,1.0,0.118804,0.100395,0.076539,-0.054117,-0.027973,...,0.210932,-0.030501,-0.290143,-0.109139,0.103794,-0.103381,-0.103376,0.302237,0.026761,0.265204
fuelType,-0.000364,0.015224,0.047526,-0.082136,0.118804,1.0,0.173167,0.060912,0.100393,-0.037536,...,0.302697,-0.051863,-0.021138,-0.094738,0.276949,-0.278837,-0.278841,0.256997,0.02942,0.321771
gearbox,0.002636,0.02425,0.020083,0.012426,0.100395,0.173167,1.0,0.147592,-0.046295,-0.055775,...,0.291592,-0.202967,-0.045832,-0.082936,0.143757,-0.144997,-0.145006,0.293501,0.061665,0.322959
power,0.001235,0.000694,0.029609,-0.02174,0.076539,0.060912,0.147592,1.0,-0.018015,-0.072367,...,0.154917,-0.099573,-0.02397,-0.031047,0.093677,-0.094725,-0.094712,0.175555,0.032163,0.284246
kilometer,-0.000341,-0.0089,-0.09076,-0.098754,-0.054117,0.100393,-0.046295,-0.018015,1.0,0.106561,...,-0.368241,-0.286175,-0.116372,0.045511,-0.491682,0.489797,0.489807,-0.104095,-0.084853,-0.323568
notRepairedDamage,-0.00107,0.124767,-0.056718,-0.001863,-0.027973,-0.037536,-0.055775,-0.072367,0.106561,1.0,...,-0.141132,-0.073436,-0.039033,-0.070507,-0.223307,0.229171,0.229117,-0.148322,-0.057906,-0.301952


# model

In [18]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every remaining object (string) column.
# Iterate over `.columns` so tqdm's total is the number of columns; wrapping the
# DataFrame itself made tqdm report the row count (0/200000) as the total.
for f in tqdm(df_feature.select_dtypes('object').columns):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|                                                                                       | 0/200000 [00:00<?, ?it/s]


In [19]:
# Split the engineered frame back into test (price is NaN) and train (price known);
# .copy() gives independent frames rather than views of df_feature.
df_test = df_feature[df_feature['price'].isnull()].copy()
df_train = df_feature[df_feature['price'].notnull()].copy()

In [20]:
# Display the training frame.
df_train

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_12,v_13,v_14,name_count,regDate_year,car_age_day,car_age_year,model_price_mean,regionCode_price_mean,name_price_mean
0,3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,...,-0.501868,-2.438353,-0.478699,2,1996,7125,19.5,9.063339,8.013673,8.682877
1,7,165346,1999-07-06,26.0,14,1.0,0.0,0.0,101,15.0,...,-2.192810,0.236728,0.195567,1,1999,6108,16.7,7.566696,8.281427,
2,12,120103,2001-03-07,48.0,14,1.0,0.0,0.0,58,6.0,...,-1.442835,0.659255,1.199350,1,2001,5493,15.0,7.092135,8.661780,
3,16,10036,2011-09-11,105.0,1,0.0,1.0,1.0,239,12.5,...,3.086576,0.165461,-2.192635,16,2011,1638,4.5,9.912501,9.273880,9.242547
4,23,8949,1994-04-01,78.0,7,5.0,0.0,0.0,105,15.0,...,-3.580304,0.157992,-1.133201,13,1994,8021,22.0,7.744373,8.087276,6.680958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,149984,108738,2005-04-12,13.0,4,2.0,1.0,1.0,272,15.0,...,1.916779,-1.170598,0.452177,1,2005,4000,11.0,8.581337,8.562204,
29996,149985,60479,2002-10-12,103.0,1,1.0,0.0,0.0,110,12.5,...,-0.431625,0.923764,-1.684396,8,2002,4914,13.5,8.189292,8.619392,8.112901
29997,149986,85675,2001-10-03,4.0,4,2.0,1.0,0.0,150,15.0,...,0.049954,-1.536221,-0.169248,1,2001,5280,14.5,8.240178,7.939571,
29998,149987,191380,2008-07-03,66.0,9,0.0,1.0,0.0,125,12.5,...,0.201855,1.749289,0.249023,1,2008,2822,7.7,7.554825,8.287912,


In [21]:
# Target column, and the model inputs: every column except the target,
# the row id and the two raw date columns.
ycol = 'price'
feature_names = [
    col for col in df_train.columns
    if col not in (ycol, 'SaleID', 'regDate', 'creatDate')
]
feature_names

['name',
 'model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'power',
 'kilometer',
 'notRepairedDamage',
 'regionCode',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'name_count',
 'regDate_year',
 'car_age_day',
 'car_age_year',
 'model_price_mean',
 'regionCode_price_mean',
 'name_price_mean']

In [22]:

# One estimator configuration shared by all folds; fit() retrains it from scratch each time.
# n_estimators is deliberately huge: early stopping on the validation fold decides the real size.
model = lgb.LGBMRegressor(num_leaves=64,
                          max_depth=10,
                          learning_rate=0.1,
                          n_estimators=10000000,
                          subsample=0.8,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=seed,
                          metric=None
                          )


oof = []  # per-fold frames with SaleID, true target and out-of-fold prediction
# .copy() so that adding the 'price' column does not write into a slice of df_test
# (the original triggered SettingWithCopyWarning here).
prediction = df_test[['SaleID']].copy()
prediction['price'] = 0.0
df_importance_list = []

# Contiguous (unshuffled) folds. `random_state` must not be passed together with
# shuffle=False: it has no effect and scikit-learn >= 0.24 raises ValueError for it.
# NOTE(review): df_train appears to be stored fold by fold from the earlier
# out-of-fold statistics step, so these contiguous folds presumably line up with
# those folds - confirm before changing the split.
kfold = KFold(n_splits=5, shuffle=False)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords were
    # removed in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.early_stopping(50), lgb.log_evaluation(500)] instead.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='mae',
                          early_stopping_rounds=50)

    # Out-of-fold predictions for this fold's validation rows (still in log1p space).
    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Test prediction is the plain average over the folds; divide by the actual
    # number of splits instead of a hard-coded 5.
    pred_test = lgb_model.predict(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction['price'] += pred_test / kfold.get_n_splits()

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[500]	train's l1: 0.0907441	valid's l1: 0.121406
[1000]	train's l1: 0.0716917	valid's l1: 0.117986
[1500]	train's l1: 0.05926	valid's l1: 0.116149
[2000]	train's l1: 0.0500241	valid's l1: 0.115019
[2500]	train's l1: 0.0432229	valid's l1: 0.114271
[3000]	train's l1: 0.0378985	valid's l1: 0.11364
[3500]	train's l1: 0.0336443	valid's l1: 0.113246
[4000]	train's l1: 0.030159	valid's l1: 0.112859
[4500]	train's l1: 0.0272253	valid's l1: 0.112656
[5000]	train's l1: 0.0247917	valid's l1: 0.112488
[5500]	train's l1: 0.0227171	valid's l1: 0.112284
Early stopping, best iteration is:
[5854]	train's l1: 0.0214363	valid's l1: 0.11216


Training until validation scores don't improve for 50 rounds
[500]	train's l1: 0.0905843	valid's l1: 0.120469
[1000]	train's l1: 0.0716586	valid's l1: 0.116973
[1500]	train's l1: 0.0591194	valid's l1: 0.115365
[2000]	train's l1: 0.0500232	valid's l1: 0.114221
[2500]	train's l1: 0.0431331	valid's l1: 0.113

In [23]:
# Average each feature's importance over the folds, most important first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,regionCode_price_mean,18804.8
1,regionCode,18127.8
2,name_price_mean,15155.8
3,v_3,11217.0
4,v_1,10838.0
5,v_8,10805.2
6,v_11,10743.4
7,v_14,10619.0
8,car_age_day,10604.8
9,v_0,10476.4


In [24]:
# Combine the per-fold validation predictions and score them on the original price scale:
# target and predictions are in log1p space, so expm1 maps both back before computing MAE.
df_oof = pd.concat(oof)
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['pred'] = np.expm1(df_oof['pred'])
mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])
print('mae:', mae)

mae: 490.5281773679076


In [25]:
# Map the averaged test predictions back from log1p space and write the submission file,
# named after the out-of-fold MAE.
# NOTE: prediction['price'] is overwritten in place, so re-running this cell applies expm1 again.
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)
sub.to_csv('./{}.csv'.format(mae), index=False, encoding='utf-8')