In [1]:
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
import lightgbm as lgb
import time
import pandas as pd
import numpy as np
import sys

#### 1、Import Date

In [None]:
train = pd.read_csv('round1_iflyad_train.txt',sep='\t')
test = pd.read_csv('round1_iflyad_test_feature.txt',sep='\t')

#### 2、Data Exploration

In [156]:
train.head(5)
train.info()
train.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001650 entries, 0 to 1001649
Data columns (total 35 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   instance_id            1001650 non-null  int64  
 1   time                   1001650 non-null  int64  
 2   city                   1001650 non-null  int64  
 3   province               1001650 non-null  int64  
 4   user_tags              691880 non-null   object 
 5   carrier                1001650 non-null  int64  
 6   devtype                1001650 non-null  int64  
 7   make                   902733 non-null   object 
 8   model                  994248 non-null   object 
 9   nnt                    1001650 non-null  int64  
 10  os                     1001650 non-null  int64  
 11  osv                    993878 non-null   object 
 12  os_name                1001650 non-null  object 
 13  adid                   1001650 non-null  int64  
 14  advert_id         

instance_id                   0
time                          0
city                          0
province                      0
user_tags                309770
carrier                       0
devtype                       0
make                      98917
model                      7402
nnt                           0
os                            0
osv                        7772
os_name                       0
adid                          0
advert_id                     0
orderid                       0
advert_industry_inner         0
campaign_id                   0
creative_id                   0
creative_tp_dnf               0
app_cate_id                2267
f_channel                925260
app_id                     2267
inner_slot_id                 0
creative_type                 0
creative_width                0
creative_height               0
creative_is_jump              0
creative_is_download          0
creative_is_js                0
creative_is_voicead           0
creative

* Check data imbalance or not

In [3]:
train['click'].value_counts()

0    802863
1    198787
Name: click, dtype: int64

In [4]:
x = train.drop('click',axis=1)
y = train['click']

* Combine train and test data to do some data preprocessing

In [3]:
click = train['click']
train = train.drop('click',axis=1)
df = pd.concat([train,test],axis=0,ignore_index=True)

#### 3、Data Preprocessing

* Process Null data

In [4]:
# 比赛常用的缺失值处理技巧
df = df.fillna(-1)

* get day and hour from time 

In [5]:
df['day'] = df['time'].apply(lambda x: int(time.strftime("%d", time.localtime(x))))
df['hour'] = df['time'].apply(lambda x: int(time.strftime("%H", time.localtime(x))))

* transfer Boolean data

In [6]:
list(filter(lambda s:s=='bool',[df[i].dtype for i in df.columns]))
bool_feature = list(filter(lambda s:s!=0,[i if df[i].dtype=='bool' else 0  for i in df.columns]))

# 转换bool
for i in bool_feature:
    df[i] = df[i].astype(int)   #啥都不写，只写int的话，则默认为int32.

* (advert_industry_inner) extract new feature

In [7]:
df['advert_industry_inner_1'] = df['advert_industry_inner'].apply(lambda x: x.split('_')[0])

* put all ads-related features into one list

In [8]:
ad_cate_feature = ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name',
                   'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink',
                   'creative_is_jump', 'creative_is_download']

In [10]:
# creative_height  ，creative_width are numeric features

* put media-related features into one list

In [11]:
media_cate_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id']

In [12]:
# For this feature, there is only one unique value. The variance for this feature is 0 -- Invalid feature
df['creative_is_js'].value_counts()
df['app_paid'].value_counts()

0    1041674
Name: app_paid, dtype: int64

* Content features

In [13]:
content_cate_feature = ['city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model']

In [14]:
# same feature with OS
df['os_name'].value_counts()

android    948043
ios         93612
unknown        19
Name: os_name, dtype: int64

In [15]:
df['os'].value_counts()

2    948043
1     93612
0        19
Name: os, dtype: int64

* combine lists

In [16]:
origin_cate_list = ad_cate_feature + media_cate_feature + content_cate_feature

* labelencode categorical features

In [17]:
for i in origin_cate_list:
    df[i] = df[i].map(dict(zip(df[i].unique(), range(0, df[i].nunique()))))

In [18]:
df['os_name'].map(dict(zip(df['os_name'].unique(), range(0, df['os_name'].nunique())))).value_counts()

0    948043
1     93612
2        19
Name: os_name, dtype: int64

In [19]:
pd.DataFrame(df['os_name'].value_counts()).reset_index()

Unnamed: 0,index,os_name
0,android,948043
1,ios,93612
2,unknown,19


In [20]:
# check current features
df.head()

Unnamed: 0,instance_id,time,city,province,user_tags,carrier,devtype,make,model,nnt,...,creative_is_jump,creative_is_download,creative_is_js,creative_is_voicead,creative_has_deeplink,app_paid,advert_name,day,hour,advert_industry_inner_1
0,86294719979897807,2190219034,0,0,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,29,2,0
1,2699289844928136052,2190221070,1,1,"2100191,2100078,3001825,,3001781,3001791,30017...",1,0,1,1,0,...,0,0,0,0,0,0,0,29,2,0
2,3117527168445845752,2190219793,2,2,-1,1,0,2,2,0,...,0,0,0,0,0,0,1,29,2,1
3,3398484891050993371,2190221704,3,0,"2100098,gd_2100000,3001791,3001795,3002193,300...",2,0,3,3,0,...,0,0,0,0,0,0,2,29,2,2
4,2035477570591176488,2190220024,4,0,-1,0,0,4,4,1,...,0,0,0,0,0,0,0,29,2,0


#### 4、Feature Engineering

In [21]:
count_feature_list = []
def feature_count(data, features=[], is_feature=True):


    if len(set(features)) != len(features): 
        print('equal feature !!!!')
        return data
    
    

    new_feature = 'count'

    nunique = []
    for i in features:
        nunique.append(data[i].nunique())
        new_feature += '_' + i.replace('add_', '')

    
    
    if len(features) > 1 and len(data[features].drop_duplicates()) <= np.max(nunique):
        print(new_feature, 'is unvalid cross feature:')
        return data
    

    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})

    data = data.merge(temp, 'left', on=features)
  


    if is_feature:
        count_feature_list.append(new_feature)
        
    if 'day_' in new_feature:
        print('fix:', new_feature)
        data.loc[data.day == 3, new_feature] = data[data.day == 3][new_feature] * 4

    return data

In [22]:
for i in origin_cate_list:
    n = df[i].nunique()
    if n > 5:
        df = feature_count(df, [i])
        df = feature_count(df, ['day', 'hour', i])

fix: count_day_hour_adid
fix: count_day_hour_advert_id
fix: count_day_hour_orderid
fix: count_day_hour_advert_industry_inner_1
fix: count_day_hour_advert_industry_inner
fix: count_day_hour_advert_name
fix: count_day_hour_campaign_id
fix: count_day_hour_creative_id
fix: count_day_hour_creative_tp_dnf
fix: count_day_hour_app_cate_id
fix: count_day_hour_f_channel
fix: count_day_hour_app_id
fix: count_day_hour_inner_slot_id
fix: count_day_hour_city
fix: count_day_hour_province
fix: count_day_hour_nnt
fix: count_day_hour_osv
fix: count_day_hour_make
fix: count_day_hour_model


In [95]:
origin_cate_list

['adid',
 'advert_id',
 'orderid',
 'advert_industry_inner_1',
 'advert_industry_inner',
 'advert_name',
 'campaign_id',
 'creative_id',
 'creative_type',
 'creative_tp_dnf',
 'creative_has_deeplink',
 'creative_is_jump',
 'creative_is_download',
 'app_cate_id',
 'f_channel',
 'app_id',
 'inner_slot_id',
 'city',
 'carrier',
 'province',
 'nnt',
 'devtype',
 'osv',
 'os',
 'make',
 'model']

* **4.2、Add Count Features**

In [25]:
# need 52 minutes
ratio_feature_list = []
for i in media_cate_feature:
    for j in content_cate_feature + ad_cate_feature:
        new_feature = 'inf_' + i + '_' + j
        df = feature_count(df, [i, j])
        # nunique > 5 then run
        if df[i].nunique() > 5 and df[j].nunique() > 5:
            df['ratio_' + j + '_of_' + i] = df[ 'count_' + i + '_' + j] / df['count_' + i]
            df['ratio_' + i + '_of_' + j] = df['count_' + i + '_' + j] / df['count_' + j]
            ratio_feature_list.append('ratio_' + j + '_of_' + i)
            ratio_feature_list.append('ratio_' + i + '_of_' + j)
            print(i,'&',j)

app_cate_id & city
app_cate_id & province
app_cate_id & nnt
app_cate_id & osv
app_cate_id & make
app_cate_id & model


KeyboardInterrupt: 

In [26]:
# Export Data
# df.to_pickle('df_ratioed.pkl')

In [23]:
# Read Data
df = pd.read_pickle('df_ratioed.pkl')

* **4.3 Combine features list and check number of features**

In [28]:
cate_feature = origin_cate_list
num_feature = ['creative_width', 'creative_height', 'hour'] + count_feature_list + ratio_feature_list
feature = cate_feature + num_feature
print(len(feature), feature)

292 ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name', 'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink', 'creative_is_jump', 'creative_is_download', 'app_cate_id', 'f_channel', 'app_id', 'inner_slot_id', 'city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model', 'creative_width', 'creative_height', 'hour', 'count_day_hour_creative_id', 'count_adid', 'count_day_hour_adid', 'count_advert_id', 'count_day_hour_advert_id', 'count_orderid', 'count_day_hour_orderid', 'count_advert_industry_inner_1', 'count_day_hour_advert_industry_inner_1', 'count_advert_industry_inner', 'count_day_hour_advert_industry_inner', 'count_advert_name', 'count_day_hour_advert_name', 'count_campaign_id', 'count_day_hour_campaign_id', 'count_creative_id', 'count_day_hour_creative_id', 'count_adid', 'count_day_hour_adid', 'count_advert_id', 'count_day_hour_advert_id', 'count_orderid', 'count_day_hour_orderid',

* **4.4 Filter out features with less than 2 unique values**

In [29]:
for feature in cate_feature:
    if 'count_' + feature in df.keys(): 
        print(feature)
        df.loc[df['count_' + feature] < 2, feature] = -1
        df[feature] = df[feature] + 1

adid
advert_id
orderid
advert_industry_inner_1
advert_industry_inner
advert_name
campaign_id
creative_id
creative_tp_dnf
app_cate_id
f_channel
app_id
inner_slot_id
city
province
nnt
osv
make
model


* **4.5 build train and test dataset**

In [30]:
label = list(click) + [-1]* (len(df) - len(click))

In [31]:
df['label'] = label

In [32]:
# test
predict = df[df.label == -1]

In [33]:
predict_result = predict[['instance_id']]

predict_result['predicted_score'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [34]:
predict_x = predict.drop('label', axis=1)

train_x = df[df.label != -1].reset_index(drop=True)

train_y = train_x.pop('label').values

In [35]:
train_x.shape

(1001650, 283)

In [36]:
predict_x.shape

(40024, 283)

* **4.6 Sparse Matrix**

In [38]:
# build train Sparse matrix
base_train_csr = sparse.csr_matrix((len(train_x), 0))

In [39]:
# build test Sparse matrix
base_predict_csr = sparse.csr_matrix((len(predict_x), 0))

* **4.7 one-hotencode**

In [40]:
# bool->int32->float32->float64

enc = OneHotEncoder()
for feature in cate_feature:
    enc.fit(df[feature].values.reshape(-1, 1))
    base_train_csr = sparse.hstack((base_train_csr, enc.transform(train_x[feature].values.reshape(-1, 1))), 'csr', 'bool')
    base_predict_csr = sparse.hstack((base_predict_csr, enc.transform(predict_x[feature].values.reshape(-1, 1))), 'csr', 'bool')
print('one-hot prepared !')

one-hot prepared !


In [41]:
base_train_csr.shape

(1001650, 17131)

In [42]:
base_predict_csr.shape

(40024, 17131)

* **4.8、user_tags feature**

In [44]:
# Build a text feature extractor
cv = CountVectorizer(min_df=20)

for feature in ['user_tags']:
    df[feature] = df[feature].astype(str)
    cv.fit(df[feature])
    base_train_csr = sparse.hstack((base_train_csr, cv.transform(train_x[feature].astype(str))), 'csr', 'bool')
    base_predict_csr = sparse.hstack((base_predict_csr, cv.transform(predict_x[feature].astype(str))), 'csr', 'bool')
print('cv prepared !')

cv prepared !


In [45]:
# min_df = 20 Means "ignore terms appearing in less than 20 documents".
cv = CountVectorizer(min_df=20)

In [46]:
base_train_csr.shape

(1001650, 18443)

In [47]:
base_predict_csr.shape

(40024, 18443)

In [83]:
# The memory size of a single object, the unit is Byte, converted to MB
sys.getsizeof(train_x)/1024/1024

2396.562267303467

In [79]:
# The memory size of a single object, in Bytes
sys.getsizeof(base_train_csr)

56

* **Feature Selection**

In [48]:
from sklearn.feature_selection import SelectKBest,SelectPercentile
from sklearn.feature_selection import chi2

In [49]:
from sklearn.feature_selection import VarianceThreshold

In [68]:
sel_var = VarianceThreshold(threshold=0.001)

In [69]:
sel_var.fit(base_train_csr)

VarianceThreshold(threshold=0.001)

In [72]:
base_train_csr = sel_var.transform(base_train_csr)

In [73]:
base_predict_csr = sel_var.transform(base_predict_csr)

* **4.9、Sparse Matrix: transfer from bool to float**

In [74]:
train_csr = sparse.hstack( (sparse.csr_matrix(train_x[num_feature]), base_train_csr), 'csr').astype('float32')
predict_csr = sparse.hstack((sparse.csr_matrix(predict_x[num_feature]), base_predict_csr), 'csr').astype('float32')

In [82]:
sys.getsizeof(train_csr)

56

#### 5、构建模型及交叉验证

In [75]:
# Build lightgbm model
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=61, reg_alpha=3, reg_lambda=1,
    max_depth=-1, n_estimators=5000, objective='binary',
    subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
    learning_rate=0.035, random_state=2018, n_jobs=10
)
# Kfold = 5
skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)

* **5.1、模型训练及评估**

In [76]:
best_score = []
for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
    lgb_model.fit(train_csr[train_index], train_y[train_index],
                  eval_set=[(train_csr[train_index], train_y[train_index]),
                            (train_csr[test_index], train_y[test_index])], early_stopping_rounds=200, verbose=10)
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)
 
    test_pred = lgb_model.predict_proba(predict_csr, num_iteration=lgb_model.best_iteration_)[:, 1]
    predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred
predict_result['predicted_score'] = predict_result['predicted_score'] / 5
mean = predict_result['predicted_score'].mean()
print('mean:', mean)

Training until validation scores don't improve for 200 rounds
[10]	valid_0's binary_logloss: 0.464223	valid_1's binary_logloss: 0.46433
[20]	valid_0's binary_logloss: 0.445624	valid_1's binary_logloss: 0.445814
[30]	valid_0's binary_logloss: 0.434578	valid_1's binary_logloss: 0.434864
[40]	valid_0's binary_logloss: 0.427802	valid_1's binary_logloss: 0.428155
[50]	valid_0's binary_logloss: 0.42353	valid_1's binary_logloss: 0.424008
[60]	valid_0's binary_logloss: 0.420709	valid_1's binary_logloss: 0.421289
[70]	valid_0's binary_logloss: 0.418832	valid_1's binary_logloss: 0.419525
[80]	valid_0's binary_logloss: 0.417531	valid_1's binary_logloss: 0.418362
[90]	valid_0's binary_logloss: 0.416558	valid_1's binary_logloss: 0.417536
[100]	valid_0's binary_logloss: 0.415814	valid_1's binary_logloss: 0.416963
[110]	valid_0's binary_logloss: 0.415219	valid_1's binary_logloss: 0.416549
[120]	valid_0's binary_logloss: 0.414717	valid_1's binary_logloss: 0.416234
[130]	valid_0's binary_logloss: 0.414

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Training until validation scores don't improve for 200 rounds
[10]	valid_0's binary_logloss: 0.464231	valid_1's binary_logloss: 0.464293
[20]	valid_0's binary_logloss: 0.445611	valid_1's binary_logloss: 0.445855
[30]	valid_0's binary_logloss: 0.434524	valid_1's binary_logloss: 0.434916
[40]	valid_0's binary_logloss: 0.427767	valid_1's binary_logloss: 0.428308
[50]	valid_0's binary_logloss: 0.423472	valid_1's binary_logloss: 0.424156
[60]	valid_0's binary_logloss: 0.420671	valid_1's binary_logloss: 0.421513
[70]	valid_0's binary_logloss: 0.418804	valid_1's binary_logloss: 0.419814
[80]	valid_0's binary_logloss: 0.417489	valid_1's binary_logloss: 0.418669
[90]	valid_0's binary_logloss: 0.416515	valid_1's binary_logloss: 0.417853
[100]	valid_0's binary_logloss: 0.415772	valid_1's binary_logloss: 0.417281
[110]	valid_0's binary_logloss: 0.415176	valid_1's binary_logloss: 0.416893
[120]	valid_0's binary_logloss: 0.414668	valid_1's binary_logloss: 0.416576
[130]	valid_0's binary_logloss: 0.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Training until validation scores don't improve for 200 rounds
[10]	valid_0's binary_logloss: 0.464181	valid_1's binary_logloss: 0.464434
[20]	valid_0's binary_logloss: 0.445495	valid_1's binary_logloss: 0.445949
[30]	valid_0's binary_logloss: 0.434431	valid_1's binary_logloss: 0.435068
[40]	valid_0's binary_logloss: 0.427673	valid_1's binary_logloss: 0.428498
[50]	valid_0's binary_logloss: 0.423393	valid_1's binary_logloss: 0.424385
[60]	valid_0's binary_logloss: 0.420603	valid_1's binary_logloss: 0.4218
[70]	valid_0's binary_logloss: 0.418684	valid_1's binary_logloss: 0.420078
[80]	valid_0's binary_logloss: 0.417355	valid_1's binary_logloss: 0.418951
[90]	valid_0's binary_logloss: 0.416392	valid_1's binary_logloss: 0.418159
[100]	valid_0's binary_logloss: 0.415661	valid_1's binary_logloss: 0.417626
[110]	valid_0's binary_logloss: 0.415053	valid_1's binary_logloss: 0.417205
[120]	valid_0's binary_logloss: 0.41454	valid_1's binary_logloss: 0.416869
[130]	valid_0's binary_logloss: 0.4140

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Training until validation scores don't improve for 200 rounds
[10]	valid_0's binary_logloss: 0.464276	valid_1's binary_logloss: 0.464344
[20]	valid_0's binary_logloss: 0.445646	valid_1's binary_logloss: 0.445742
[30]	valid_0's binary_logloss: 0.434606	valid_1's binary_logloss: 0.434768
[40]	valid_0's binary_logloss: 0.427809	valid_1's binary_logloss: 0.428046
[50]	valid_0's binary_logloss: 0.423504	valid_1's binary_logloss: 0.423865
[60]	valid_0's binary_logloss: 0.420692	valid_1's binary_logloss: 0.421202
[70]	valid_0's binary_logloss: 0.418811	valid_1's binary_logloss: 0.419478
[80]	valid_0's binary_logloss: 0.417485	valid_1's binary_logloss: 0.418342
[90]	valid_0's binary_logloss: 0.416539	valid_1's binary_logloss: 0.417598
[100]	valid_0's binary_logloss: 0.41581	valid_1's binary_logloss: 0.417083
[110]	valid_0's binary_logloss: 0.41521	valid_1's binary_logloss: 0.416678
[120]	valid_0's binary_logloss: 0.414699	valid_1's binary_logloss: 0.416366
[130]	valid_0's binary_logloss: 0.414

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Training until validation scores don't improve for 200 rounds
[10]	valid_0's binary_logloss: 0.464166	valid_1's binary_logloss: 0.464425
[20]	valid_0's binary_logloss: 0.445522	valid_1's binary_logloss: 0.445995
[30]	valid_0's binary_logloss: 0.434451	valid_1's binary_logloss: 0.43514
[40]	valid_0's binary_logloss: 0.427639	valid_1's binary_logloss: 0.428498
[50]	valid_0's binary_logloss: 0.423388	valid_1's binary_logloss: 0.424436
[60]	valid_0's binary_logloss: 0.420608	valid_1's binary_logloss: 0.421833
[70]	valid_0's binary_logloss: 0.418721	valid_1's binary_logloss: 0.420131
[80]	valid_0's binary_logloss: 0.417385	valid_1's binary_logloss: 0.418985
[90]	valid_0's binary_logloss: 0.416422	valid_1's binary_logloss: 0.418202
[100]	valid_0's binary_logloss: 0.415671	valid_1's binary_logloss: 0.417618
[110]	valid_0's binary_logloss: 0.415065	valid_1's binary_logloss: 0.417192
[120]	valid_0's binary_logloss: 0.414559	valid_1's binary_logloss: 0.416888
[130]	valid_0's binary_logloss: 0.41

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


mean: 0.21046079139251384


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


* **5.2、Save Model**

In [85]:
import pickle

In [86]:
lgb_model

LGBMClassifier(colsample_bytree=0.8, learning_rate=0.035, n_estimators=5000,
               n_jobs=10, num_leaves=61, objective='binary', random_state=2018,
               reg_alpha=3, reg_lambda=1, subsample=0.8, subsample_freq=1)

In [88]:
# Save Model
with open('lgb.pickle', 'wb') as f:
    pickle.dump(lgb_model, f)

In [89]:
# Read Model
with open('lgb.pickle', 'rb') as f:
    lgb_model2 = pickle.load(f)

In [92]:
# Predict
y_p = lgb_model2.predict(train_csr)

In [97]:
from sklearn.metrics import log_loss

In [96]:
# Predict probability
y_pro = lgb_model2.predict_proba(train_csr)

In [None]:
# Log loss
log_loss(train_y,y_pro)