In [7]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True,encoding = "gbk")
    df = reduce_mem_usage(df)
    return df

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
    

In [11]:
PATH="../input/"
os.listdir(PATH)
train = import_data(PATH+'train_clean1.csv')
test = import_data(PATH+'test_clean1.csv')

Memory usage after optimization is: 9.68 MB
Decreased by 77.5%
Memory usage after optimization is: 0.59 MB
Decreased by 77.1%




In [16]:
train['tradeMoney']

0         2000.0
1         2000.0
2        16000.0
3         1600.0
4         2900.0
5        14000.0
6         1000.0
7         1800.0
8         1450.0
9         1700.0
10        1500.0
11        2000.0
12        2200.0
13        1600.0
14        2500.0
15        6000.0
16        1600.0
17        2800.0
18        1800.0
19        1600.0
20        3400.0
21        1500.0
22        3600.0
23        8000.0
24        6000.0
25        3000.0
26        1300.0
27        2100.0
28        2200.0
29        1380.0
          ...   
41410     2970.0
41411     1890.0
41412     1890.0
41413     1860.0
41414    14000.0
41415     3630.0
41416     1990.0
41417     1990.0
41418     1890.0
41419     5800.0
41420     1660.0
41421     1560.0
41422     1460.0
41423     2690.0
41424     3990.0
41425     2290.0
41426     2760.0
41427    15000.0
41428    13500.0
41429     2990.0
41430     1430.0
41431     1690.0
41432     1690.0
41433     1460.0
41434     2290.0
41435     2190.0
41436     2090.0
41437     3190

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop(['tradeMoney'], axis=1), train['tradeMoney'], test_size=0.20, random_state=314)

In [18]:
def learning_rate_010_decay_power_099(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

In [29]:
import lightgbm as lgb
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'rmse', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [30]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [31]:
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='r2',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)


In [32]:
gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Fitting 3 folds for each of 100 candidates, totalling 300 fits




ValueError: y contains new labels: [  150.   250.   530.   710.  1120.  1295.  1370.  1520.  1755.  2210.
  2345.  2510.  2720.  2825.  3020.  3120.  3334.  3417.  3440.  3605.
  3680.  3720.  3780.  3788.  3910.  3916.  3948.  4166.  4220.  4295.
  4520.  4580.  4780.  5080.  5320.  5530.  5560.  5650.  5670.  5688.
  5693.  5697.  5714.  5794.  5850.  6066.  6210.  6280.  6430.  6520.
  6550.  6640.  6667.  6690.  6736.  6825.  6830.  6990.  7130.  7470.
  7590.  7650.  7666.  7860.  7890.  8166.  9360.  9380.  9550. 10400.
 10600. 11107. 11700. 13400. 13650. 13880. 14136. 14700. 14750. 15200.
 16700. 16900. 17200. 17600. 18250. 18800. 18900. 20200. 20700. 22200.
 23300. 28800. 29166. 34063. 37000. 38500. 41000. 46000. 56000. 58000.]

In [None]:
clf_final = lgb.LGBMClassifier(**clf.get_params())
#set optimal parameters
clf_final.set_params(gs.best_params_)

In [None]:
clf_final.fit(X_train, y_train, **fit_params, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])