In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold


import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna
from scipy.stats import ks_2samp
from tqdm import tqdm

from lightgbm import LGBMRegressor
from boruta import BorutaPy

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    

Using TensorFlow backend.


In [7]:
#train_df = pd.read_csv('train_clean.csv')

train_df = pd.read_csv('train_other.csv',encoding='gbk')
test_df = pd.read_csv('test_other.csv',encoding='gbk')
columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',  'region', 'plate']
for i in columns:
    train_df[i] = train_df[i].astype('category')
    test_df[i] = test_df[i].astype('category')



feats =  [col for col in train_df.columns if col not in ['tradeMoney']]
categorical_feats = ['rentType', 'houseFloor', 'houseDecoration', 'region', 'plate','communityName']
diff_features =list(set(feats)^set(categorical_feats))
diff_features.remove('roomsarea')


In [5]:
random_state = 2333


In [8]:
#train_df[feats] = train_df[feats].fillna(0)

lgbmclf = LGBMRegressor(
    device='gpu',
    gpu_platform_id= 1,
    gpu_device_id= 0,
    objective= "regression_l2",
    metric= "rmse",
    boosting= 'gbdt',
    max_depth= 9,
    num_leaves= 13,
    learning_rate= 0.01,
    bagging_freq= 5,
    bagging_fraction= 0.7,
    feature_fraction= 0.7,
    min_data_in_leaf= 80,
    min_sum_heassian_in_leaf= 10,
    tree_learner= "serial",
    boost_from_average= "false",
    #"lambda_l1" : 5,
    #"lambda_l2" : 5,
    bagging_seed= random_state,
    verbosity= 1,
    seed= random_state
)

borutaselector = BorutaPy(lgbmclf, n_estimators=2000, verbose=2)

start_time = timer(None)
borutaselector.fit(train_df[diff_features].values, train_df['tradeMoney'].values) 
timer(start_time)





Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	55
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	0
Rejected: 	55


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	0
Rejected: 	55


<contextlib._GeneratorContextManager at 0x54564e0>

In [9]:
print(borutaselector.support_)
print(train_df[feats].columns[borutaselector.support_])
print ('\n Initial features: ', train_df[feats].columns.tolist() )

# number of selected features
print ('\n Number of selected features:')
print (borutaselector.n_features_)

feature_df = pd.DataFrame(train_df[feats].columns.tolist(), columns=['features'])
feature_df['rank']=borutaselector.ranking_
feature_df = feature_df.sort_values('rank', ascending=True).reset_index(drop=True)
print ('\n Top %d features:' % borutaselector.n_features_)
print (feature_df.head(borutaselector.n_features_))
feature_df.to_csv('boruta-train_clean1-ranking.csv', index=False)

# check ranking of features
print ('\n Feature ranking:')
print (borutaselector.ranking_)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False]


IndexError: boolean index did not match indexed array along dimension 0; dimension is 62 but corresponding boolean dimension is 55

In [None]:
features = np.array(feature_df[['features']][feature_df['rank'] == 1]).tolist()
print(features)
features2 = []
for i in features:
    print(i)
    features2.append(i[0])
print(train_df[features2])
train_df[features2].to_csv('train_clean11.csv', index=False)

test_df = pd.read_csv('test_clean1.csv')
test_df[features2].to_csv('test_clean11.csv', index=False)