In [None]:
"""
LGBM Regression on TfIDF of text features and One-Hot-Encoded Categoricals
Featues based on Alexandu Papiu's (https://www.kaggle.com/apapiu) script: https://www.kaggle.com/apapiu/ridge-script
LGBM based on InfiniteWing's (https://www.kaggle.com/infinitewing) script: https://www.kaggle.com/infinitewing/lightgbm-example
"""
#TODO don't use dummies, but categorical features of LightGBM

import pandas as pd
import numpy as np
import scipy

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import lightgbm as lgb
import time
from nltk.corpus import stopwords
import gc

start = fixstart = time.time()

NUM_BRANDS = 2500
NAME_MIN_DF = 10
MAX_FEAT_DESCP = 50000

print("Reading in Data")

df_train = pd.read_csv('../input/train.tsv', sep='\t')
df_test = pd.read_csv('../input/test.tsv', sep='\t')

df = pd.concat([df_train, df_test], 0)
nrow_train = df_train.shape[0]
y_train = np.log1p(df_train["price"])

del df_train
gc.collect()

print(df.memory_usage(deep = True))

df["category_name"] = df["category_name"].fillna("Other").astype("category")
df["brand_name"] = df["brand_name"].fillna("unknown")

pop_brands = df["brand_name"].value_counts().index[:NUM_BRANDS]
df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"

df["item_description"] = df["item_description"].fillna("None")
df["item_condition_id"] = df["item_condition_id"].astype("category")
df["brand_name"] = df["brand_name"].astype("category")

# description contains interesting words
interesting_words = ['new', 'perfect', 'fit', 'used', 'super', 'cute', 'excellent',
                     'great', 'retail', '[rm]', 'never' ]
for word in interesting_words:
    df[word] = df['item_description'].apply(lambda x : word in x.lower())


print(df.memory_usage(deep = True))

print("Encodings")
count = CountVectorizer(min_df=NAME_MIN_DF)
X_name = count.fit_transform(df["name"])

print("Category Encoders")
unique_categories = pd.Series("/".join(df["category_name"].unique().astype("str")).split("/")).unique()
count_category = CountVectorizer()
X_category = count_category.fit_transform(df["category_name"])


stopw = stopwords.words('english') + interesting_words + ['cant', 'ask', 'size']
print("Descp encoders")
count_descp = TfidfVectorizer(max_features = MAX_FEAT_DESCP, 
                              ngram_range = (1,3),
                              stop_words = stopw)
X_descp = count_descp.fit_transform(df["item_description"])

print("Brand encoders")
vect_brand = LabelBinarizer(sparse_output=True)
X_brand = vect_brand.fit_transform(df["brand_name"])

In [6]:
df["new"] = df["new"].astype("category")
print("Dummy Encoders")
X_dummies = scipy.sparse.csr_matrix(pd.get_dummies(df[[
    'item_condition_id', 'shipping','new']], sparse = True).values)

Dummy Encoders


In [7]:
X = scipy.sparse.hstack((X_dummies, 
                         X_descp,
                         X_brand,
                         X_category,
                         X_name)).tocsr()

print([X_dummies.shape, X_category.shape, 
       X_name.shape, X_descp.shape, X_brand.shape])

X_train = X[:nrow_train]
X_test = X[nrow_train:]

end = time.time()
print("Time taken reading and encoding  {}.".format((end-start)))

[(2175894, 8), (2175894, 1021), (2175894, 21257), (2175894, 50000), (2175894, 2501)]
Time taken reading and encoding  2091.373414993286.


In [8]:
X_train.shape

(1482535, 74787)

In [9]:
#LGBM Training
start = time.time()

train_X, valid_X, train_y, valid_y = train_test_split(X_train, y_train, 
                                                      test_size = 0.1, 
                                                      random_state = 144) 
evalset = [(train_X, train_y),(valid_X, valid_y)]
#evalset = [(valid_X, valid_y)]
lgbmodel = lgb.LGBMRegressor(learning_rate= 0.8,
    objective='regression',
    max_depth=4,
    num_leaves=100,      #number of leaves in one tree
    min_data_in_leaf=20, #minimal number of data in one leaf.
    feature_fraction=1.0, #LightGBM will randomly select part of features on each iteration if feature_fraction smaller than 1.0.
    min_split_gain=0.0, #the minimal gain to perform split
    cat_l2=10.,         #L2 regularization in categorical split
    min_data_in_bin=3,  #min number of data inside one bin, use this to avoid one-data-one-bin 
    silent=True,
    metric='rmse',
#    train_metric=False,
#    metric_freq=10,
    n_estimators=300,
    cat_smooth=10, #this can reduce the effect of noises in categorical features, especially for categories with few data
    max_bin=8192,
    num_threads=2,
    two_round_loading=True #set this to true if data file is too big to fit in memory
        )
lgbmodel.fit(X=train_X, y=train_y, eval_set=evalset, eval_names=['train', 'valid'],
            eval_metric='rmse',
            early_stopping_rounds=50 )


preds = lgbmodel.predict(X_test)
end = time.time()
print("Time taken training LGBM  {}.".format((end-start)))

[1]	train's rmse: 0.691604	valid's rmse: 0.693228
Training until validation scores don't improve for 50 rounds.
[2]	train's rmse: 0.668192	valid's rmse: 0.670016
[3]	train's rmse: 0.65493	valid's rmse: 0.657355
[4]	train's rmse: 0.644936	valid's rmse: 0.64692
[5]	train's rmse: 0.636154	valid's rmse: 0.637992
[6]	train's rmse: 0.630309	valid's rmse: 0.632032
[7]	train's rmse: 0.62539	valid's rmse: 0.627467
[8]	train's rmse: 0.621148	valid's rmse: 0.623278
[9]	train's rmse: 0.617552	valid's rmse: 0.619539
[10]	train's rmse: 0.613801	valid's rmse: 0.616048
[11]	train's rmse: 0.610565	valid's rmse: 0.612875
[12]	train's rmse: 0.607762	valid's rmse: 0.610223
[13]	train's rmse: 0.604363	valid's rmse: 0.60697
[14]	train's rmse: 0.601225	valid's rmse: 0.603833
[15]	train's rmse: 0.59817	valid's rmse: 0.600838
[16]	train's rmse: 0.595949	valid's rmse: 0.598874
[17]	train's rmse: 0.593455	valid's rmse: 0.596495
[18]	train's rmse: 0.591314	valid's rmse: 0.594267
[19]	train's rmse: 0.589229	valid'

[160]	train's rmse: 0.508583	valid's rmse: 0.515298
[161]	train's rmse: 0.508382	valid's rmse: 0.515124
[162]	train's rmse: 0.508196	valid's rmse: 0.514991
[163]	train's rmse: 0.507988	valid's rmse: 0.514853
[164]	train's rmse: 0.507771	valid's rmse: 0.514671
[165]	train's rmse: 0.507586	valid's rmse: 0.514456
[166]	train's rmse: 0.507451	valid's rmse: 0.514265
[167]	train's rmse: 0.507274	valid's rmse: 0.514134
[168]	train's rmse: 0.507128	valid's rmse: 0.514032
[169]	train's rmse: 0.506971	valid's rmse: 0.513906
[170]	train's rmse: 0.506801	valid's rmse: 0.513797
[171]	train's rmse: 0.506401	valid's rmse: 0.51342
[172]	train's rmse: 0.506238	valid's rmse: 0.513331
[173]	train's rmse: 0.506077	valid's rmse: 0.513138
[174]	train's rmse: 0.505859	valid's rmse: 0.512949
[175]	train's rmse: 0.50568	valid's rmse: 0.512783
[176]	train's rmse: 0.505535	valid's rmse: 0.512646
[177]	train's rmse: 0.505397	valid's rmse: 0.512581
[178]	train's rmse: 0.505147	valid's rmse: 0.512315
[179]	train's 

In [None]:
#Ridge Training
start = time.time()

modelr = Ridge(solver = "lsqr", fit_intercept=False)
modelr.fit(X_train, y_train)

preds += modelr.predict(X_test)
preds /= 2
end = time.time()
print("Time taken training Ridge  {}.".format((end-start)))


df_test["price"] = np.expm1(preds)
df_test[["test_id", "price"]].to_csv("ff_LGBM_Ridge_3.csv", index = False)
end = time.time()
print("Time taken by the Kernel  {}.".format((end-fixstart)))

In [11]:
lgbmodel.dump_model()

AttributeError: 'LGBMRegressor' object has no attribute 'dump_model'

In [15]:
booster = lgbmodel.booster_
booster.save_model('sk-lgbm-Ridge-03.txt')

In [17]:
lgbmodel.feature_importances_

array([144,  85,  30, ...,   0,   0,   0])

In [22]:
ax=lgb.plot_importance(lgbmodel, max_num_features=10)