In [1]:
%load_ext dotenv
%dotenv

## Imports

In [2]:
import pandas as pd
import lightgbm as lgbm
import os
from sklearn.model_selection import train_test_split
import gc


## Config

In [3]:
loc_train = os.getenv("LOC_PRE") + "/train.csv"
loc_test = os.getenv("LOC_PRE") + "/test.csv"


## Loading Data

In [4]:
df_train = pd.read_csv(loc_train,index_col=[0])
df_train.head()

Unnamed: 0_level_0,sales,store_nbr,family,onpromotion,year,month,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,1,AUTOMOTIVE,0,2013,1,1
1,0.0,1,BABY CARE,0,2013,1,1
2,0.0,1,BEAUTY,0,2013,1,1
3,0.0,1,BEVERAGES,0,2013,1,1
4,0.0,1,BOOKS,0,2013,1,1


In [5]:
df_test = pd.read_csv(loc_test,index_col=[0])
df_test.head()

Unnamed: 0_level_0,store_nbr,family,onpromotion,year,month,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3000888,1,AUTOMOTIVE,0,2017,8,16
3000889,1,BABY CARE,0,2017,8,16
3000890,1,BEAUTY,2,2017,8,16
3000891,1,BEVERAGES,20,2017,8,16
3000892,1,BOOKS,0,2017,8,16


In [6]:
for dset in [df_train,df_test]:
    dset["family"] = dset["family"].astype("category")

## Splitting to Train/Val set

In [7]:
y = df_train.pop("sales")
y.head(3)

id
0    0.0
1    0.0
2    0.0
Name: sales, dtype: float64

In [8]:
X = df_train
# Have to see if this is useful or not
del df_train
gc.collect()
X.head(3)

Unnamed: 0_level_0,store_nbr,family,onpromotion,year,month,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,AUTOMOTIVE,0,2013,1,1
1,1,BABY CARE,0,2013,1,1
2,1,BEAUTY,0,2013,1,1


In [9]:
X_train,X_val,y_train,y_val = train_test_split(X,y,random_state=42,train_size=0.7)
print(X_train.shape,X_val.shape)

(2100621, 6) (900267, 6)


In [10]:
lgbm_train = lgbm.Dataset(X_train,label=y_train)
del X_train,y_train
lgbm_val = lgbm.Dataset(X_val,label=y_val)
del X_val,y_val
gc.collect()

0

## Creating and Training the model

In [11]:
params = {

}

In [12]:
model = lgbm.train(params,train_set=lgbm_train,valid_sets=[lgbm_train,lgbm_val])

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 2100621, number of used features: 6
[LightGBM] [Info] Start training from score 357.594217
[1]	training's l2: 1.04016e+06	valid_1's l2: 1.05349e+06
[2]	training's l2: 901874	valid_1's l2: 913960
[3]	training's l2: 789729	valid_1's l2: 800603
[4]	training's l2: 698426	valid_1's l2: 708072




[5]	training's l2: 622404	valid_1's l2: 631477
[6]	training's l2: 560726	valid_1's l2: 568958
[7]	training's l2: 509810	valid_1's l2: 517410
[8]	training's l2: 467998	valid_1's l2: 475231
[9]	training's l2: 432840	valid_1's l2: 439527
[10]	training's l2: 404435	valid_1's l2: 410547
[11]	training's l2: 377525	valid_1's l2: 383679
[12]	training's l2: 356569	valid_1's l2: 362763
[13]	training's l2: 339179	valid_1's l2: 345386
[14]	training's l2: 323057	valid_1's l2: 329337
[15]	training's l2: 308442	valid_1's l2: 314417
[16]	training's l2: 295102	valid_1's l2: 300974
[17]	training's l2: 284125	valid_1's l2: 289862
[18]	training's l2: 274986	valid_1's l2: 280619
[19]	training's l2: 267274	valid_1's l2: 272973
[20]	training's l2: 261091	valid_1's l2: 266656
[21]	training's l2: 253683	valid_1's l2: 259214
[22]	training's l2: 247130	valid_1's l2: 252588
[23]	training's l2: 241999	valid_1's l2: 247294
[24]	training's l2: 237481	valid_1's l2: 242590
[25]	training's l2: 234353	valid_1's l2: 2394

In [13]:
model.feature_importance()

array([1371,  619,  313,  256,  273,  168], dtype=int32)

## Prediction

In [14]:
output = pd.DataFrame()

In [15]:
output.index = df_test.index

In [16]:
output["sales"] = model.predict(df_test)

In [17]:
output.head()

Unnamed: 0_level_0,sales
id,Unnamed: 1_level_1
3000888,3.092327
3000889,3.092327
3000890,40.481426
3000891,2191.735183
3000892,3.092327


In [18]:
output.to_csv("../submission.csv")