# Better solution

### Task 1: Import the required libraries

In [1]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split



In [2]:
# Print the author name and the installed versions of pandas and lightgbm.
%load_ext watermark
%watermark -a "Nguyen Huu Duc" -p pandas,lightgbm

Author: Nguyen Huu Duc

pandas  : 1.3.5
lightgbm: 3.3.2



### Task 2: Prepare the input data for both training and testing

In [3]:
# Load the training prices (columns: date, open, high, low, close, volume, symbol).
df = pd.read_csv(filepath_or_buffer='price_train.csv')
df

Unnamed: 0,date,open,high,low,close,volume,symbol
0,2021-01-04,17.05,17.70,16.90,17.55,46809600,STB
1,2021-01-05,17.55,17.95,17.45,17.70,23399500,STB
2,2021-01-06,17.90,18.35,17.70,18.00,32553600,STB
3,2021-01-07,18.05,18.45,17.85,18.45,40903700,STB
4,2021-01-08,18.80,19.50,18.65,19.40,36089200,STB
...,...,...,...,...,...,...,...
6144,2021-10-25,75.90,76.90,75.50,76.00,315300,REE
6145,2021-10-26,75.30,76.00,73.50,75.20,401500,REE
6146,2021-10-27,74.90,76.30,74.30,76.00,547600,REE
6147,2021-10-28,75.80,76.90,75.50,76.00,424400,REE


In [4]:
# The sample submission lists every "<date>:<symbol>" Id that must be predicted.
# Note: 'price_test.csv' is deliberately not used.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')
sub

Unnamed: 0,Id,Predicted
0,2021-11-01:BID,0.0
1,2021-11-02:BID,0.0
2,2021-11-03:BID,0.0
3,2021-11-04:BID,0.0
4,2021-11-05:BID,0.0
...,...,...
625,2021-11-23:VRE,0.0
626,2021-11-24:VRE,0.0
627,2021-11-25:VRE,0.0
628,2021-11-26:VRE,0.0


In [5]:
# Build the prediction frame: every submission Id is "<date>:<symbol>", so
# splitting on ':' recovers the two feature columns.
data = sub['Id'].str.split(':').tolist()

df_test = pd.DataFrame(data, columns=['date', 'symbol'])
df_test

Unnamed: 0,date,symbol
0,2021-11-01,BID
1,2021-11-02,BID
2,2021-11-03,BID
3,2021-11-04,BID
4,2021-11-05,BID
...,...,...
625,2021-11-23,VRE
626,2021-11-24,VRE
627,2021-11-25,VRE
628,2021-11-26,VRE


### Task 3: Create a custom function transformer for datetime feature engineering

In [6]:
# define a function to create "month" and "day of year" features
def datetime_features(df):
    """Derive calendar features from a ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame (or anything ``pd.DataFrame`` accepts)
        Must expose a ``date`` column whose values ``pd.to_datetime`` can parse.
        The input is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``month`` and ``day_of_year``, indexed
        like the input.

    Raises
    ------
    KeyError
        If the input has no ``date`` column.
    """
    # Parse into a separate Series rather than assigning columns onto the
    # wrapped frame: pd.DataFrame(df) does not copy a DataFrame argument, so
    # writing columns there could leak 'month'/'day_of_year' (and a re-typed
    # 'date') back into the caller's data.
    dates = pd.to_datetime(pd.DataFrame(df)['date'])
    return pd.DataFrame({'month': dates.dt.month,
                         'day_of_year': dates.dt.dayofyear})

In [7]:
# Wrap the feature function so it can be used as a scikit-learn transformer step.
make_datetime_features = FunctionTransformer(func=datetime_features)

In [8]:
# Sanity check: run the custom transformer on the training dates via "fit_transform"
# and inspect the resulting 'month' / 'day_of_year' columns.
make_datetime_features.fit_transform(df[['date']])

Unnamed: 0,month,day_of_year
0,1,4
1,1,5
2,1,6
3,1,7
4,1,8
...,...,...
6144,10,298
6145,10,299
6146,10,300
6147,10,301


### Task 4: Create transformers and pipelines

In [9]:
# model: LightGBM regressor with a fixed seed so runs are repeatable
lgb_params = dict(n_estimators=300, learning_rate=0.3, random_state=1, n_jobs=-1)
lgb_reg = LGBMRegressor(**lgb_params)

# transformers: one-hot encode the ticker symbol (categories not seen during
# fit are ignored at transform time) and expand the date into calendar features
ohe = OneHotEncoder(handle_unknown='ignore')
symbol_step = (ohe, ['symbol'])
date_step = (make_datetime_features, ['date'])
ct = make_column_transformer(symbol_step, date_step)

# pipeline: preprocessing followed by the regressor
pipe = make_pipeline(ct, lgb_reg)

### Task 5: Fit the model and evaluate it on a hold-out validation set

In [10]:
# Single list of input columns: the column transformer one-hot encodes 'symbol'
# and expands 'date' into calendar features.
features = ['date', 'symbol']

# Select through the shared list so the column set is defined in one place.
X = df[features]
# Target: the closing price.
y = df['close']

In [11]:
# Hold out 25% of the rows (the library default, spelled out here) for validation.
# NOTE(review): rows are shuffled before splitting, so hold-out dates are
# interleaved with training dates; the validation score may therefore not
# reflect how well later dates are forecast — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=1
)

In [12]:
# Encode the full feature frame into a dense array under its own name. `X`
# itself stays a DataFrame, so this cell can be re-run safely and the named
# columns the column transformer selects on are not lost.
# .toarray() relies on the column transformer returning a sparse matrix here.
X_encoded = ct.fit_transform(X).toarray()

In [13]:
# Learn the encoding on the training fold only, then apply it unchanged to the
# hold-out fold; both are densified for the regressor.
encoded_train = ct.fit_transform(X_train)
encoded_test = ct.transform(X_test)
X_train, X_test = encoded_train.toarray(), encoded_test.toarray()

In [14]:
# Track the loss on both folds while training: in the log, valid_0 is the
# hold-out fold and valid_1 is the training fold (order of `eval_sets`).
eval_sets = [(X_test, y_test), (X_train, y_train)]
lgb_reg.fit(X_train, y_train, eval_set=eval_sets)

[1]	valid_0's l2: 660.45	valid_1's l2: 645.829
[2]	valid_0's l2: 347.669	valid_1's l2: 336.92
[3]	valid_0's l2: 188.647	valid_1's l2: 182.609
[4]	valid_0's l2: 109.273	valid_1's l2: 104.461
[5]	valid_0's l2: 67.513	valid_1's l2: 64.4485
[6]	valid_0's l2: 45.2112	valid_1's l2: 42.6165
[7]	valid_0's l2: 32.0665	valid_1's l2: 29.9868
[8]	valid_0's l2: 24.6151	valid_1's l2: 22.8361
[9]	valid_0's l2: 19.7889	valid_1's l2: 17.9154
[10]	valid_0's l2: 16.5532	valid_1's l2: 14.9826
[11]	valid_0's l2: 13.9147	valid_1's l2: 12.3215
[12]	valid_0's l2: 12.4418	valid_1's l2: 11.0323
[13]	valid_0's l2: 11.2368	valid_1's l2: 9.91608
[14]	valid_0's l2: 10.3908	valid_1's l2: 9.11974
[15]	valid_0's l2: 9.45372	valid_1's l2: 8.03913
[16]	valid_0's l2: 8.8261	valid_1's l2: 7.44306
[17]	valid_0's l2: 8.21979	valid_1's l2: 6.88611
[18]	valid_0's l2: 7.96446	valid_1's l2: 6.64543
[19]	valid_0's l2: 7.3442	valid_1's l2: 6.017
[20]	valid_0's l2: 6.91003	valid_1's l2: 5.66931
[21]	valid_0's l2: 6.56737	valid_1's

[277]	valid_0's l2: 0.928774	valid_1's l2: 0.907169
[278]	valid_0's l2: 0.927135	valid_1's l2: 0.905621
[279]	valid_0's l2: 0.925095	valid_1's l2: 0.90113
[280]	valid_0's l2: 0.922571	valid_1's l2: 0.899885
[281]	valid_0's l2: 0.918914	valid_1's l2: 0.898319
[282]	valid_0's l2: 0.915313	valid_1's l2: 0.895583
[283]	valid_0's l2: 0.913443	valid_1's l2: 0.891912
[284]	valid_0's l2: 0.910377	valid_1's l2: 0.891234
[285]	valid_0's l2: 0.906567	valid_1's l2: 0.892166
[286]	valid_0's l2: 0.904702	valid_1's l2: 0.890133
[287]	valid_0's l2: 0.902642	valid_1's l2: 0.885014
[288]	valid_0's l2: 0.900654	valid_1's l2: 0.883881
[289]	valid_0's l2: 0.896644	valid_1's l2: 0.88298
[290]	valid_0's l2: 0.894617	valid_1's l2: 0.879364
[291]	valid_0's l2: 0.892897	valid_1's l2: 0.877149
[292]	valid_0's l2: 0.89017	valid_1's l2: 0.876722
[293]	valid_0's l2: 0.887815	valid_1's l2: 0.875702
[294]	valid_0's l2: 0.88579	valid_1's l2: 0.872121
[295]	valid_0's l2: 0.88305	valid_1's l2: 0.871124
[296]	valid_0's l

LGBMRegressor(learning_rate=0.3, n_estimators=300, random_state=1)

### Task 5 (continued): Fit the pipeline on the full training data and make predictions

In [15]:
features = ['date', 'symbol']

# Build the training and prediction frames from the same column list so both
# reach the pipeline with an identical column layout.
X = df[features]
y = df['close']

X_test = df_test[features]

In [16]:
# Fit the whole pipeline (column transformer, then regressor) on all training rows.
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['symbol']),
                                                 ('functiontransformer',
                                                  FunctionTransformer(func=<function datetime_features at 0x000002B13B8E8F70>),
                                                  ['date'])])),
                ('lgbmregressor',
                 LGBMRegressor(learning_rate=0.3, n_estimators=300,
                               random_state=1))])

In [17]:
# Predict the closing price for every (date, symbol) row of the submission frame.
# NOTE(review): the displayed predictions repeat a single value per symbol —
# presumably because every test date falls after the last training date, so
# the tree splits on month/day_of_year cannot tell the test days apart; confirm.
y_pred = pipe.predict(X_test)
y_pred

array([ 42.18320089,  42.18320089,  42.18320089,  42.18320089,
        42.18320089,  42.18320089,  42.18320089,  42.18320089,
        42.18320089,  42.18320089,  42.18320089,  42.18320089,
        42.18320089,  42.18320089,  42.18320089,  42.18320089,
        42.18320089,  42.18320089,  42.18320089,  42.18320089,
        42.18320089,  64.54403548,  64.54403548,  64.54403548,
        64.54403548,  64.54403548,  64.54403548,  64.54403548,
        64.54403548,  64.54403548,  64.54403548,  64.54403548,
        64.54403548,  64.54403548,  64.54403548,  64.54403548,
        64.54403548,  64.54403548,  64.54403548,  64.54403548,
        64.54403548,  64.54403548,  33.71165621,  33.71165621,
        33.71165621,  33.71165621,  33.71165621,  33.71165621,
        33.71165621,  33.71165621,  33.71165621,  33.71165621,
        33.71165621,  33.71165621,  33.71165621,  33.71165621,
        33.71165621,  33.71165621,  33.71165621,  33.71165621,
        33.71165621,  33.71165621,  33.71165621,  97.49

### Task 6: Make a submission

In [18]:
# Replace the placeholder values in the sample submission with the predictions.
# This updates `sub` in place.
sub['Predicted'] = y_pred
sub

Unnamed: 0,Id,Predicted
0,2021-11-01:BID,42.183201
1,2021-11-02:BID,42.183201
2,2021-11-03:BID,42.183201
3,2021-11-04:BID,42.183201
4,2021-11-05:BID,42.183201
...,...,...
625,2021-11-23:VRE,31.341536
626,2021-11-24:VRE,31.341536
627,2021-11-25:VRE,31.341536
628,2021-11-26:VRE,31.341536


In [19]:
# Write the submission file without the index column.
# Recorded scores for this submission: Public 4.09518, Private 4.46004.
sub.to_csv('submission.csv', index=False)