In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np

import pandas as pd
import xgboost as xg
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed training table (path is relative to the notebook's working directory).
df_train = pd.read_csv('data/train_preprocessed.csv')

In [5]:
# Features: every column except the target and the four columns dropped alongside it.
X = df_train.drop(columns=['predictions', 'reg_s_d', 'y', 'bad', 'avg'])

# Regression target.
y = df_train['y']

In [6]:
# Show which columns remain as model features.
X.columns

Index(['st_code_snd', 'st_code_rsv', 'date_depart_year', 'date_depart_month',
       'date_depart_week', 'date_depart_day', 'date_depart_hour', 'fr_id',
       'route_type', 'is_load', 'rod', 'common_ch', 'vidsobst', 'distance',
       'snd_org_id', 'rsv_org_id', 'snd_roadid', 'rsv_roadid', 'snd_dp_id',
       'rsv_dp_id', 'avg_speed'],
      dtype='object')

In [7]:
# (rows, feature columns) of the feature matrix.
X.shape

(3523326, 21)

In [8]:
# Hold out 20% of the rows as a test split; random_state pins the shuffle.
# (A further train/validation split -- 0.25 of the training part, i.e. 0.2 of
# all rows -- was tried at some point and is currently disabled.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Parameters for the native `lgb.train` API.
params = {
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 30,
    # Was misspelled 'learnnig_rage', so the intended 0.05 rate was never applied.
    'learning_rate': 0.05,
    # A list (not a set) keeps the metric order deterministic between runs.
    'metric': ['l2', 'l1'],
    'verbose': 10,
    # Stop when the validation metric has not improved for 100 rounds.
    'early_stopping_round': 100,
}

In [None]:
# Wrap both splits as LightGBM datasets; the evaluation set is tied to the
# training set through `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
# Train the booster. The validation set is built from the held-out test split,
# so anything tuned against it (e.g. early stopping configured in `params`)
# sees the same rows that are later used for the RMSE report.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_eval],
)

In [None]:
# Score the booster on the held-out split.
y_pred = model.predict(X_test)

# Mean squared error and its square root.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse)

In [None]:
import matplotlib.pyplot as plt

# Overlay actual vs. predicted target values for the test split, in row order.
# The y-axis used to read 'Price', but the plotted series is the target column
# `y`; the labels now describe what is actually drawn.
x_ax = range(len(y_test))
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(x_ax, y_test, label="original")
ax.plot(x_ax, y_pred, label="predicted")
ax.set_xlabel('Test sample index')
ax.set_ylabel('y (target)')
ax.legend(loc='best', fancybox=True, shadow=True)
ax.grid(True)
plt.show()

In [None]:
# Feature-importance bar chart for the booster trained with lgb.train.
lgb.plot_importance(model, height=.5)

In [15]:
# Load the preprocessed test table used for the submission predictions.
df_test = pd.read_csv('data/test_preprocessed.csv')

# LightGBM (scikit-learn API)

In [None]:
# LightGBM regressor (scikit-learn API).
# Removed relative to the earlier version of this cell:
#   * task='predict'           -- a CLI-only setting, meaningless for the Python API;
#   * application='regression' -- an alias of `objective` that conflicted with the
#                                 explicit `objective` below (both mean L2 regression).
# reg_sqrt is now a real boolean instead of the string 'True'.
lgb_model = lgb.LGBMRegressor(
    objective='root_mean_squared_error',
    boosting_type="gbdt",
    num_iterations=2500,
    learning_rate=0.05,
    num_leaves=15,
    tree_learner='feature',
    max_depth=10,
    min_data_in_leaf=7,
    # NOTE(review): with bagging_fraction=1 no rows are sub-sampled, so
    # bagging_freq presumably has no effect -- confirm whether bagging was intended.
    bagging_fraction=1,
    bagging_freq=100,
    reg_sqrt=True,
    metric='rmse',
    feature_fraction=0.6,
    random_state=42,
)

In [None]:
# Fit on the training split. The former `verbose=10` argument was dropped:
# it only controlled logging for an eval_set (none is passed here) and is
# no longer accepted by `fit` in LightGBM 4.x.
lgb_model.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the sklearn-API LightGBM model.
preds_lgb_model = lgb_model.predict(X_test)
mse_lgb = mean_squared_error(y_test, preds_lgb_model)
rmse_lgb = np.sqrt(mse_lgb)
print(" RMSE: %f" % rmse_lgb)

In [None]:
# Re-read the preprocessed test table (same file as loaded earlier in the notebook).
df_test = pd.read_csv('data/test_preprocessed.csv')

In [None]:
# (rows, columns) of the test table.
df_test.shape

In [None]:
# Predict the test table and collect the result in a single-column frame.
pred = lgb_model.predict(df_test)
sol = pd.DataFrame({'time': pred})

In [None]:
# Summary statistics of the predictions.
sol.describe()

In [None]:
# Floor predictions at 1. Vectorized `clip` replaces the per-row Python
# `apply(lambda r: max(r, 1))` and gives the same values.
sol['time'] = sol['time'].clip(lower=1)

In [None]:
# Write the submission file without the index column.
sol.to_csv('data/solution_6.csv', index=False)

# Hyperparameter tuning (LightGBM)


In [None]:
# Grid handed to GridSearchCV. Every entry currently holds a single value, so
# the "grid" is one candidate; the trailing notes record values tried before.
# learning_rate and bagging_fraction are intentionally left out of the grid.
parameters = dict(
    task=['predict'],
    boosting=['gbdt'],
    objective=['root_mean_squared_error'],
    num_iterations=[7500],
    num_leaves=[51],          # was 31
    max_depth=[15],           # was 10, 15, 25
    min_data_in_leaf=[15],
    feature_fraction=[0.8],
    bagging_freq=[100],
)

In [None]:
%%time 

# Cross-validated search over `parameters`, starting from lgb_model's settings.
gsearch_lgb = GridSearchCV(
    estimator=lgb_model,
    param_grid=parameters,
    n_jobs=-1,
    verbose=10,
)
gsearch_lgb.fit(X_train, y_train)

In [None]:
# Winning parameter combination from the search.
print(gsearch_lgb.best_params_)

In [None]:
# Hold-out RMSE of the searched model (predict goes through the fitted search object).
preds_lgb_model = gsearch_lgb.predict(X_test)
mse_lgb = mean_squared_error(y_test, preds_lgb_model)
rmse_lgb = np.sqrt(mse_lgb)
print(" RMSE: %f" % rmse_lgb)

In [None]:
# Plot importances of the tuned model. GridSearchCV fits clones, so `lgb_model`
# itself still holds whatever was fit before the search; the refit winner
# lives in `best_estimator_`.
lgb.plot_importance(gsearch_lgb.best_estimator_, height=.5)

In [None]:
# Final LightGBM regressor using the tuned tree settings.
# Removed relative to the earlier version of this cell:
#   * task='predict'           -- a CLI-only setting, meaningless for the Python API;
#   * application='regression' -- an alias of `objective` that conflicted with the
#                                 explicit `objective` below (both mean L2 regression).
# reg_sqrt is now a real boolean instead of the string 'True'.
lgb_model = lgb.LGBMRegressor(
    objective='root_mean_squared_error',
    boosting_type="gbdt",
    learning_rate=0.05,
    tree_learner='feature',
    # NOTE(review): with bagging_fraction=1 no rows are sub-sampled, so
    # bagging_freq presumably has no effect -- confirm whether bagging was intended.
    bagging_fraction=1,
    bagging_freq=100,
    reg_sqrt=True,
    metric='rmse',
    feature_fraction=0.6,
    random_state=42,
    max_depth=15,
    min_data_in_leaf=15,
    num_iterations=7500,
    num_leaves=51,
)

In [None]:
# Refit on ALL labelled rows (train + held-out) before predicting the test file.
# The former `verbose=10` argument was dropped: it only controlled logging for
# an eval_set (none is passed here) and is no longer accepted by `fit` in
# LightGBM 4.x.
lgb_model.fit(X, y)

In [None]:
# NOTE: lgb_model has just been fit on the full X, and X_test is a subset of X,
# so this number is a training-set (in-sample) error, not a hold-out score.
# The label says so explicitly to avoid comparing it with the hold-out RMSEs.
preds_lgb_model = lgb_model.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, preds_lgb_model))
print(" RMSE (in-sample, model fit on all of X): %f" % (rmse_lgb ))

In [None]:
# Predict the test table with the refit model; keep the result as a one-column frame.
pred = lgb_model.predict(df_test)
sol = pd.DataFrame({'time': pred})

In [None]:
# Summary statistics of the predictions.
sol.describe()

In [None]:
# Floor predictions at 2. Vectorized `clip` replaces the per-row Python
# `apply(lambda r: max(r, 2))` and gives the same values.
# NOTE(review): the other submission cells floor at 1 -- confirm 2 is intended here.
sol['time'] = sol['time'].clip(lower=2)

In [None]:
# Write the submission file without the index column.
sol.to_csv('data/solution_8.csv', index=False)

# XGBoost

In [3]:
import xgboost as xgb

In [4]:
# XGBoost regressor with library defaults, apart from the evaluation metric.
regressor = xgb.XGBRegressor(
    eval_metric='rmsle',
)


In [11]:
# Fit on the training split only, so the next cell's RMSE is a hold-out score.
regressor.fit(X_train, y_train)

In [12]:
# Hold-out RMSE of the XGBoost model (`regressor_model` holds the predictions).
regressor_model = regressor.predict(X_test)
mse_regressor = mean_squared_error(y_test, regressor_model)
rmse_regressor = np.sqrt(mse_regressor)
print(" RMSE: %f" % rmse_regressor)

 RMSE: 38.543941


In [13]:
# Refit on all labelled rows before predicting the test file. From here on,
# scores computed on X_test would be in-sample, since X_test is part of X.
regressor.fit(X, y)

In [24]:
# (rows, columns) of the test table.
df_test.shape

(1182903, 21)

In [20]:
# Predict the test table with XGBoost; keep the result as a one-column frame.
pred = regressor.predict(df_test)
sol = pd.DataFrame({'time': pred})

In [21]:
# Summary statistics of the predictions (the recorded output shows a negative minimum).
sol.describe()

Unnamed: 0,time
count,1182903.0
mean,96.19348
std,93.55412
min,-55.70679
25%,22.7886
50%,74.38152
75%,139.5524
max,4374.418


In [22]:
# Floor predictions at 1 (the recorded describe() output shows negative values).
# Vectorized `clip` replaces the per-row Python `apply(lambda r: max(r, 1))`.
# The explicit float64 cast keeps the column dtype that the element-wise apply
# produced, so the values written to CSV are unchanged even if the predictions
# arrive in a narrower float type (presumably float32 for XGBoost -- verify).
sol['time'] = sol['time'].astype('float64').clip(lower=1)

In [23]:
# Write the XGBoost submission file without the index column.
sol.to_csv('data/solution_9.csv', index=False)

In [25]:
# Row count should match the test table.
sol.shape

(1182903, 1)

In [26]:
# Load two earlier submissions for blending.
# NOTE(review): solution_7.csv is not written anywhere in the visible part of
# this notebook (only solution_6/8/9/10 are), so it must already exist on disk;
# record which model/run produced it.
sol7 = pd.read_csv('data/solution_7.csv')

# solution_9.csv is the XGBoost submission written above.
sol9 = pd.read_csv('data/solution_9.csv')

In [27]:
# Sanity check: both submissions must have the same number of rows to be averaged.
sol7.shape

(1182903, 1)

In [29]:
# Sanity check: both submissions must have the same number of rows to be averaged.
sol9.shape

(1182903, 1)

In [None]:
# Display the current in-memory submission frame.
sol

In [33]:
# Start an empty frame that will hold the two submissions side by side.
comb = pd.DataFrame()
comb

In [34]:
# Put both submissions' predictions next to each other, aligned on row index.
for _name, _frame in (('sol7', sol7), ('sol9', sol9)):
    comb[_name] = _frame['time']

In [36]:
# Blend: plain arithmetic mean of the two submissions, row by row.
comb['time'] = comb['sol7'].add(comb['sol9']).div(2)

In [38]:
# Show the rows whose blended prediction is strictly positive.
is_positive = comb['time'] > 0
comb.loc[is_positive]

Unnamed: 0,sol7,sol9,time
0,167.670811,154.735138,161.202975
1,139.029587,143.220840,141.125214
2,204.683241,204.490967,204.587104
3,70.296667,79.003647,74.650157
4,116.412118,283.810455,200.111286
...,...,...,...
1182898,195.022460,201.256516,198.139488
1182899,275.062068,283.909180,279.485624
1182900,4.904554,5.309921,5.107237
1182901,43.747659,50.558861,47.153260


In [40]:
# Write only the blended column as the submission (no index column).
comb[['time']].to_csv('data/solution_10.csv', index=False)

In [41]:
from sklearn.model_selection import GridSearchCV

# Search grid: 2 depths x 3 tree counts x 2 learning rates = 12 candidates,
# each fit 5 times (cv=5) -- a long run on the full training split.
param_grid = {"max_depth":    [4, 5],
              "n_estimators": [500, 600, 700],
              "learning_rate": [0.01, 0.015]}

# Try every combination of the above values.
#  * scoring: rank candidates by RMSE, the metric reported throughout this
#    notebook (without it the estimator's default score, R^2, is used).
#  * verbose: print per-fit progress so a long search is observable instead
#    of looking hung.
search = GridSearchCV(
    regressor,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=2,
).fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)

KeyboardInterrupt: 