# Feature Engineering

<b>importing datasets</b>

In [1]:
%%time
import pandas as pd
data = pd.read_csv('train.csv',nrows=600000,parse_dates=['pickup_datetime'])

Wall time: 1min 54s


In [2]:
#Copying data to train
train = data.copy()


Dropping any missing values along the rows

In [3]:
train= train.dropna(axis=0, how='any')
print("The number of instances after removal of Nan Values is",(data.shape[0]-train.shape[1]))

The number of instances after removal of Nan Values is 599992


In [4]:
train.isna().sum() #Missing values are removed

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [5]:
train.shape

(599992, 8)

In [6]:
print("The no. of instances having fare_amount greater than 150 is",len(train[train.fare_amount>150]))

The no. of instances having fare_amount greater than 150 is 66


Removing rows with fare_amount that is <b> less than zero or greater than 150 </b> along rows as it doesn't make any more sense in data based on domain knowledge

In [7]:
fare_outlier= train[(train['fare_amount']<0) | (train['fare_amount']>150)]
train = train.drop(fare_outlier.index,axis=0)

#checking for fare_amount that is less than zero and its being sucessfully removed from data
train.loc[train['fare_amount']>150].head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count


It is inferred from the source https://www.flickr.com/places/info/2459115 that New York is bounded by the location cordinates <b>Latitude range is  from 40.568973 to 41.709555 and Longitude range is from -74.263242 to -72.986532 </b>so hence any cordinates not within these cordinates are not considered by us as we are only concerned with dropoffs which are within New York. 

<b>Removing latitude and longitude outliers  along rows</b>

In [8]:
train = train[((train.dropoff_longitude >= -74.263242) & (train.dropoff_longitude <= -72.986532) &\
                      (train.dropoff_latitude >= 40.568) & (train.dropoff_latitude <= 41.709)) & \
                       ((train.pickup_longitude >= -74.26) & (train.pickup_latitude >= 40.568)& \
                       (train.pickup_longitude <= -72.9865) & (train.pickup_latitude <= 41.7095))]

print("The shape of train data  after removing latitude and longitude outliers" , train.shape)

The shape of train data  after removing latitude and longitude outliers (587203, 8)


# Harvesian Formula to calculate distance given location coordinates
haversine(θ) = sin²(θ/2)

Eventually, the formual boils down to the following where φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km) to include latitude and longitude coordinates (A and B in this case).

a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)

c = 2 * atan2( √a, √(1−a) )

d = R ⋅ c
d = Haversine distance
link:https://community.esri.com/groups/coordinate-reference-systems/blog/2017/10/05/haversine-formula

In [9]:
#Creating new column H_Distance
import numpy as np
def haversine_distance(lat1, long1, lat2, long2):
    data = [train]
    for i in data:
        R = 6371  #radius of earth in kilometers
        #R = 3959 #radius of earth in miles
        phi1 = np.radians(i[lat1])
        phi2 = np.radians(i[lat2])
    
        delta_phi = np.radians(i[lat2]-i[lat1])
        delta_lambda = np.radians(i[long2]-i[long1])
    
        #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    
        #c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        d = (R * c) #in kilometers
        i['Distance_in_kms'] = d
    return d
haversine_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

0         1.030764
1         8.450134
2         1.389525
3         2.799270
4         1.999157
            ...   
599995    0.731451
599996    1.923936
599997    5.226628
599998    3.396247
599999    4.559847
Length: 587203, dtype: float64

In [10]:
train.head(5)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Distance_in_kms
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,1.030764
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,8.450134
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,1.389525
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,2.79927
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,1.999157


<b>Creating new columns as year, month ,date , hour,  day of week </b>


In [11]:
all_data = [train]
for i in all_data:
    i['Year'] = i['pickup_datetime'].dt.year
    i['Month'] = i['pickup_datetime'].dt.month
    i['Date'] = i['pickup_datetime'].dt.day
    i['Day of Week'] = i['pickup_datetime'].dt.dayofweek
    i['Hour'] = i['pickup_datetime'].dt.hour

train.columns #Manual checking

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'Distance_in_kms', 'Year', 'Month', 'Date',
       'Day of Week', 'Hour'],
      dtype='object')

<b>Dropping  where pickup latitude and pickup longitude are 0 but dropoff latitude and longitude are not 0, but the fare is 0
 and vice versa along rows</b>

In [12]:
train = train.drop(train.loc[((train['pickup_latitude']==0) & (train['pickup_longitude']==0))&\
                             ((train['dropoff_latitude']!=0) & (train['dropoff_longitude']!=0)) 
                             & (train['fare_amount']==0)].index, axis=0)

train=train.drop(train.loc[((train['pickup_latitude']!=0) & (train['pickup_longitude']!=0))&\
          ((train['dropoff_latitude']==0) & (train['dropoff_longitude']==0)) 
          & (train['fare_amount']==0)].index,axis=0)

In [13]:
#counting number rows in distances_in_kms column with values zero
print("THe no. of rows with distance zero values is ",len(train[train['Distance_in_kms']==0]))

THe no. of rows with distance zero values is  6253


<b> Observation</b>

We can see a few rows with distance =0. This could be due to 2 reasons

The cab waited the whole time and the passenger eventually cancelled. That's why the pickup and drop co-ordinates are the same and maybe, the passenger was charged for the waiting time.
The pickup and drop co-ordinates were not entered. In other words, these are missing values and need to ne imputed.

With Google search i found formula to calculate missing values

$$2.5 base-price + $1.56/km --> 6AM to 8PM Mon-Fri

$$3.0 base-price + $1.56/km --> 8PM to 6AM Mon-Fri and Sat&Sun

# Handling Missing Values

In [14]:
#Dropping rows where fare_amount and Distance_in_kms are both zero along rows
train = train.drop(train[(train['Distance_in_kms']==0)&(train['fare_amount']==0)].index, axis = 0)

In [15]:
#Dropping rows Between 6AM and 8PM on Mon-Fri with Distance_in_kms is zero and fare is less than base price $2.5 along rows
rush_hour = train.loc[(((train['Hour']>=6)&(train['Hour']<=20)) &\
                       ((train['Day of Week']>=1) & (train['Day of Week']<=5)) & 
                       (train['Distance_in_kms']==0) & 
                       (train['fare_amount'] < 2.5))]
rush_hour
train=train.drop(rush_hour.index, axis=0)

In [16]:
#Dropping rows Between 8PM and 6AM on Mon-Fri with Distance_in_kms is zero and fare is less than base price $3.0 along rows
non_rush_hour = train.loc[(((train['Hour']<6)|(train['Hour']>20)) &
                           ((train['Day of Week']>=1)&(train['Day of Week']<=5)) & 
                           (train['Distance_in_kms']==0) & 
                           (train['fare_amount'] < 3.0))]
train=train.drop(non_rush_hour.index, axis=0)

In [17]:
#Dropping rows for Saturday and Sunday all hours with Distance_in_kms is zero and fare is less than base price $3.0 along rows
weekends = train.loc[((train['Day of Week']==0) | (train['Day of Week']==6)) &\
                     (train['Distance_in_kms']==0) & 
                     (train['fare_amount'] < 3.0)]
train=train.drop(weekends.index, axis=0)

<b>Fare is 0, but Distance is not 0</b>. These values need to be imputed. we shall use the following formula

fare = 2.5 + 1.56(Distance_in_kms)


In [18]:
#Creating data with distance not zero but fare is zero
calculate_fare = train.loc[(train['Distance_in_kms']!=0) & (train['fare_amount']==0)]
print("The number of rows with distance is non-zero but fare is zero is ",len(calculate_fare))

#Calculation of fare_amount based on given distance using formula
calculate_fare['fare_amount']=calculate_fare.apply(lambda x:((x.loc['Distance_in_kms']*1.56)+2.50),axis=1)

#Updating modified rows in original train dataset
train.update(calculate_fare)

The number of rows with distance is non-zero but fare is zero is  13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<b>Fare is not 0, but Distance is 0 </b>. These values need to be imputed using formula

distance = (fare_amount - 2.5)/1.56

In [19]:
#Creating data with distance zero but fare is not zero
calculate_distance= train.loc[(train['Distance_in_kms']==0) & (train['fare_amount']>3.0)]
print("The number of rows with distance is zero but fare is greater then base price $3 is ",len(calculate_distance))

#Calculating Distance based on given price using formula
calculate_distance['Distance_in_kms'] = calculate_distance.apply(lambda row: ((row['fare_amount']-2.50)/1.56), axis=1)

#Updating modified rows in original train dataset
train.update(calculate_distance)

The number of rows with distance is zero but fare is greater then base price $3 is  5712


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
print("The no. of rows removed after preprocessing is ",(data.shape[0]-train.shape[0]))
print("The original no.of rows and columns in actual dataset is " ,data.shape)
print("the no.of rows and columns in after data preprocessing is " ,train.shape)

The no. of rows removed after preprocessing is  13013
The original no.of rows and columns in actual dataset is  (600000, 8)
the no.of rows and columns in after data preprocessing is  (586987, 14)


Removing key column and pickup_datatime as it doesn't make sense todata as key is referred as identity

In [21]:
#removing features along the rows
train = train.drop(['key','pickup_datetime'],axis=1)

In [22]:
train.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
       'Distance_in_kms', 'Year', 'Month', 'Date', 'Day of Week', 'Hour'],
      dtype='object')

# Feature Selection

In [23]:
train.corr(method = 'pearson')

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Distance_in_kms,Year,Month,Date,Day of Week,Hour
fare_amount,1.0,0.385264,-0.194624,0.301689,-0.165369,0.016453,0.837565,0.119014,0.026241,0.002025,0.003856,-0.019704
pickup_longitude,0.385264,1.0,0.125869,0.388257,0.132394,0.001164,0.437031,0.002388,0.005622,7.1e-05,-0.023192,0.017251
pickup_latitude,-0.194624,0.125869,1.0,0.149409,0.465461,-0.008012,-0.144887,-0.019741,-0.005528,-0.001722,-0.037123,0.02892
dropoff_longitude,0.301689,0.388257,0.149409,1.0,0.225522,-0.001528,0.349265,-0.000887,0.0039,0.002425,-0.000411,-0.042557
dropoff_latitude,-0.165369,0.132394,0.465461,0.225522,1.0,-0.004921,-0.124536,-0.011963,-0.004816,-0.001029,-0.02891,0.020097
passenger_count,0.016453,0.001164,-0.008012,-0.001528,-0.004921,1.0,0.010592,0.006048,0.005174,0.004385,0.037319,0.016179
Distance_in_kms,0.837565,0.437031,-0.144887,0.349265,-0.124536,0.010592,1.0,0.015896,0.013555,0.002405,0.015384,-0.030437
Year,0.119014,0.002388,-0.019741,-0.000887,-0.011963,0.006048,0.015896,1.0,-0.117803,-0.009765,0.009265,0.002355
Month,0.026241,0.005622,-0.005528,0.0039,-0.004816,0.005174,0.013555,-0.117803,1.0,-0.016281,-0.008687,-0.003581
Date,0.002025,7.1e-05,-0.001722,0.002425,-0.001029,0.004385,0.002405,-0.009765,-0.016281,1.0,0.007365,0.001888


<b>Creating train and test data and labels for training model</b>

In [24]:
train_data = train.iloc[ :len(train)-10000,:]

train_label = train_data['fare_amount']
train_data.drop('fare_amount',axis=1,inplace = True)

test_data = train.iloc[len(train)-10000:,:]
test_label = test_data['fare_amount']
test_data.drop('fare_amount',axis=1,inplace = True)



print("The no. of rows and cols  train dataset is ", train_data.shape)
print("The no. of rows  and cols  test dataset is", test_data.shape)
print("The no. of labels in train dataset ",train_label.shape)
print("The no. of labels in train dataset ",test_label.shape)

The no. of rows and cols  train dataset is  (576987, 11)
The no. of rows  and cols  test dataset is (10000, 11)
The no. of labels in train dataset  (576987,)
The no. of labels in train dataset  (10000,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [25]:
train_data.columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count', 'Distance_in_kms', 'Year',
       'Month', 'Date', 'Day of Week', 'Hour'],
      dtype='object')

In [26]:
test_data.columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count', 'Distance_in_kms', 'Year',
       'Month', 'Date', 'Day of Week', 'Hour'],
      dtype='object')

# BenchMark Model
As the Gradient boosting was winner of this kaggle competition.The <b>benchmark Model that i selected was LightGBM.,</b>
LightGBM is an open-source framework for gradient boosted machines. By default LightGBM will train a Gradient Boosted Decision Tree (GBDT), but it also supports random forests.

The framework is fast and was designed for distributed training. It supports large-scale datasets and training on the GPU. In many cases LightGBM has been found to be more accurate and faster than XGBoost, though this is problem dependent.

Link https://www.avanwyk.com/an-overview-of-lightgbm/


Although we use other algorithms that are mentioned below

Linear Regression

DecisionTreeRegressor

RandomForestRegressor

XGBoost

<b>importing sklearn libraries </b>

In [27]:
#importing libraries
import math
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
import xgboost as xgb
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error

<b>Applying LightGBM</b>

In [28]:
#!pip install lightgbm
import lightgbm as lgbm
#Defing parameters

param_lgbm={'boosting_type':'gbdt',
    'learning_rate': 0.04,
 'max_depth': 10,
 'min_child_weight': 4,
 'n_estimators': 100,
 'n_jobs': -1,
 'objective': 'regression',
 'random_state': 42,
 'reg_lambda': 0.003}


train_set = lgbm.Dataset(train_data, train_label, silent=True)
#training data
model = lgbm.train(param_lgbm, train_set = train_set, num_boost_round=300)
#predicting test data
pred_lgbm = model.predict(test_data)
lgbm_rmse = math.sqrt(mean_squared_error(test_label,pred_lgbm))
print("The root Mean square error for lightgbm is", lgbm_rmse)





The root Mean square error for lightgbm is 3.951365261437285


# Other Models

# Linear Regression
Linear regression attempts to model the relationship between two variables by fitting a linear equation to observed data.
A linear regression line has an equation of the form Y = a + bX, where X is the explanatory variable and Y is the dependent variable. The slope of the line is b, and a is the intercept (the value of y when x = 0).

Advantages:

Simple and easy to understand.

Cheap computational cost.

Ground for more complex machine learning algorithms.

Disadvantages

Prone to Outliers

Poor performance if data is non-linear


<b> Applying Linear Regression</b>

In [29]:
#create regressor object
linear_reg = LinearRegression(normalize = True)

#fitting the train data
linear_reg.fit(train_data,train_label)

#Predicting test_data and printing the rmse score
lin_reg_pred = linear_reg.predict(test_data)
linear_rmse = math.sqrt(mean_squared_error(test_label,lin_reg_pred))
print('The root mean square error of Linear Regression is', linear_rmse)

The root mean square error of Linear Regression is 5.485041813265747


# DecisionTreeRegressor
Decision trees are predictive models that use a set of binary rules to calculate a target value. Each individual tree is a fairly simple model that has branches, nodes and leaves.

Advantages

It can be used for both Classification and Regression problems
Easy to Understand, Interpret, Visualise

Disadvantages; 
Over fitting: Over fitting is one of the most practical difficulty for decision tree models. This problem gets solved by setting constraints on model parameters and pruning (discussed in detailed below).

Not fit for continuous variables: While working with continuous numerical variables, decision tree looses information when it categorizes variables in different categories.
Cannot extrapolate.

Link to detaled explanation: https://gdcoder.com/decision-tree-regressor-explained-in-depth/

<b> Applying DecisionTreeRegressor </b>

In [30]:
# create a regressor object 
regressor = DecisionTreeRegressor(min_samples_leaf = 6, min_samples_split = 2,splitter = 'best', random_state = 0)  
  
# fit the regressor with X and Y data 
regressor.fit(train_data, train_label) 

#predicting test_data and showing rmse score
y_pred_dec_tree = regressor.predict(test_data)
decisiontree_rmse = math.sqrt(mean_squared_error(test_label , y_pred_dec_tree))
print('The rmse error of Decision tree regressor is ',decisiontree_rmse)

The rmse error of Decision tree regressor is  4.313312548010092


# RandomForest Regressor

Random Forest is an ensemble machine learning technique capable of performing both regression and classification tasks using multiple decision trees and a statistical technique called bagging.Random forest builds multiple decision trees and merge their predictions together to get a more accurate and stable prediction rather than relying on individual decision trees.

Advantages

Reduction in overfitting: by averaging several trees, there is a significantly lower risk of overfitting.

It is very easy to measure the relative importance of each feature on the prediction

Link https://gdcoder.com/random-forest-regressor-explained-in-depth/


<b>Applying RandomForestRegressor on train data</b>

In [31]:
%%time
#Create predictor regressor
rf = RandomForestRegressor(n_estimators = 20, max_depth = 16, max_features = None, oob_score = True, 
                                      bootstrap = True, verbose = 1, n_jobs = -1,random_state=40)

#training data
rf.fit(train_data, train_label)

#predicting result and printing rmse score
rf_predict = rf.predict(test_data)

rf_rmse = math.sqrt(mean_squared_error(test_label,rf_predict))

print("The root mean square error for Random Forest Regressor is ", rf_rmse)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   52.5s finished


The root mean square error for Random Forest Regressor is  3.8761570926860993
Wall time: 55.4 s


  warn("Some inputs do not have OOB scores. "
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.0s finished


# XGBoost

XGBoost is an implementation of Gradient Boosting Machines (GBM) and is used for supervised learning.

The features that standout are Speed, Awareness of sparse data, distributed systems and out-of-core-computation and Parallelization

<b>Applying XGBoost model</b>

In [32]:
dtrain = xgb.DMatrix(train_data, label=train_label)
dtest = xgb.DMatrix(test_data)
#set parameters for xgboost
params = {'max_depth':6,
          'eta':1,
          'silent':1,
          'objective':'reg:linear',
          'eval_metric':'rmse',
          'learning_rate':0.05
         }

num_rounds = 50
#training data
xb = xgb.train(params, dtrain, num_rounds)
#predicting test result
y_pred_xgb = xb.predict(dtest)
xgb_rmse = math.sqrt(mean_squared_error(test_label,y_pred_xgb))
print('The root mean square error of xgBoost is',xgb_rmse)

The root mean square error of xgBoost is 4.089037110077677


In [33]:
all_model_score = pd.DataFrame({ "Algorithms" :['XgBoost','Linear_regression','Decision_tree_reg',
                                                  'Random_forest_reg','(BenchMark Model)Lightgbm'],
                                     "RMSE error": [xgb_rmse,linear_rmse,decisiontree_rmse,rf_rmse,lgbm_rmse]
                                 })
all_model_score

Unnamed: 0,Algorithms,RMSE error
0,XgBoost,4.089037
1,Linear_regression,5.485042
2,Decision_tree_reg,4.313313
3,Random_forest_reg,3.876157
4,(BenchMark Model)Lightgbm,3.951365


Observation: Based on rmse score of all model ,RandomForest outperforms BenchMark Model.But Still i decided to choose Benchmark Model(LightGBM) for tunning it with gridsearchCv to get best parameters.

# Model Refinement


<b>HyperParameter Tunning for BenchMark Model(LightGBM) Using GridSearchCV</b>

In [34]:
%%time
params1 = {
    'max_depth': [5,10,-1],
    'learning_rate': [0.01, 0.03, 0.05],
    'min_child_weight': [1,3,5],
    'reg_lambda': [0.001, 0.002, 0.003],
    'n_estimators':[50,100],
    # fixed params
    'n_jobs': [-1],
    'objective': ['regression'],
    'random_state': [42],
    'min_split_gain':[0.5]
}
clf = lgbm.LGBMRegressor(scoring='rmse')
lgb = GridSearchCV(clf, params1, cv=5, n_jobs=-1)

lgb.fit(train_data, train_label)

Wall time: 42min 42s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda...
                                     subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.03, 0.05],
                         'max_depth': [5, 10, -1],
                         'min_child_weight': [1, 3, 5], 'min_split_gain': [0.5],
                         'n_estimators': [50, 100], 'n_jobs': [-1],
   

In [35]:
#Printing the best Parameters of LightGBM
lgb.best_params_

{'learning_rate': 0.05,
 'max_depth': -1,
 'min_child_weight': 1,
 'min_split_gain': 0.5,
 'n_estimators': 100,
 'n_jobs': -1,
 'objective': 'regression',
 'random_state': 42,
 'reg_lambda': 0.001}

<b>Lightgbm Final model Evaluation with tunned parameters</b>

In [36]:
#defining parameters
params1 = {
    'boosting_type':'gbdt',
    'max_depth': -1,
    'learning_rate': 0.05,
    'min_child_weight': 1,
    'reg_lambda': 0.001,
    'n_jobs': [-1],
    'objective': ['regression'],
    'random_state': [42],
    'min_split_gain': 0.5,
    'min_child_samples': 10
}


train_set = lgbm.Dataset(train_data, train_label, silent=True)
#training data
model = lgbm.train(params1, train_set = train_set, num_boost_round=300)
#Predicting result
tunedlgbm_predict = model.predict(test_data)
tunedlgbm = math.sqrt(mean_squared_error(test_label,tunedlgbm_predict))
print("The root mean square error after using tunned parameters is",tunedlgbm)

The root mean square error after using tunned parameters is 3.8007249025774894


# Conclusion

LightGBM performs better among the other models.

The Tuned LightGBM RMSE score(3.800724) was quite less than other models after tunning its hyperparameters

No. of features used in model is 11

We can Further refine models to get good score 

