#### Read Data

In [24]:
import pandas as pd
import numpy as np
data = pd.read_csv('../dataset/dataset_clean.csv')
X = data[['Multiple_deliveries','Distance','Delivery_person_Age','Festival','Road_traffic_density']]
y  = data['Time_taken_(min)']

In [25]:
X.dtypes

Multiple_deliveries     float64
Distance                float64
Delivery_person_Age     float64
Festival                 object
Road_traffic_density     object
dtype: object

#### Encoder object features

In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])
        
X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])


Multiple_deliveries     float64
Distance                float64
Delivery_person_Age     float64
Festival                  int32
Road_traffic_density      int32
dtype: object

In [27]:
X.shape,y.shape

((41062, 5), (41062,))

#### Determine number of features 

In [28]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Lasso as LS

In [29]:
sfs1 = SFS(
        LS(), 
        k_features = 5, # select k best feature [1 - X.shape[1]]
        forward=True, 
        floating=False, 
        verbose=2,
        scoring='r2',
        cv=10) # cross-valdation
sfs1.fit(X,y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished

[2022-12-15 14:35:06] Features: 1/5 -- score: 0.10236379515983389[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s finished

[2022-12-15 14:35:06] Features: 2/5 -- score: 0.1892761047692788[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished

[2022-12-15 14:35:06] Features: 3/5 -- score: 0.22518806633131278[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

SequentialFeatureSelector(cv=10, estimator=Lasso(), k_features=(5, 5),
                          scoring='r2', verbose=2)

In [30]:
sfs1.k_feature_names_

('Multiple_deliveries',
 'Distance',
 'Delivery_person_Age',
 'Festival',
 'Road_traffic_density')

In [31]:
def data_selection(*feature):
    df = pd.DataFrame()
    for i in np.array(feature):
        df[i] = X[i]
    return df

In [32]:
X = data_selection(sfs1.k_feature_names_)
X

Unnamed: 0,Multiple_deliveries,Distance,Delivery_person_Age,Festival,Road_traffic_density
0,1.0,3.018911,36.0,0,0
1,1.0,13.973178,37.0,0,3
2,1.0,6.058825,25.0,0,0
3,1.0,19.975520,28.0,0,2
4,1.0,12.435540,22.0,0,1
...,...,...,...,...,...
41057,1.0,1.465159,26.0,0,2
41058,1.0,2.979763,28.0,0,2
41059,1.0,13.894367,30.0,0,2
41060,1.0,10.444972,20.0,0,2


#### Split data

In [33]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 43)

In [34]:
X_train.shape,y_train.shape

((32849, 5), (32849,))

In [35]:
X_test.shape,y_test.shape

((8213, 5), (8213,))

### Linear Regression

In [36]:
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(X_train,y_train)
yHat = LR.predict(X_test)



In [37]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print("MSE:",MSE(y_test,yHat))
print("RMSE:",np.sqrt(MSE(y_test,yHat)))
print("MAE:",MAE(y_test,yHat))
print("R2:",R2(y_test,yHat))

MSE: 57.54926423888306
RMSE: 7.5861231362852966
MAE: 6.10309738908545
R2: 0.35788648937635914


### XGBoostRegression

In [38]:
from xgboost import XGBRegressor
XGB = XGBRegressor()
XGB.fit(X_train,y_train)
yHat = XGB.predict(X_test)

In [39]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print("MSE:",MSE(y_test,yHat))
print("RMSE:",np.sqrt(MSE(y_test,yHat)))
print("MAE:",MAE(y_test,yHat))
print("R2:",R2(y_test,yHat))

MSE: 48.37079026837937
RMSE: 6.954911233680799
MAE: 5.482594829579298
R2: 0.46029652400171606


### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression
LgR = LogisticRegression()
LgR.fit(X_train,y_train)
yHat = LgR.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print("MSE:",MSE(y_test,yHat))
print("RMSE:",np.sqrt(MSE(y_test,yHat)))
print("MAE:",MAE(y_test,yHat))
print("R2:",R2(y_test,yHat))

MSE: 90.99001582856447
RMSE: 9.538868687038546
MAE: 7.3401923779374165
R2: -0.015233109720711457


### Lasso Regression

In [42]:
from sklearn.linear_model import Lasso
LS = Lasso(alpha=0.01)
LS.fit(X_train,y_train)
yHat = LS.predict(X_test)



In [43]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
print("MSE:",MSE(y_test,yHat))
print("RMSE:",np.sqrt(MSE(y_test,yHat)))
print("MAE:",MAE(y_test,yHat))
print("R2:",R2(y_test,yHat))

MSE: 57.562456503157655
RMSE: 7.586992586206847
MAE: 6.105976959732253
R2: 0.35773929494670886
