In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import OneHotEncoder , LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [14]:
#For Rgression types of Prolem
from prettytable import PrettyTable
def print_stats_metrices_regression(algorithm_name, r2, variance, mse, max_error):
    x = PrettyTable()
    x.field_names = ["Algorithm","R2", "Variance", "RMSE" , "Max_Error"]    
    x.add_row([algorithm_name,r2,variance,mse,max_error]);
    print(x)
    


In [7]:
df_original=pd.read_csv("yellow_tripdata_2009-01.csv",nrows=250000)
df_modified=df_original.drop(['Rate_Code','store_and_forward','surcharge', 'mta_tax', 'Tip_Amt', 'Tolls_Amt',
       'Total_Amt'],axis=1)

df_modified=df_modified[(df_modified["Passenger_Count"]> 0) & (df_modified["Passenger_Count"]< 6)]
df_modified=df_modified[df_modified["Trip_Distance"] >= 1]
df_modified=df_modified[df_modified["Fare_Amt"]<=50]

df_modified=df_modified[(df_modified.Start_Lat <= np.percentile(df_modified.Start_Lat,98)) & (df_modified.Start_Lat >= np.percentile(df_modified.Start_Lat,2))]
df_modified=df_modified[(df_modified.Start_Lon <= np.percentile(df_modified.Start_Lon,98)) & (df_modified.Start_Lon >= np.percentile(df_modified.Start_Lon,2))]
df_modified=df_modified[(df_modified.End_Lat <= np.percentile(df_modified.End_Lat,98)) & (df_modified.End_Lat >= np.percentile(df_modified.End_Lat,2))]
df_modified=df_modified[(df_modified.End_Lon <= np.percentile(df_modified.End_Lon,98)) & (df_modified.End_Lon >= np.percentile(df_modified.End_Lon,2))]


df_modified['Hour']=pd.to_datetime(df_modified['Trip_Pickup_DateTime']).dt.hour
df_modified['Weekday']=pd.to_datetime(df_modified['Trip_Pickup_DateTime']).dt.weekday


In [8]:
preprocess = make_column_transformer(
    (['Trip_Distance', 'Hour'], MinMaxScaler()),
    (['vendor_name', 'Payment_Type'], OneHotEncoder())
)


X=df_modified[['Trip_Distance', 'Hour','vendor_name', 'Payment_Type','Passenger_Count']]
Y=df_modified.Fare_Amt




In [28]:
#Linear Regression
linReg=LinearRegression()
pipe_linear=make_pipeline(preprocess,linReg)

r2=cross_val_score(pipe_linear,X,Y, cv=5,scoring='r2').mean()
variance=cross_val_score(pipe_linear,X,Y, cv=5,scoring='explained_variance').mean()
mse=cross_val_score(pipe_linear,X,Y, cv=5,scoring='neg_mean_squared_error').mean()
max_error=cross_val_score(pipe_linear,X,Y, cv=5,scoring='max_error').mean()

print_stats_metrices_regression("Linear_Regression",r2,variance,mse,max_error)

#Ridge
ridge=Ridge(alpha=0.01)
pipe_linear=make_pipeline(preprocess,ridge)

r2=cross_val_score(pipe_linear,X,Y, cv=5,scoring='r2').mean()
variance=cross_val_score(pipe_linear,X,Y, cv=5,scoring='explained_variance').mean()
mse=cross_val_score(pipe_linear,X,Y, cv=5,scoring='neg_mean_squared_error').mean()
max_error=cross_val_score(pipe_linear,X,Y, cv=5,scoring='max_error').mean()

print_stats_metrices_regression("Ridge_Regression",r2,variance,mse,max_error)


laso=Lasso(alpha=0.01, max_iter=100)
pipe_linear=make_pipeline(preprocess,laso)

r2=cross_val_score(pipe_linear,X,Y, cv=5,scoring='r2').mean()
variance=cross_val_score(pipe_linear,X,Y, cv=5,scoring='explained_variance').mean()
mse=cross_val_score(pipe_linear,X,Y, cv=5,scoring='neg_mean_squared_error').mean()
max_error=cross_val_score(pipe_linear,X,Y, cv=5,scoring='max_error').mean()

print_stats_metrices_regression("Lasso_Regression",r2,variance,mse,max_error)


ele=ElasticNet(alpha=0.01)
pipe_linear=make_pipeline(preprocess,ele)

r2=cross_val_score(pipe_linear,X,Y, cv=5,scoring='r2').mean()
variance=cross_val_score(pipe_linear,X,Y, cv=5,scoring='explained_variance').mean()
mse=cross_val_score(pipe_linear,X,Y, cv=5,scoring='neg_mean_squared_error').mean()
max_error=cross_val_score(pipe_linear,X,Y, cv=5,scoring='max_error').mean()

print_stats_metrices_regression("Elastic_Regression",r2,variance,mse,max_error)






+-------------------+--------------------+-------------------+---------------------+-------------------+
|     Algorithm     |         R2         |      Variance     |         RMSE        |     Max_Error     |
+-------------------+--------------------+-------------------+---------------------+-------------------+
| Linear_Regression | 0.8449891893791743 | 0.844992418472804 | -2.4626703567147805 | -85.9236753074216 |
+-------------------+--------------------+-------------------+---------------------+-------------------+
+------------------+--------------------+--------------------+---------------------+--------------------+
|    Algorithm     |         R2         |      Variance      |         RMSE        |     Max_Error      |
+------------------+--------------------+--------------------+---------------------+--------------------+
| Ridge_Regression | 0.8449892359219338 | 0.8449924655204522 | -2.4626697337560826 | -85.91894636672876 |
+------------------+--------------------+----------