In [None]:
import logging
import math
import os
import time
import warnings

import folium
import googlemaps
import matplotlib.pyplot as plt
import mpld3
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from catboost import CatBoostRegressor
from folium.plugins import MarkerCluster
from joblib import Memory
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBRegressor

%matplotlib inline


In [None]:
# Load the trip table. low_memory=False parses the file in a single pass so
# that column dtypes are inferred consistently instead of chunk by chunk.
taxi = pd.read_csv(
    'curve.csv',
    low_memory=False,
)

In [None]:
def safe_msle(y_true, y_pred):
    """Mean squared logarithmic error with negative predictions floored at zero.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values; anything below zero is replaced by zero
        before the metric is computed.

    Returns
    -------
    The value returned by ``mean_squared_log_error(y_true, clipped_pred)``.
    """
    # Lower bound 0, no upper bound.
    non_negative_pred = np.clip(y_pred, 0, None)
    return mean_squared_log_error(y_true, non_negative_pred)

# greater_is_better=False makes scikit-learn negate the metric, so model
# selection (which maximises the score) ends up minimising the MSLE.
scorer = make_scorer(safe_msle, greater_is_better=False)

# Columns removed from the feature matrix (this includes the target, 'DURATION').
# Each label is listed once; the resulting frame is the same as dropping a
# list that repeats some of them.
excluded_columns = [
    'DESTINATIONDATETIME_TR', 'DESTINATIONCITY', 'DESTINATIONSTATE',
    'ORIGINCITY', 'ORIGINSTATE',
    'Unnamed: 0', 'DURATION', 'Duration(m)', 'datetime', 'roads', 'MILEAGE',
    'duration', 'distance', 'steps', 'precipprob', 'preciptype', 'time', 'Time',
    'snowfall', 'precipitation', 'name', 'snow', 'snowdepth',
    'duration(OSRM)', 'distance(OSRM)', 'steps(OSRM)',
    'CenterLat', 'CenterLong', 'Distance', 'Congested_roads',
    'Day_Monday', 'Day_Saturday', 'Day_Sunday',
    'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday',
    'Steps^2', 'Distance^2',
    'Dir_NorthWest', 'Dir_SouthEast', 'Dir_SouthWest',
    'Unnamed: 0.1',
]
X = taxi.drop(columns=excluded_columns)

y = taxi['DURATION']

# 80/20 shuffled split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
## Best models
# Hide UserWarnings raised from xgboost and raise the "lightgbm" logger
# threshold to ERROR.
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
logging.getLogger("lightgbm").setLevel(logging.ERROR)

# Column groups are derived from the dtypes of the training frame.
num_features = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_features = X_train.select_dtypes(include=["object"]).columns

# Numeric columns: mean-impute, then standardise.
num_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Object columns: fill gaps with the literal "Unknown", then one-hot encode
# (at most 20 categories per column; unseen categories are ignored).
cat_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", max_categories=20)),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_preprocessor, num_features),
    ("cat", cat_preprocessor, cat_features),
])

# On-disk joblib cache handed to the Ridge pipeline below.
memory = Memory(location='cachedir', verbose=0)
# NOTE(review): presumably set to quiet macOS malloc stack-logging output -- confirm.
os.environ["MallocStackLogging"] = "0"

# Ridge pipeline with step caching; it is not used further in this cell.
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", Ridge())],
    memory=memory,
)

# Candidate estimators and their hyper-parameter grids, keyed by model name.
models = {
    'LinearRegression': Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", LinearRegression()),
    ]),
}
param_grids = {
    'LinearRegression': {},  # empty grid: a single candidate, default settings
}

# Notes kept from LightGBM runs (that search is not part of this cell):
#LightGBM: RMSLE = 0.0466, Best Params = {'model__feature_fraction': 1.0, 'model__learning_rate': 0.2, 'model__n_estimators': 500, 'model__num_leaves': 100}
#LightGBM: RMSLE = 0.0465, Best Params = {'model__max_depth': 20, 'model__num_leaves': 2000}
grid = GridSearchCV(
    estimator=models['LinearRegression'],
    param_grid=param_grids['LinearRegression'],
    scoring=scorer,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# best_score_ is the negated mean MSLE; flip the sign and take the root.
results = dict(
    best_score=np.sqrt(-grid.best_score_),
    best_params=grid.best_params_,
)

In [None]:
# Predict on the held-out split with the estimator refit by the grid search.
pred = grid.predict(X=X_test)

In [None]:
# Held-out MSLE (not square-rooted, unlike results['best_score']).
safe_msle(y_true=y_test, y_pred=pred)

In [None]:
# Overlay the distribution of predicted durations on the held-out actuals and
# write the figure to a JPEG file.
jpeg_output_path = 'final_pred.jpeg'

fig, ax = plt.subplots(figsize=(6, 4))

# `pred` comes from a model fitted on `y_train`, i.e. on the same scale as
# `y_test`, so it is plotted as-is. Exponentiating it would put the two
# histograms on different scales (and overflow for second-scale values).
# stat="density" normalises both samples so the y-axis matches its label.
sns.histplot(pred, kde=True, stat="density", color='#4394E5',
             label='Predicted(Model)', ax=ax)
sns.histplot(y_test, kde=True, stat="density", color='#F5921B',
             label='Actual', ax=ax)

ax.set_xlim(0, 3000)
ax.set_title("Duration Length Distributions")
ax.set_xlabel("Duration (seconds)")
ax.set_ylabel("Density")
ax.legend()
fig.tight_layout()  # Adjust layout to ensure everything fits
fig.savefig(jpeg_output_path, format='jpeg')  # Save before showing