In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split


In [5]:
# Load the raw NYC taxi trips CSV into a DataFrame.
# NOTE(review): the absolute /content/drive path presumably requires Google Drive
# to be mounted in Colab before this cell runs — confirm.
nyc_taxi_data = pd.read_csv('/content/drive/MyDrive/CMPE_255_DataPrep/Task3/nyc_taxi.csv')

In [6]:
# Preview the loaded frame; pandas abbreviates the display to the leading and trailing rows.
nyc_taxi_data

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.756680,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.986160,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.956070,40.771900,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.961510,40.755890,N
...,...,...,...,...,...,...,...,...,...
625129,id3008929,1,2016-01-01 00:02:52,1,-74.003464,40.725105,-74.001251,40.733643,N
625130,id3700764,1,2016-01-01 00:01:52,1,-74.006363,40.743782,-73.953407,40.782467,N
625131,id2568735,1,2016-01-01 00:01:24,2,-73.972267,40.759865,-73.876602,40.748665,N
625132,id1384355,1,2016-01-01 00:00:28,1,-73.976501,40.733562,-73.854263,40.891788,N


In [16]:
# Synthesize a fare_amount column: one uniform draw from [$3, $60) per trip.
# The draws do not depend on any other column, so fare_amount carries no
# learnable signal; it only serves as a placeholder regression target.
np.random.seed(0)  # fixed seed so the same fares are generated on every run
nyc_taxi_data['fare_amount'] = np.random.uniform(low=3, high=60, size=len(nyc_taxi_data))

# Show the first five synthetic fares as a quick sanity check
nyc_taxi_data.loc[:, ['fare_amount']].head()


Unnamed: 0,fare_amount
0,34.28237
1,43.765794
2,37.357512
3,34.058341
4,27.148324


In [17]:
# Re-display the frame to confirm fare_amount was appended as the last column.
nyc_taxi_data

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,fare_amount
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.756680,N,34.282370
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,43.765794
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.986160,40.729523,N,37.357512
3,id2150126,2,2016-06-30 23:59:41,1,-73.956070,40.771900,-73.986427,40.730469,N,34.058341
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.961510,40.755890,N,27.148324
...,...,...,...,...,...,...,...,...,...,...
625129,id3008929,1,2016-01-01 00:02:52,1,-74.003464,40.725105,-74.001251,40.733643,N,41.546294
625130,id3700764,1,2016-01-01 00:01:52,1,-74.006363,40.743782,-73.953407,40.782467,N,24.568487
625131,id2568735,1,2016-01-01 00:01:24,2,-73.972267,40.759865,-73.876602,40.748665,N,48.515626
625132,id1384355,1,2016-01-01 00:00:28,1,-73.976501,40.733562,-73.854263,40.891788,N,38.936262


In [18]:
# Convert 'pickup_datetime' to pandas datetime format so the `.dt` accessors
# used for feature extraction further down are available.
nyc_taxi_data['pickup_datetime'] = pd.to_datetime(nyc_taxi_data['pickup_datetime'])


In [19]:
# Bounding box used to decide whether a coordinate lies in the NYC area:
# (min, max) latitude and (min, max) longitude.
lat_bounds, lon_bounds = (40.5, 40.9), (-74.3, -73.7)

In [20]:
# Keep a trip only when both its pickup and its dropoff point fall inside the
# NYC bounding box (`between` includes both end points).
pickup_in_box = (
    nyc_taxi_data['pickup_latitude'].between(*lat_bounds)
    & nyc_taxi_data['pickup_longitude'].between(*lon_bounds)
)
dropoff_in_box = (
    nyc_taxi_data['dropoff_latitude'].between(*lat_bounds)
    & nyc_taxi_data['dropoff_longitude'].between(*lon_bounds)
)
filtered_nyc_taxi_data = nyc_taxi_data[pickup_in_box & dropoff_in_box]

In [21]:
# Discard trips that report a passenger count of zero
has_passengers = filtered_nyc_taxi_data['passenger_count'].ne(0)
filtered_nyc_taxi_data = filtered_nyc_taxi_data.loc[has_passengers]


In [22]:
# Extract hour, day of the week, and month from 'pickup_datetime'.
# The frame is the result of boolean filtering, so writing columns into it with
# `frame[col] = ...` can raise pandas' SettingWithCopyWarning. `.assign` builds
# and returns a new frame instead, so no write ever targets a filtered slice.
filtered_nyc_taxi_data = filtered_nyc_taxi_data.assign(
    pickup_hour=lambda trips: trips['pickup_datetime'].dt.hour,
    pickup_dayofweek=lambda trips: trips['pickup_datetime'].dt.dayofweek,  # Monday=0 ... Sunday=6
    pickup_month=lambda trips: trips['pickup_datetime'].dt.month,
)


In [23]:
# Calculate Haversine distance
def haversine_distance(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Return the great-circle (haversine) distance between two points.

    All four coordinates are given in decimal degrees and may be scalars,
    NumPy arrays, or pandas Series; array-like inputs are processed
    element-wise.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : longitude and latitude of the second point(s), in degrees.
    radius_km : sphere radius; the result is expressed in the same unit.
        Defaults to 6371, the radius this function originally hard-coded,
        so existing four-argument calls are unaffected.

    Returns
    -------
    The distance(s) along the sphere's surface, in the unit of ``radius_km``.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1], which
    # would make sqrt/arcsin return NaN; clamp it to the valid domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius_km

In [24]:
# Great-circle pickup-to-dropoff distance for every trip, stored in kilometres
filtered_nyc_taxi_data['trip_distance_km'] = haversine_distance(
    lon1=filtered_nyc_taxi_data['pickup_longitude'],
    lat1=filtered_nyc_taxi_data['pickup_latitude'],
    lon2=filtered_nyc_taxi_data['dropoff_longitude'],
    lat2=filtered_nyc_taxi_data['dropoff_latitude'],
)


In [25]:
# One-hot encode 'store_and_fwd_flag'. With drop_first=True the indicator for
# the first category is omitted, so a two-valued flag becomes a single column.
# NOTE(review): the source column is consumed here, so re-running this cell on
# the already-encoded frame would presumably raise a KeyError — confirm.
filtered_nyc_taxi_data = pd.get_dummies(filtered_nyc_taxi_data, columns=['store_and_fwd_flag'], drop_first=True)


In [26]:
# Split data for modeling.
# Besides the target and the raw timestamp, the per-trip `id` string is left
# out of X: it is a row identifier (a different value in every row shown), not
# a numeric feature an estimator could fit on.
X = filtered_nyc_taxi_data.drop(columns=['fare_amount', 'pickup_datetime', 'id'])
y = filtered_nyc_taxi_data['fare_amount']
# 80/20 train/test split; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [27]:
pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.2.1-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.2.1


In [28]:
import sweetviz as sv

In [29]:
# Profile the prepared frame with sweetviz and write the result out as an HTML report
report = sv.analyze(filtered_nyc_taxi_data)
report.show_html(
    '/content/drive/MyDrive/CMPE_255_DataPrep/Task3/sweetviz_report.html',
    open_browser=True,
    layout='widescreen',
    scale=1.0,
)

                                             |          | [  0%]   00:00 -> (? left)

Report /content/drive/MyDrive/CMPE_255_DataPrep/Task3/sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [31]:
pip install h2o

Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=90309d475b2b009076d871302cd0163be41491b81762a7d4c8ec12cc2757ef78
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [32]:
import h2o
# Connect to an H2O instance on localhost, starting a local server if none is running
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp64lj85nn
  JVM stdout: /tmp/tmp64lj85nn/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp64lj85nn/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_unknownUser_l6at9h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [33]:
# Convert the prepared pandas frame into an H2OFrame for use with H2O AutoML
h2o_data = h2o.H2OFrame(filtered_nyc_taxi_data)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [35]:
# Target column for AutoML (note: this rebinds `y`, which earlier held the fare Series)
y = "fare_amount"
# Predictors: every column except the target and the per-trip `id` string.
# H2O discards `id` on its own during each model build ("Dropping bad and
# constant columns: [id]"), so leaving it out up front should not change the
# models; it only avoids that repeated warning.
x = [column for column in h2o_data.columns if column not in (y, "id")]


In [36]:
from h2o.automl import H2OAutoML

# Training budget: max_models=5 and max_runtime_secs=500; seed=1 for repeatability.
# Only a training frame is passed — no separate validation or leaderboard frame.
# NOTE(review): leaderboard metrics are therefore presumably cross-validation
# metrics on the training data — confirm against the H2O AutoML docs.
automl = H2OAutoML(max_models=5, seed=1, max_runtime_secs=500)
automl.train(x=x, y=y, training_frame=h2o_data)

AutoML progress: |
10:21:06.680: _train param, Dropping bad and constant columns: [id]

███████████
10:22:27.600: _train param, Dropping bad and constant columns: [id]

█
10:22:31.368: _train param, Dropping bad and constant columns: [id]

█
10:22:43.731: _train param, Dropping bad and constant columns: [id]

█████
10:23:26.425: _train param, Dropping bad and constant columns: [id]

████████
10:24:30.416: _train param, Dropping unused columns: [id]

██
10:24:41.691: _train param, Dropping unused columns: [id]

███████████████████████████████████| (done) 100%


key,value
Stacking strategy,blending
Number of base models (used / total),0/5
# GBM base models (used / total),0/1
# XGBoost base models (used / total),0/2
# GLM base models (used / total),0/1
# DRF base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,AUTO
Metalearner nfolds,0
Metalearner fold_column,


In [37]:
# Leaderboard of the models AutoML trained; print its first rows as a plain-text table
lb = automl.leaderboard
print(lb.head())

# Best model: the AutoML leader (a stacked ensemble in the run shown, hence the variable name)
ensemble_model = automl.leader


model_id                                                    rmse      mse      mae     rmsle    mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_1_20231029_102103     16.4107  269.312  14.2063  0.68642                    269.312
StackedEnsemble_BestOfFamily_1_AutoML_1_20231029_102103  16.4107  269.312  14.2063  0.68642                    269.312
GLM_1_AutoML_1_20231029_102103                           16.4108  269.315  14.2064  0.686566                   269.315
GBM_1_AutoML_1_20231029_102103                           16.4124  269.367  14.2055  0.686564                   269.367
DRF_1_AutoML_1_20231029_102103                           16.4303  269.956  14.217   0.68683                    269.956
XGBoost_2_AutoML_1_20231029_102103                       16.5136  272.698  14.2581  0.688483                   272.698
XGBoost_1_AutoML_1_20231029_102103                       16.7578  280.823  14.4018  0.693814                   280.823
[7 rows x 6 columns]



In [38]:
# Shut down the H2O cluster now that modeling is finished
h2o.cluster().shutdown()


H2O session _sid_bebf closed.
