In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
pd.__version__

'1.4.2'

In [3]:
!pip install pyarrow



In [4]:
df_jan23 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [5]:
df_feb23 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [6]:
df_jan23.shape

(3066766, 19)

In [7]:
print(df_jan23.isnull().sum())

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71743
trip_distance                0
RatecodeID               71743
store_and_fwd_flag       71743
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71743
airport_fee              71743
dtype: int64


In [8]:
numeric_cols = df_jan23.select_dtypes(include=[np.number]).columns

# Check for NaNs in numeric columns
print(df_jan23[numeric_cols].isnull().sum())

# Replace NaNs with a specific value (e.g., 0)
df_jan23[numeric_cols] = df_jan23[numeric_cols].fillna(0)

# Check for infinity or excessively large values
print(np.isfinite(df_jan23[numeric_cols]).all())

# Replace infinities with a specific value (e.g., max finite value)
df_jan23[numeric_cols] = df_jan23[numeric_cols].replace([np.inf, -np.inf], 0)



VendorID                     0
passenger_count          71743
trip_distance                0
RatecodeID               71743
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71743
airport_fee              71743
dtype: int64
VendorID                 True
passenger_count          True
trip_distance            True
RatecodeID               True
PULocationID             True
DOLocationID             True
payment_type             True
fare_amount              True
extra                    True
mta_tax                  True
tip_amount               True
tolls_amount             True
improvement_surcharge    True
total_amount             True
congestion_surcharge     True
airport_fee              True
dtype: bool

In [9]:
df_jan23['tpep_pickup_datetime'] = pd.to_datetime(df_jan23['tpep_pickup_datetime'])
df_jan23['tpep_dropoff_datetime'] = pd.to_datetime(df_jan23['tpep_dropoff_datetime'])

df_jan23['Trip_Duration'] = (df_jan23['tpep_pickup_datetime'] - df_jan23['tpep_dropoff_datetime']).dt.total_seconds() / 60
df_jan23['Trip_Duration'].std()

df_jan23['tpep_pickup_datetime'] = pd.to_datetime(df_jan23['tpep_pickup_datetime']).astype(str)
df_jan23['tpep_dropoff_datetime'] = pd.to_datetime(df_jan23['tpep_dropoff_datetime']).astype(str)

In [10]:
filtered_df = df_jan23[(df_jan23['Trip_Duration'] >= 1) & (df_jan23['Trip_Duration'] <= 60)]
fraction_left = len(filtered_df) / len(df_jan23)
print(fraction_left)

9.782291834460144e-07


In [11]:
# Convert the location IDs to strings
df_jan23['PULocationID'] = df_jan23['PULocationID'].astype(str)
df_jan23['DOLocationID'] = df_jan23['DOLocationID'].astype(str)


In [12]:
# Ensure there are no NaNs or infinite values in the numeric columns
df_jan23 = df_jan23.fillna(0)
df_jan23 = df_jan23.replace([np.inf, -np.inf], 0)

# Work with a smaller subset of the data
df_jan23_subset = df_jan23.sample(n=1000, random_state=1)
# Turn the DataFrame into a list of dictionaries
list_of_dicts = df_jan23_subset.to_dict(orient='records')

In [13]:
# Fit a dictionary vectorizer
dv = DictVectorizer(sparse=False)
feature_matrix = dv.fit_transform(list_of_dicts)

In [14]:
# Determine the dimensionality of the feature matrix
dimensionality = feature_matrix.shape[1]

# Display the feature names and the dimensionality
print("Feature names:", dv.get_feature_names_out())
print("Dimensionality (number of columns):", dimensionality)

Feature names: ['DOLocationID=1' 'DOLocationID=10' 'DOLocationID=100' ...
 'tpep_pickup_datetime=2023-01-31 22:01:01'
 'tpep_pickup_datetime=2023-01-31 22:04:33' 'trip_distance']
Dimensionality (number of columns): 2200


In [15]:
# Check the size of the feature matrix and target variable
print("Feature matrix size:", feature_matrix.shape)
print("Target variable size:", df_jan23['Trip_Duration'].shape)

# Use a much smaller subset of the data for testing
subset_size = 1000  # Reduce the subset size for initial testing
X_train = feature_matrix[:subset_size]
y_train = df_jan23['Trip_Duration'][:subset_size]

# Check for any NaNs or infinite values
print("Any NaNs in X_train:", np.isnan(X_train).any())
print("Any NaNs in y_train:", np.isnan(y_train).any())
print("Any infinities in X_train:", np.isinf(X_train).any())
print("Any infinities in y_train:", np.isinf(y_train).any())

Feature matrix size: (1000, 2200)
Target variable size: (3066766,)
Any NaNs in X_train: False
Any NaNs in y_train: False
Any infinities in X_train: False
Any infinities in y_train: False


In [16]:

# Initialize and train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the training data
y_train_pred = model.predict(X_train)


In [17]:

# Calculate RMSE on training data
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("RMSE on train:", rmse_train)

RMSE on train: 9.736102724559258e-12


In [None]:
# Preprocess the training data
df_jan23 = preprocess(df_jan23)


In [None]:

# Convert the DataFrame to a list of dictionaries for training data
list_of_dicts_train = df_jan23[['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [None]:
# Fit the DictVectorizer on training data
dv = DictVectorizer(sparse=False)


In [None]:
feature_matrix_train = dv.fit_transform(list_of_dicts_train)

In [None]:
# Training data
X_train = feature_matrix_train
y_train = df_jan23['Trip_Duration']


In [None]:
# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Preprocess the validation data
df_feb23 = preprocess(df_feb23)




In [None]:
# Convert the DataFrame to a list of dictionaries for validation data
list_of_dicts_val = df_feb23[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Transform the validation data using the already fitted DictVectorizer
feature_matrix_val = dv.transform(list_of_dicts_val)

