In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from google.colab import drive
# Mount Google Drive so the CSV files stored there are reachable from Colab.
drive.mount('/content/drive')

# Both CSVs live in the same Drive folder; build the paths from one prefix.
data_dir = "/content/drive/My Drive/Colab Notebooks"
train_url = data_dir + "/train.csv"
test_url = data_dir + "/test.csv"

Mounted at /content/drive


In [2]:
# Read only the first 10M training rows; the test file is read in full.
# Alternatives kept for reference: nrows = 30_000_000, or no nrows at all.
train_df = pd.read_csv(train_url, nrows=10_000_000)
test_data = pd.read_csv(test_url)

# Bare expressions: nothing is rendered, since neither is the last statement
# of the cell.
train_df.dtypes
test_data.dtypes

# Hold on to the test keys; they are needed for the submission files.
test_keys = test_data['key']


In [3]:
# Data Preprocessing
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the great circle distance between two points on the Earth.

    Uses the haversine formula with a spherical Earth of radius 6371 km.

    Parameters
    ----------
    lat1, lon1 : float, numpy array or pandas Series
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float, numpy array or pandas Series
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    Distance in kilometers, with the same shape/type that numpy arithmetic
    on the inputs produces (scalar in, scalar out; Series in, Series out).
    NaN inputs yield NaN outputs.
    """
    R = 6371  # Radius of Earth in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair above 1 for (near-)antipodal points, which would make
    # sqrt(1 - a) NaN. Clamp to the valid range before taking roots.
    a = np.clip(a, 0.0, 1.0)
    return R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


In [4]:
# train_data is another name for the loaded frame (no copy is made), so the
# columns added below also appear on train_df.
train_data = train_df

# Argument order expected by haversine_distance: lat1, lon1, lat2, lon2.
coord_cols = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
]
for df in [train_data, test_data]:
    df['distance'] = haversine_distance(*(df[name] for name in coord_cols))

# Show shape and dtypes of each frame after adding the column.
for df in [train_data, test_data]:
    print(df.shape)
    print(df.dtypes)

(5000000, 9)
key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object
(9914, 8)
key                   object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object


In [None]:
# Print shape, then dtypes, for the training and test frames.
for df in [train_data, test_data]:
    print(df.shape, df.dtypes, sep='\n')

(3000000, 9)
key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object
(9914, 8)
key                   object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
distance             float64
dtype: object


In [5]:
# Parse the pickup timestamp column into pandas datetimes on both frames.
for df in (train_data, test_data):
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# type(df['pickup_datetime'][0])

In [6]:
# New column name -> attribute of the `.dt` accessor that supplies it.
datetime_parts = {
    'hour': 'hour',
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'day_of_week': 'dayofweek',
}

# Add the calendar features to both frames.
for df in [train_data, test_data]:
    for column, attr in datetime_parts.items():
        df[column] = getattr(df['pickup_datetime'].dt, attr)

# Show shape and dtypes of each frame with the new columns.
for df in [train_data, test_data]:
    print(df.shape)
    print(df.dtypes)

(5000000, 14)
key                               object
fare_amount                      float64
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
distance                         float64
hour                               int32
day                                int32
month                              int32
year                               int32
day_of_week                        int32
dtype: object
(9914, 13)
key                               object
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
distance                         float64
hour                               int32
day               

In [7]:

# Extract datetime features
for df in [train_data, test_data]:
    # Once this cell has run, 'pickup_datetime' has been dropped below and the
    # derived columns already exist. Skip in that case so the cell can be
    # re-run without raising KeyError.
    if 'pickup_datetime' not in df.columns:
        continue
    df['hour'] = df['pickup_datetime'].dt.hour
    df['day'] = df['pickup_datetime'].dt.day
    df['month'] = df['pickup_datetime'].dt.month
    df['year'] = df['pickup_datetime'].dt.year
    df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

# Remove unnecessary columns (errors='ignore' keeps this a no-op on re-run)
train_data = train_data.drop(['pickup_datetime', 'key'], axis=1, errors='ignore')
test_data = test_data.drop(['pickup_datetime', 'key'], axis=1, errors='ignore')

# Handle missing values: rows with any NaN are removed from the training frame
train_data = train_data.dropna()

# Features and target
X = train_data.drop(['fare_amount'], axis=1)
y = train_data['fare_amount']

# Split data for training and validation (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Select the test columns by the training feature names so the scaler always
# sees the same columns in the same order it was fitted on; a missing column
# fails here with a KeyError instead of being silently mis-scaled.
test_data_scaled = scaler.transform(test_data[X.columns])

# Train the XGBoost Regressor
model = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model.fit(X_train_scaled, y_train)
# 3.38188  (NOTE(review): presumably the score obtained for this model's submission - confirm)
# Validate the model
y_pred = model.predict(X_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

# Predict on test data
test_predictions = model.predict(test_data_scaled)



Validation RMSE: 4.445067767395486


In [8]:
# Write the XGBoost test predictions next to their keys as a two-column CSV.
submission = pd.DataFrame({'key': test_keys}).assign(fare_amount=test_predictions)
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")

Predictions saved to submission.csv


In [None]:
from catboost import CatBoostRegressor
# CatBoost regressor trained on the same scaled training split.
catboost_params = dict(iterations=500, learning_rate=0.1, depth=8, random_seed=42)
model2 = CatBoostRegressor(**catboost_params)
model2.fit(X_train_scaled, y_train)
# 3.19832

# Score on the held-out validation split.
y_pred = model2.predict(X_val_scaled)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse}")

# Predictions for the scaled test features.
test_predictions2 = model2.predict(test_data_scaled)


0:	learn: 9.1288816	total: 892ms	remaining: 7m 25s
1:	learn: 8.5344178	total: 1.58s	remaining: 6m 32s
2:	learn: 8.0115275	total: 2.23s	remaining: 6m 8s
3:	learn: 7.5534316	total: 2.87s	remaining: 5m 55s
4:	learn: 7.1561867	total: 3.49s	remaining: 5m 45s
5:	learn: 6.8105802	total: 4.15s	remaining: 5m 41s
6:	learn: 6.5096757	total: 4.79s	remaining: 5m 37s
7:	learn: 6.2516908	total: 5.4s	remaining: 5m 32s
8:	learn: 6.0246958	total: 6.02s	remaining: 5m 28s
9:	learn: 5.8362614	total: 6.73s	remaining: 5m 29s
10:	learn: 5.6711196	total: 7.87s	remaining: 5m 49s
11:	learn: 5.5279082	total: 9.06s	remaining: 6m 8s
12:	learn: 5.4083630	total: 10.9s	remaining: 6m 46s
13:	learn: 5.3061252	total: 12.6s	remaining: 7m 17s
14:	learn: 5.2178043	total: 13.7s	remaining: 7m 23s
15:	learn: 5.1431052	total: 15s	remaining: 7m 35s
16:	learn: 5.0759646	total: 16.3s	remaining: 7m 43s
17:	learn: 5.0206093	total: 17.7s	remaining: 7m 53s
18:	learn: 4.9688424	total: 18.8s	remaining: 7m 56s
19:	learn: 4.9265656	total:

In [None]:
# Save the CatBoost predictions (test_predictions2) alongside the test keys.
submission = pd.DataFrame({'key': test_keys, 'fare_amount': test_predictions2})
submission.to_csv('submission2.csv', index=False)

# The message names the file actually written above.
print("Predictions saved to submission2.csv")

In [9]:
import lightgbm as lgb

# LightGBM regressor with the same tree count, depth and learning rate as the
# XGBoost model above.
lgbm_params = dict(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model3 = lgb.LGBMRegressor(**lgbm_params)

# Train on the scaled training split (the unscaled variant was
# model3.fit(X_train, y_train)).
model3.fit(X_train_scaled, y_train)

# Predict on the validation split.
y_pred3 = model3.predict(X_val_scaled)
# score: 3.33979

# Compute the validation MSE once; the RMSE is its square root.
mse = mean_squared_error(y_val, y_pred3)
rmse = np.sqrt(mse)
print(f"Validation RMSE: {rmse}")

# Predict on test data
test_predictions3 = model3.predict(test_data_scaled)

print(f"Mean Squared Error: {mse}")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.585194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 3999971, number of used features: 11
[LightGBM] [Info] Start training from score 11.336551
Validation RMSE: 4.459327522934002
Mean Squared Error: 19.885601956796698


In [10]:
# Write the LightGBM test predictions next to their keys as a two-column CSV.
submission = pd.DataFrame({'key': test_keys}).assign(fare_amount=test_predictions3)
submission.to_csv('submission3.csv', index=False)

print("Predictions saved to submission3.csv")

Predictions saved to submission.csv


In [None]:
# from sklearn.neural_network import MLPRegressor
# from sklearn.metrics import mean_squared_error


# # Initialize the MLP Regressor
# mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# # Fit the model
# mlp_model.fit(X_train_scaled, y_train)

# # Make predictions
# y_pred_mlp = mlp_model.predict(X_val_scaled)

# # Evaluate the model's performance
# mse_mlp = mean_squared_error(y_val, y_pred_mlp)
# print(f"MLP Regressor Mean Squared Error: {mse_mlp}")

# test_predictions_mse_mlp = mlp_model.predict(test_data_scaled)

# # from catboost import CatBoostRegressor
# # model2 = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=8, random_seed=42)
# # model2.fit(X_train_scaled, y_train)
# # # 3.19832


# # # Validate the model
# # y_pred = model2.predict(X_val_scaled)
# # rmse = np.sqrt(mean_squared_error(y_val, y_pred))
# # print(f"Validation RMSE: {rmse}")

# # # Predict on test data
# # test_predictions2 = model2.predict(test_data_scaled)

In [None]:
# submission = pd.DataFrame({'key': test_keys, 'fare_amount': test_predictions_mse_mlp})
# submission.to_csv('submission_mse_mlp.csv', index=False)

# print("Predictions saved to submission_mse_mlp.csv")