In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from dask.diagnostics import ProgressBar

# Load the dataset
file_path ="/content/drive/MyDrive/1_ord_lax_sorted_wDaysBeforeDeparture_bestTimePurchase_day_month_wDayOfWeek_wHolidays_wDurationMinutes_wCount_time_filterd_14_layovers.csv"

df = pd.read_csv(file_path)
print("Number of entries (rows):", df.shape[0])

# Columns used for modelling: ten predictors plus the 'bestTimeToPurchase' target
columns_to_keep = [
    'daysBeforeFlight',
    'flightDay', 'flightMonth',
    'flightDayOfWeek', 'nearHoliday', 'bestTimeToPurchase',
    'departureHour', 'departureMinute', 'arrivalMinute', 'arrivalHour', 'travelDurationMinutes'
]

df_filtered = df[columns_to_keep]

# Sample a fixed fraction of the dataset. The fraction lives in one constant so
# the printed label can never drift from the value actually used (the label
# previously said 10% while 30% of the rows were being sampled).
SAMPLE_FRAC = 0.3
df_sampled = df_filtered.sample(frac=SAMPLE_FRAC, random_state=42)
print(f"Number of entries (rows) after sampling {SAMPLE_FRAC:.0%}:", df_sampled.shape[0])

# Define features and target variable
X = df_sampled.drop('bestTimeToPurchase', axis=1)
y = df_sampled['bestTimeToPurchase']

# Split the data into training and testing sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid.
# `max_samples` is only meaningful when bootstrapping: RandomForest raises a
# ValueError if `max_samples` is set while `bootstrap=False`. A single dict
# would cross every `bootstrap=False` candidate with a `max_samples` value and
# turn them all into failed fits, so the grid is split into two sub-grids.
shared_params = {
    'n_estimators': [100, 200, 300],
    'max_features': [2, 3, 4],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
param_dist = [
    {**shared_params, 'bootstrap': [True], 'max_samples': [0.5, 0.7]},
    {**shared_params, 'bootstrap': [False]},
]

# Initialize the model (seeded so the search is reproducible across runs)
rf_regressor = RandomForestRegressor(random_state=42)

# Initialize Grid Search: 5-fold CV, scored by negative MSE, using all cores
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_dist, cv=5, scoring='neg_mean_squared_error',
                           n_jobs=-1, verbose=1)

# Fit Grid Search. Progress is reported through `verbose`; the dask ProgressBar
# context manager only tracks dask computations and showed nothing for this
# joblib-backed fit, so it is not used here.
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# With the default refit=True, the search has already retrained the winning
# configuration on all of X_train, so best_estimator_ is ready to use.
# Fitting it again would only repeat that work.
best_rf_regressor = grid_search.best_estimator_

# Predict on the test data
y_pred = best_rf_regressor.predict(X_test)

# Calculate and print RMSE for the test data
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE on Test Data: {rmse}")

# Perform cross-validation with 10 folds on the best model.
# cross_val_score clones the estimator for each fold, so the fitted model
# above is left untouched.
# NOTE(review): X / y include the held-out test rows and the hyperparameters
# were tuned on X_train, so this estimate is not fully independent of the
# search — consider cross-validating on X_train / y_train instead.
cv_scores = cross_val_score(best_rf_regressor, X, y, cv=10, scoring='neg_mean_squared_error')

# Convert negative mean squared error to positive
cv_scores = -cv_scores

# Calculate RMSE for each fold (kept as a plain list of floats for printing)
rmse_scores = np.sqrt(cv_scores).tolist()

print(f"Cross-Validation RMSE Scores: {rmse_scores}")
print(f"Mean RMSE: {sum(rmse_scores) / len(rmse_scores)}")
print(f"Standard Deviation of RMSE: {pd.Series(rmse_scores).std()}")

  df = pd.read_csv(file_path)


Number of entries (rows): 479857
Number of entries (rows) after sampling 10%: 143957
Fitting 5 folds for each of 972 candidates, totalling 4860 fits




KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from dask.diagnostics import ProgressBar

# Read the full CSV from the mounted Drive folder
file_path ="/content/drive/MyDrive/1_ord_lax_sorted_wDaysBeforeDeparture_bestTimePurchase_day_month_wDayOfWeek_wHolidays_wDurationMinutes_wCount_time_filterd_14_layovers.csv"

df = pd.read_csv(file_path)
print("Number of entries (rows):", len(df))

# Columns used for modelling: ten predictors plus the 'bestTimeToPurchase' target
columns_to_keep = [
    'daysBeforeFlight',
    'flightDay',
    'flightMonth',
    'flightDayOfWeek',
    'nearHoliday',
    'bestTimeToPurchase',
    'departureHour',
    'departureMinute',
    'arrivalMinute',
    'arrivalHour',
    'travelDurationMinutes',
]
df_filtered = df.loc[:, columns_to_keep]

# Draw a reproducible 30% random subset of the rows
df_sampled = df_filtered.sample(random_state=42, frac=0.3)
print("Number of entries (rows) after sampling 30%:", len(df_sampled))

# Separate the target column from the predictors
y = df_sampled['bestTimeToPurchase']
X = df_sampled.drop(columns='bestTimeToPurchase')

# Hold out 20% of the sampled rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Define the parameter space.
# `max_samples` is only meaningful when bootstrapping: RandomForest raises a
# ValueError if `max_samples` is set while `bootstrap=False`. A single dict
# lets the sampler draw those invalid combinations and waste part of the
# n_iter budget on failed fits, so the space is split into two sub-spaces.
shared_params = {
    'n_estimators': [100, 200, 300],
    'max_features': [2, 3, 4],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
param_dist = [
    {**shared_params, 'bootstrap': [True], 'max_samples': [0.5, 0.7]},
    {**shared_params, 'bootstrap': [False]},
]

# Initialize the model (seeded so each candidate's fit is reproducible)
rf_regressor = RandomForestRegressor(random_state=42)

# Initialize Randomized Search: 100 sampled candidates, 5-fold CV, negative MSE
random_search = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_dist, cv=5, scoring='neg_mean_squared_error',
                                   n_jobs=-1, verbose=1, n_iter=100, random_state=42)

# Fit Randomized Search. Progress is reported through `verbose`; the dask
# ProgressBar context manager only tracks dask computations and showed nothing
# for this joblib-backed fit, so it is not used here.
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# With the default refit=True, the search has already retrained the winning
# configuration on all of X_train, so best_estimator_ is ready to use.
# Fitting it again would only repeat that work.
best_rf_regressor = random_search.best_estimator_

# Predict on the test data
y_pred = best_rf_regressor.predict(X_test)

# Calculate and print RMSE for the test data
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE on Test Data: {rmse}")

# Perform cross-validation with 10 folds on the best model.
# cross_val_score clones the estimator for each fold, so the fitted model
# above is left untouched.
# NOTE(review): X / y include the held-out test rows and the hyperparameters
# were tuned on X_train, so this estimate is not fully independent of the
# search — consider cross-validating on X_train / y_train instead.
cv_scores = cross_val_score(best_rf_regressor, X, y, cv=10, scoring='neg_mean_squared_error')

# Convert negative mean squared error to positive
cv_scores = -cv_scores

# Calculate RMSE for each fold (kept as a plain list of floats for printing)
rmse_scores = np.sqrt(cv_scores).tolist()

print(f"Cross-Validation RMSE Scores: {rmse_scores}")
print(f"Mean RMSE: {sum(rmse_scores) / len(rmse_scores)}")
print(f"Standard Deviation of RMSE: {pd.Series(rmse_scores).std()}")


Best Parameters: {'bootstrap': True, 'max_depth': 40, 'max_features': 2, 'max_samples': 0.7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RMSE on Test Data: 4.378777304035373
Cross-Validation RMSE Scores: [4.1457766546778, 4.179370658340349, 4.167067577173025, 4.174058025015905, 4.080473281385968, 4.1492263693656435, 4.085152835497481, 4.072493006920212, 4.292280010709814, 3.923740294907137]
Mean RMSE: 4.126963871399334
Standard Deviation of RMSE: 0.0959907623960586

Process finished with exit code 0