In [8]:
import kagglehub

# Kaggle handle of the bike-sharing dataset to fetch.
dataset_handle = "lakshmi25npathi/bike-sharing-dataset"

# Download the latest version; the call returns the location of the files.
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Path to dataset files: /Users/ooga/.cache/kagglehub/datasets/lakshmi25npathi/bike-sharing-dataset/versions/1


In [9]:
from pathlib import Path

import pandas as pd

# Build the CSV location from the `path` returned by kagglehub.dataset_download()
# in the previous cell instead of hardcoding one user's cache directory, so the
# cell also runs on other machines.
df1 = pd.read_csv(Path(path) / 'day.csv')
df1.head(100)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,2011-04-06,2,0,4,0,3,1,1,0.390833,0.387608,0.470833,0.263063,413,2395,2808
96,97,2011-04-07,2,0,4,0,4,1,1,0.437500,0.433696,0.602917,0.162312,571,2570,3141
97,98,2011-04-08,2,0,4,0,5,1,2,0.335833,0.324479,0.836250,0.226992,172,1299,1471
98,99,2011-04-09,2,0,4,0,6,0,2,0.342500,0.341529,0.877500,0.133083,879,1576,2455


In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as XGBRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
# Read the daily aggregated file (day.csv) from the directory that
# kagglehub.dataset_download() returned in `path`, rather than from an absolute
# path that only exists on one particular machine.
import os  # local import: only needed to build the data file path

df = pd.read_csv(os.path.join(path, 'day.csv'))

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nBasic information about the dataset:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()

print("\nSummary statistics:")
print(df.describe())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Exploratory Data Analysis (EDA)

# 1. Distribution of the target variable
# Histogram (30 bins) of 'cnt' with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['cnt'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Total Bike Rentals')
ax.set_xlabel('Count')
ax.set_ylabel('Frequency')
plt.show()

# 2-5. Rental count by season, weather situation, month and working day.
# One boxplot of 'cnt' per grouping column; each entry is
# (column, figure size, title, x-axis label).
boxplot_specs = [
    ('season', (12, 6), 'Bike Rentals by Season',
     'Season (1:Spring, 2:Summer, 3:Fall, 4:Winter)'),
    ('weathersit', (12, 6), 'Bike Rentals by Weather Situation',
     'Weather Situation'),
    ('mnth', (14, 6), 'Bike Rentals by Month',
     'Month'),
    ('workingday', (10, 6), 'Bike Rentals by Working Day',
     'Working Day (0: Weekend/Holiday, 1: Working Day)'),
]

for group_column, fig_size, heading, x_label in boxplot_specs:
    plt.figure(figsize=fig_size)
    sns.boxplot(x=group_column, y='cnt', data=df)
    plt.title(heading)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.show()

# 6. Correlation analysis
# 'dteday' is left out of the correlation calculation; every remaining column
# is passed to .corr().
corr_input = df.drop(columns=['dteday'])
correlation_matrix = corr_input.corr()

# Annotated heatmap, cell values shown with two decimals.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# 7. Scatter plots for numerical features vs. target
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

# 2x2 grid, one panel per feature, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
for ax, feature in zip(axes.ravel(), numerical_features):
    ax.scatter(df[feature], df['cnt'], alpha=0.5)
    ax.set_title(f'{feature} vs. cnt')
    ax.set_xlabel(feature)
    ax.set_ylabel('cnt')
fig.tight_layout()
plt.show()

# 8. Time series analysis - Rental count over time
# Convert 'dteday' to datetimes (stored back on df) before using it as the x-axis.
df['dteday'] = pd.to_datetime(df['dteday'])

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df['dteday'], df['cnt'])
ax.set_title('Bike Rentals Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.grid(True)
plt.show()

# Feature Engineering
# Columns that are treated as categorical variables.
categorical_columns = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

# Cast each of them to the pandas 'category' dtype, in place on df.
for cat_column in categorical_columns:
    df[cat_column] = df[cat_column].astype('category')

# Create dummies for the categorical variables; drop_first=True omits the
# first level of each variable.
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Prepare data for modeling
# Columns removed from the predictors: the target 'cnt' itself plus
# 'instant', 'dteday', 'casual' and 'registered'.
excluded_columns = ['instant', 'dteday', 'casual', 'registered', 'cnt']
X = df_encoded.drop(columns=excluded_columns)
y = df_encoded['cnt']

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Building and Evaluation
def _fit_and_score(model, train_features, test_features):
    """Fit `model` on the training split and score it on the test split.

    Reads the notebook-level targets `y_train` and `y_test`.

    Returns:
        (predictions, rmse, r2, mae) computed on `test_features`.
    """
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    return predictions, rmse, r2, mae


# 1. Linear Regression (uses the standardized features)
lr = LinearRegression()
lr_pred, lr_rmse, lr_r2, lr_mae = _fit_and_score(lr, X_train_scaled, X_test_scaled)

# 2. Random Forest Regressor (uses the unscaled features)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pred, rf_rmse, rf_r2, rf_mae = _fit_and_score(rf, X_train, X_test)

# 3. Gradient Boosting Regressor (uses the unscaled features)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_pred, gb_rmse, gb_r2, gb_mae = _fit_and_score(gb, X_train, X_test)

# 4. XGBoost (uses the unscaled features)
# This import must stay here: the import section at the top of the cell binds
# the name XGBRegressor to the xgboost *module*; this line rebinds it to the class.
from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb_pred, xgb_rmse, xgb_r2, xgb_mae = _fit_and_score(xgb, X_train, X_test)

# Compare model performance
# Parallel lists: position k in each list belongs to the same model.
models = ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost']
rmse_scores = [lr_rmse, rf_rmse, gb_rmse, xgb_rmse]
r2_scores = [lr_r2, rf_r2, gb_r2, xgb_r2]
mae_scores = [lr_mae, rf_mae, gb_mae, xgb_mae]

# Create a DataFrame to compare model performance (one row per model)
comparison_columns = {
    'Model': models,
    'RMSE': rmse_scores,
    'R-squared': r2_scores,
    'MAE': mae_scores,
}
model_comparison = pd.DataFrame(comparison_columns)

# Show the table ordered by ascending RMSE.
print("\nModel Comparison:")
ranked_by_rmse = model_comparison.sort_values('RMSE')
print(ranked_by_rmse)

# Feature importance for the best model (assuming Random Forest or Gradient Boosting performs best)
best_model = gb  # Can be updated based on results

# Pair every predictor column with its importance score, highest score first.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_model.feature_importances_}
).sort_values('Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features.
top_features = feature_importance.head(15)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=top_features, ax=ax)
ax.set_title('Top 15 Feature Importance')
fig.tight_layout()
plt.show()

# Hyperparameter tuning for the best model
# Grid of Gradient Boosting settings to try (3 x 3 x 3 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 5-fold cross-validated search, scored by negative mean squared error.
grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)

# Evaluate the model with the best parameters.
# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been retrained on the full training set; fitting it a second time
# would only repeat that work.
best_gb = grid_search.best_estimator_
best_gb_pred = best_gb.predict(X_test)
best_gb_rmse = np.sqrt(mean_squared_error(y_test, best_gb_pred))
best_gb_r2 = r2_score(y_test, best_gb_pred)
best_gb_mae = mean_absolute_error(y_test, best_gb_pred)

print("\nBest Model (Tuned Gradient Boosting) Performance:")
print(f"RMSE: {best_gb_rmse:.2f}")
print(f"R-squared: {best_gb_r2:.4f}")
print(f"MAE: {best_gb_mae:.2f}")

# Visualize actual vs predicted values
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, best_gb_pred, alpha=0.5)

# Dashed red diagonal running from the smallest to the largest actual value.
actual_bounds = [y_test.min(), y_test.max()]
ax.plot(actual_bounds, actual_bounds, 'r--')

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted Bike Rentals')
plt.show()

# Visualize residuals
# Residual = actual - predicted, plotted against the predicted value.
residuals = y_test - best_gb_pred
plt.figure(figsize=(12, 6))
plt.scatter(best_gb_pred, residuals, alpha=0.5)
# Zero reference line. The x-axis holds predictions, so the line must not be
# bounded by the residuals' min/max (those are y-axis values); axhline spans the
# full width of the axes whatever the x-range is.
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Distribution of residuals
# Histogram (30 bins) of the test-set residuals with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(residuals, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
plt.show()

# Print final conclusions
print("\nFinal Model Evaluation:")
print("-----------------------")
print(f"The best performing model is the tuned Gradient Boosting Regressor with an RMSE of {best_gb_rmse:.2f}.")
print(f"This model explains {best_gb_r2*100:.2f}% of the variance in bike rental counts.")

# List the five highest-ranked rows of feature_importance, numbered from 1.
print("\nMost important features determining bike rentals:")
top_five = feature_importance.head(5)
for i, (feature, importance) in enumerate(zip(top_five['Feature'], top_five['Importance'])):
    print(f"{i+1}. {feature}: {importance:.4f}")


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/ooga/venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <98D50080-9632-3EA4-B874-146E55453763> /Users/ooga/venv/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file)"]
