In [None]:
# Importing libraries 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [None]:
# Task 1: Exploratory Data Analysis

# Load the hourly bike-sharing data into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs elsewhere once DATA_PATH points at a local copy of hour.csv.
DATA_PATH = "/Users/clairestewart/ML-fundamentals-2025/bike+sharing+dataset/hour.csv"
df = pd.read_csv(DATA_PATH)

# A bare `df.head()` is rendered only when it is the last expression of a cell,
# so print it explicitly to actually see the preview at this point.
print(df.head())

# DataFrame.info() writes its summary itself and returns None; wrapping it in
# print() would add a stray "None" line after the summary.
df.info()


# Distribution of the rental count (`cnt`): 30-bin histogram with a KDE overlay
plt.figure(figsize=(7, 4))
sns.histplot(df['cnt'], kde=True, bins=30, color='blue')
plt.title('Count Distribution', fontsize=12)
plt.xlabel('# of rentals', fontsize=10)
plt.ylabel('Frequency in dataset', fontsize=10)
plt.show()

# Boxplot of `cnt` to inspect its skew
plt.figure(figsize=(7, 4))
sns.boxplot(x=df['cnt'], color='purple')
plt.title('Boxplot of cnt', fontsize=12)
# Axis label previously read '# of rentals)' with an unmatched parenthesis.
plt.xlabel('# of rentals', fontsize=10)
plt.show()


def _plot_mean_rentals(avg_frame, flag_col, tick_labels, title, xlabel):
    """Draw a bar chart of mean `cnt` per value of `flag_col`.

    `avg_frame` must contain `flag_col` and `cnt` columns; `tick_labels`
    are the display names placed at x positions 0 and 1.
    """
    plt.figure(figsize=(7, 4))
    sns.barplot(data=avg_frame, x=flag_col, y='cnt')
    plt.xticks([0, 1], tick_labels)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('# of rentals')
    plt.tight_layout()
    plt.show()


# Mean rentals for holidays vs. non-holidays
holiday_avg = df.groupby('holiday', as_index=False)['cnt'].mean()
_plot_mean_rentals(
    holiday_avg,
    'holiday',
    ['Not Holiday', 'Holiday'],
    'Bike rentals for holidays vs non-holidays',
    'Holiday',
)

# Mean rentals for working days vs. non-working days
workday_avg = df.groupby('workingday', as_index=False)['cnt'].mean()
_plot_mean_rentals(
    workday_avg,
    'workingday',
    ['Not Workday', 'Working Day'],
    'Bike rentals based on working day',
    'Workday',
)

# Total rentals per weekday; `days` maps the numeric weekday code to its name
days = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
rentals_by_day = df.groupby('weekday')['cnt'].sum()
print("\nRentals based on weekday:")
print('Weekday with the most rentals:', days[rentals_by_day.idxmax()], rentals_by_day.max())
print('Weekday with the least rentals:', days[rentals_by_day.idxmin()], rentals_by_day.min())

# Total rentals per weather situation. The `weather` mapping was previously
# defined but never used; it now relabels the numeric `weathersit` codes so the
# printed sums are readable. Codes missing from the mapping are left unchanged.
weather = {1: 'Clear, Few clouds, Partly cloudy', 2: 'Mist + cloudy', 3: 'Light snow/rain', 
           4: 'Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog'}
rentals_by_weather = df.groupby('weathersit')['cnt'].sum().rename(index=weather)
print("\nRentals based on weather situation:")
print(rentals_by_weather)

# Pairwise correlation of temp with atemp, then of temp with hum
print(df[['temp', 'atemp']].corr())
print(df[['temp', 'hum']].corr())

# Remove the columns that are not carried forward into the later cells:
# atemp (just compared against temp above) plus instant, casual, registered
# and dteday.
unused_cols = ['atemp', 'instant', 'casual', 'registered', 'dteday']
df = df.drop(columns=unused_cols)



In [None]:
# Task 2: Data Splitting

# TODO: come back and consider an ordered split to preserve temporal order?

# Features are every column except the target `cnt`
x = df.drop(columns='cnt')
y = df['cnt']

# Step 1: hold out 40% of the rows; the rest is the training set
x_train, x_other, y_train, y_other = train_test_split(
    x, y, test_size=0.4, random_state=42
)

# Step 2: cut the held-out rows in half -> validation and test sets
x_val, x_test, y_val, y_test = train_test_split(
    x_other, y_other, test_size=0.5, random_state=42
)

# Report the shape of each split as a sanity check
print(f"training set: {x_train.shape}")
print(f"validation set: {x_val.shape}")
print(f"test set: {x_test.shape}")


In [None]:
# Task 3: Feature Engineering

def add_cyclical_features(frame):
    """Return a copy of `frame` with sin/cos encodings of `hr` and `weekday`.

    `hr` is mapped onto a 24-step circle and `weekday` onto a 7-step circle,
    adding the columns hr_sin, hr_cos, weekday_sin and weekday_cos. The
    encodings are computed from the frame's own columns, and `assign` returns
    a new frame, so the input is not modified.
    """
    hour_angle = 2 * np.pi * frame['hr'] / 24.0
    weekday_angle = 2 * np.pi * frame['weekday'] / 7.0
    return frame.assign(
        hr_sin=np.sin(hour_angle),
        hr_cos=np.cos(hour_angle),
        weekday_sin=np.sin(weekday_angle),
        weekday_cos=np.cos(weekday_angle),
    )


# Encoding hr and weekday. Every split must receive the same features;
# previously only x_train did, leaving x_val/x_test without the columns the
# model is trained on.
x_train = add_cyclical_features(x_train)
x_val = add_cyclical_features(x_val)
x_test = add_cyclical_features(x_test)

# One hot encoding on season, weathersit, yr, and mnth for training set
encode_cols = ['season', 'weathersit', 'yr', 'mnth']
x_train = pd.get_dummies(x_train, columns=encode_cols)

# One hot encoding validation and test sets. get_dummies only creates columns
# for categories present in the frame it is given, so each result is aligned
# to the training columns: a category absent from a split becomes an all-zero
# column, and a category never seen in training is dropped.
x_val = pd.get_dummies(x_val, columns=encode_cols).reindex(
    columns=x_train.columns, fill_value=0
)
x_test = pd.get_dummies(x_test, columns=encode_cols).reindex(
    columns=x_train.columns, fill_value=0
)

# Applying scaler: fit on the training set only, then reuse the fitted
# statistics for the validation and test sets.
scale_cols = ['temp', 'hum', 'windspeed']
scaler = StandardScaler()
x_train[scale_cols] = scaler.fit_transform(x_train[scale_cols])

x_val[scale_cols] = scaler.transform(x_val[scale_cols])
x_test[scale_cols] = scaler.transform(x_test[scale_cols])




In [None]:
# Task 4: Baseline Model - Linear Regression

# `cnt` is a numeric count, so the baseline is an ordinary least-squares
# regressor. The previous LogisticRegression is a classifier and would treat
# every distinct count value as a separate class.
baseline_model = LinearRegression()
baseline_model.fit(x_train, y_train)

# Predict on the validation set
y_pred = baseline_model.predict(x_val)

# Evaluate with regression metrics. accuracy_score is a classification metric
# and rejects continuous predictions, so it is replaced by RMSE.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R2 Score", r2)

