In [5]:
import pandas as pd
import numpy as np
import zipfile
import requests
import io

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

# Download the UCI Bike Sharing archive and load its daily table into `df`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
# A timeout keeps a stalled connection from blocking the notebook indefinitely.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of later with a confusing BadZipFile
# when an error page is handed to zipfile.
r.raise_for_status()
# The archive is held entirely in memory; `z` is left open so other members
# can still be read from it afterwards.
z = zipfile.ZipFile(io.BytesIO(r.content))
with z.open("day.csv") as day_csv:
    df = pd.read_csv(day_csv)
# Remove columns that must not be used as features: a row index, the raw date
# string, and the two partial counts.
# NOTE(review): 'casual' and 'registered' presumably sum to 'cnt' (target
# leakage) — confirm against the dataset documentation.
df = df.drop(['instant', 'dteday', 'casual', 'registered'], axis=1)

# Features are every remaining column except the rental count.
X = df.drop('cnt', axis=1)
# Turn the count into a 4-class target by binning it into quartiles.
y = pd.qcut(df['cnt'], q=4, labels=['low', 'medium', 'high', 'very_high'])

# Columns routed to one-hot encoding vs. standard scaling.
categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
numerical_cols = ['temp', 'atemp', 'hum', 'windspeed']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore': a category code absent from the training
        # split is encoded as all zeros at predict time instead of raising
        # ValueError (the encoder's default is 'error').
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Preprocessing and the classifier are chained so the fitted pipeline accepts
# raw feature rows directly.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and classifier together on the training rows only.
pipeline.fit(X_train, y_train)

# Score the held-out rows.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

# Demonstrate inference on untransformed rows (the first five of X).
sample_input = X.iloc[:5]
sample_predictions = pipeline.predict(sample_input)
print("\nPipeline handles raw data automatically. Prediction:\n", sample_predictions)


Accuracy: 0.8435374149659864

Classification Report:
               precision    recall  f1-score   support

        high       0.77      0.84      0.81        32
         low       0.95      0.83      0.89        48
      medium       0.74      0.79      0.76        33
   very_high       0.89      0.91      0.90        34

    accuracy                           0.84       147
   macro avg       0.84      0.84      0.84       147
weighted avg       0.85      0.84      0.85       147


Pipeline handles raw data automatically. Prediction:
 ['low' 'low' 'low' 'low' 'low']
