In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the raw coffee-sales records from disk.
df = pd.read_csv("DatasetForCoffeeSales2.csv")


def _price_category(unit_price):
    """Map a unit price onto one of four price tiers.

    Cut-offs: below 32 -> 'Budget', below 37 -> 'Standard',
    below 42 -> 'Premium', anything else -> 'Luxury'.  A NaN price fails
    every comparison and therefore lands in 'Luxury'.
    """
    if unit_price < 32:
        return 'Budget'
    if unit_price < 37:
        return 'Standard'
    if unit_price < 42:
        return 'Premium'
    return 'Luxury'


def _bean_category(unit_price):
    """Label a unit price 'Premium' when it is at least 40, else 'Standard'."""
    if unit_price >= 40:
        return 'Premium'
    return 'Standard'


# Derived columns. 'Date' must be parsed before 'Month' can be extracted.
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
# The product name is reused as-is under a second column name.
df['Coffee_Bean_Type'] = df['Product']
# Both category columns are derived purely from the unit price.
df['Price_Category'] = df['Unit Price'].apply(_price_category)
df['Bean_Category'] = df['Unit Price'].apply(_bean_category)

# Column groups consumed by the preprocessing step. X is laid out with the
# numeric columns first, followed by the categorical ones.
numeric_cols = ['Unit Price', 'Used_Discount']
categorical_cols = ['City', 'Coffee_Bean_Type', 'Price_Category', 'Bean_Category', 'Month']

# Feature matrix and regression target.
X = df[numeric_cols + categorical_cols]
y = df['Quantity']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column-wise preprocessing:
#   * numeric columns     -> missing values replaced by the column mean
#   * categorical columns -> one-hot encoded; categories not seen during
#     fitting are ignored instead of raising an error
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
)

# Candidate regressors keyed by display name. Insertion order is the order
# in which they are later trained and reported.
models = {}
models["Linear Regression (sklearn)"] = LinearRegression()
models["K-Nearest Neighbors"] = KNeighborsRegressor(n_neighbors=5)
# Seeded so the tree-based results are repeatable.
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, MSE, RMSE and R² for one model and return the scores.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        dict: The computed scores under the keys ``'MAE'``, ``'MSE'``,
        ``'RMSE'`` and ``'R2'``, so callers can compare models without
        parsing the printed report.
    """
    print(f"\n--- {model_name} ---")

    mae = mean_absolute_error(y_true, y_pred)
    # MSE is computed once and reused for the RMSE instead of being
    # recalculated from scratch.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"MAE:  {mae:.2f}")
    print(f"MSE:  {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²:   {r2:.2f}")

    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

# Train each candidate on the training split and report its test-set metrics.
for name, model in models.items():
    # Every regressor is wrapped with the same preprocessing step; the
    # pipeline fits that step together with the regressor on each pass.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])
    # fit() returns the pipeline itself, so prediction can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    evaluate_model(y_test, y_pred, name)



--- Linear Regression (sklearn) ---
MAE:  12.91
MSE:  224.64
RMSE: 14.99
R²:   -0.05

--- K-Nearest Neighbors ---
MAE:  13.39
MSE:  249.92
RMSE: 15.81
R²:   -0.17

--- Decision Tree ---
MAE:  16.36
MSE:  400.74
RMSE: 20.02
R²:   -0.87

--- Random Forest ---
MAE:  13.47
MSE:  253.36
RMSE: 15.92
R²:   -0.18
