In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline: load the raw CSV, clean the categorical and numeric
# columns, scale the numeric features, one-hot encode the categoricals and
# finally separate the feature matrix (X) from the target (y).

# STEP 0: Load your dataset
df = pd.read_csv('train.csv')

# STEP 1: Normalize casing and trim
df['staff_experience'] = df['staff_experience'].str.strip().str.lower()
df['waste_category'] = df['waste_category'].str.strip().str.lower()

# STEP 2: Fill missing values (explicit assignment avoids chained assignment warning)
most_common_experience = df['staff_experience'].mode()[0]
df['staff_experience'] = df['staff_experience'].fillna(most_common_experience)

# STEP 3: Cap extreme temperature values safely
df['temperature_C'] = df['temperature_C'].clip(lower=5, upper=40)

# STEP 4: Ensure numeric types then scale numerical features (with .loc)
numerical_cols = ['meals_served', 'kitchen_staff', 'temperature_C',
                  'humidity_percent', 'past_waste_kg']
# Cast first so the .loc assignment below writes floats into float columns.
df[numerical_cols] = df[numerical_cols].astype(float)

# Fit the scaler on the training rows ONLY. Fitting on the full frame lets the
# validation rows influence the mean/variance (data leakage) and bakes those
# statistics into the saved scaler.
# The row positions are drawn with the same test_size / random_state as the
# train/validation split in the modelling cell; train_test_split shuffles based
# on the number of rows and the seed, so the rows selected here should be the
# ones the model is later trained on. Keep the two calls in sync.
train_rows, _ = train_test_split(list(range(len(df))), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(df[numerical_cols].iloc[train_rows])
# Apply the training-set statistics to every row (train and validation alike).
df.loc[:, numerical_cols] = scaler.transform(df[numerical_cols])

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# STEP 5: One-Hot Encode categorical variables (explicit DataFrame assignment)
# drop_first=True drops one dummy column per categorical variable.
df = pd.get_dummies(df, columns=['staff_experience', 'waste_category'], drop_first=True)

# STEP 6: Drop irrelevant columns
# errors='ignore' keeps this step from failing when either column is absent.
df = df.drop(columns=['ID', 'date'], errors='ignore')

# Step 7: Separate features and target label
X = df.drop(columns=['food_waste_kg'])
y = df['food_waste_kg']


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Hold out 20% of the rows for validation; the other 80% are used for training.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a seeded 100-tree random forest and fit it in a single expression
# (fit() returns the estimator itself).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# Report both metrics rounded to two decimals.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")



Mean Absolute Error (MAE): 4.70
R² Score: 0.91


In [3]:
import joblib

# Persist the fitted artifacts to disk: first the random forest, then the
# scaler. The scaler dump stays last so it remains the cell's displayed value.
joblib.dump(rf_model, filename="model.pkl")
joblib.dump(scaler, filename="scaler.pkl")


['scaler.pkl']