In [7]:
%pip install pandas numpy scikit-learn lightgbm

Note: you may need to restart the kernel to use updated packages.


In [15]:
import numpy as np
import pandas as pd
import sys
from lightgbm import LGBMRegressor
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
BASE_DIR = Path("..").resolve()
sys.path.append(str(BASE_DIR))
DATA_DIR = BASE_DIR / "data"
from utils.features import BASE_FEATURES, add_engineered_features
print("Base dir:", BASE_DIR)
print("Data dir:", DATA_DIR)


Base dir: C:\Users\newma\Desktop\Crew_Size_and_Hrs_predcition
Data dir: C:\Users\newma\Desktop\Crew_Size_and_Hrs_predcition\data


In [16]:
def simulate_crew_hours_data(n_samples: int=5000,random_state: int=42)->pd.DataFrame:
    rng=np.random.default_rng(random_state)
    # job size and rooms
    job_size_sqft=rng.integers(300,3500,size=n_samples)
    # number of rooms
    num_rooms=np.clip((job_size_sqft/350+rng.normal(0,1,size=n_samples)).round().astype(int),1,8)

    num_heavy_items=np.clip(rng.poisson(lam=num_rooms*0.8),0,30)

    num_light_items=np.clip(rng.poisson(lam=num_rooms*4.0),5,200)

    # distance
    distance_km = rng.gamma(shape=2.0, scale=5.0, size=n_samples)
    distance_km = np.clip(distance_km, 1, 80)

# building info
    floor_number = rng.integers(0, 12, size=n_samples)
    prob_elevator = np.clip(0.1 + floor_number * 0.08, 0, 0.95)
    has_elevator = rng.binomial(1, prob_elevator).astype(bool)

    # latent complexity
    complexity = (
        0.0015 * job_size_sqft
        + 0.6 * num_rooms
        + 0.7 * num_heavy_items
        + 0.1 * num_light_items
        + 0.5 * np.maximum(floor_number - 2, 0)
        + 0.2 * distance_km
        - 2.0 * has_elevator.astype(int)
    )

    # crew size
    raw_crew = 1.5 + 0.015 * complexity + rng.normal(0, 0.7, size=n_samples)
    crew_size_true = np.clip(raw_crew.round(), 2, 8).astype(int)

    # hours
    raw_hours = 1.5 + 0.05 * complexity + rng.normal(0, 1.5, size=n_samples)
    hours_true = np.clip(raw_hours, 2, 16)

    # past averages
    past_avg_hours = np.clip(
        hours_true + rng.normal(0, 1.0, size=n_samples),
        1.5,
        18
    )

    past_avg_crew_size = np.clip(
        crew_size_true + rng.normal(0, 0.5, size=n_samples),
        1,
        10
    )

    df = pd.DataFrame({
        "job_size_sqft": job_size_sqft,
        "num_rooms": num_rooms,
        "num_heavy_items": num_heavy_items,
        "num_light_items": num_light_items,
        "distance_km": distance_km,
        "floor_number": floor_number,
        "has_elevator": has_elevator,
        "past_avg_hours": past_avg_hours,
        "past_avg_crew_size": past_avg_crew_size,
        "crew_size": crew_size_true,
        "hours_required": hours_true,
    })

    return df
    
    
    

In [17]:
# CELL 3: generate and save data

df = simulate_crew_hours_data(n_samples=5000, random_state=42)

print(df.head())
print(df.describe(include="all"))

csv_path = DATA_DIR / "crew_hours_synthetic.csv"
df.to_csv(csv_path, index=False)
print("Saved dataset to:", csv_path)


   job_size_sqft  num_rooms  num_heavy_items  num_light_items  distance_km  \
0            585          1                2                5    10.779912   
1           2776          7                0               23    11.234749   
2           2394          8                6               39     5.536778   
3           1704          6                3               29    10.749851   
4           1685          4                4                9    17.881982   

   floor_number  has_elevator  past_avg_hours  past_avg_crew_size  crew_size  \
0             9          True        3.672054            1.508863          2   
1             5         False        4.972496            2.442126          2   
2             5          True        2.652201            2.849331          2   
3             0         False        2.855227            1.989930          2   
4             7          True        2.929107            1.860670          2   

   hours_required  
0        2.000000  
1        5

In [18]:
# CELL 4: basic preprocessing and train test split

feature_cols = [
    "job_size_sqft",
    "num_rooms",
    "num_heavy_items",
    "num_light_items",
    "distance_km",
    "floor_number",
    "has_elevator",
    "past_avg_hours",
    "past_avg_crew_size",
]

target_cols = ["crew_size", "hours_required"]

X = df[feature_cols].copy()
y = df[target_cols].copy()

X["has_elevator"] = X["has_elevator"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((4000, 9), (1000, 9), (4000, 2), (1000, 2))

In [19]:
# Apply feature engineering

X_train_fe = add_engineered_features(X_train)
X_test_fe = add_engineered_features(X_test)

print("X_train_fe shape:", X_train_fe.shape)
print("X_test_fe shape:", X_test_fe.shape)
X_train_fe.head()


X_train_fe shape: (4000, 14)
X_test_fe shape: (1000, 14)


Unnamed: 0,job_size_sqft,num_rooms,num_heavy_items,num_light_items,distance_km,floor_number,has_elevator,past_avg_hours,past_avg_crew_size,heavy_items_per_room,light_items_per_room,sqft_per_room,floor_penalty,elevator_relief
4227,2261,5,8,21,5.985698,0,0,1.5,2.585265,1.333333,3.5,376.833333,0,0
4676,2077,4,5,20,6.204235,3,1,1.555932,1.50837,1.0,4.0,415.4,1,1
800,338,1,0,5,3.411652,6,1,1.5,1.8644,0.0,2.5,169.0,4,4
3671,604,1,0,5,12.117298,1,0,1.5,1.71343,0.0,2.5,302.0,0,0
4193,424,2,1,7,5.76193,5,1,1.522349,2.654865,0.333333,2.333333,141.333333,3,3


In [20]:
# two separate regression targets
y_hours_train = y_train["hours_required"]
y_hours_test = y_test["hours_required"]

y_crew_train = y_train["crew_size"]
y_crew_test = y_test["crew_size"]

In [21]:
# LightGBM model for hours prediction
model_hours = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42
)

In [22]:
# LightGBM model for crew size prediction
model_crew = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42
)


In [24]:
print("Training hours model...")
model_hours.fit(X_train_fe, y_hours_train)


Training hours model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1532
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 14
[LightGBM] [Info] Start training from score 2.709609


In [25]:
print("Training crew size model...")
model_crew.fit(X_train_fe, y_crew_train)

Training crew size model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1532
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 14
[LightGBM] [Info] Start training from score 2.139000


In [26]:
# Predict on test set
pred_hours = model_hours.predict(X_test_fe)
pred_crew = model_crew.predict(X_test_fe)

# Metrics for hours
mae_hours = mean_absolute_error(y_hours_test, pred_hours)
rmse_hours = np.sqrt(mean_squared_error(y_hours_test, pred_hours))

# Metrics for crew size
mae_crew = mean_absolute_error(y_crew_test, pred_crew)
rmse_crew = np.sqrt(mean_squared_error(y_crew_test, pred_crew))

print("===== HOURS REQUIRED MODEL =====")
print("MAE :", round(mae_hours, 3))
print("RMSE:", round(rmse_hours, 3))

print("\n===== CREW SIZE MODEL =====")
print("MAE :", round(mae_crew, 3))
print("RMSE:", round(rmse_crew, 3))


===== HOURS REQUIRED MODEL =====
MAE : 0.498
RMSE: 0.675

===== CREW SIZE MODEL =====
MAE : 0.143
RMSE: 0.28


In [28]:
import joblib

joblib.dump(model_hours, "../models/model_hours.pkl")
joblib.dump(model_crew, "../models/model_crew.pkl")

print("Models saved successfully as .pkl files.")


Models saved successfully as .pkl files.
