In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

# Read the four project tables. Every file is resolved against base_path,
# which is an empty Path, i.e. the notebook's current working directory.
base_path = Path("")

tasks = pd.read_csv(base_path.joinpath("tasks.csv"))
change_orders = pd.read_csv(base_path.joinpath("change_orders.csv"))
risks = pd.read_csv(base_path.joinpath("risk_events.csv"))
inspections = pd.read_csv(base_path.joinpath("inspection_records.csv"))

# Hand-written stand-in for a real weather feed: one record per day, carrying
# only the columns the model consumes. Dates are kept as plain strings.
# weather_code examples: 61 = rain, 95 = storm, etc.
weather_data = pd.DataFrame(
    [
        ("2025-07-03", 12, 20, 5, 61, 28, 20, 5, 6, 80, 18),
        ("2025-07-10", 8, 12, 7, 95, 30, 21, 0, 0, 10, 25),
        ("2025-07-18", 15, 22, 6, 51, 26, 19, 0, 1, 20, 22),
        ("2025-07-27", 10, 15, 5, 61, 27, 18, 6, 7, 85, 19),
    ],
    columns=[
        "date",
        "wind_speed_10m_max",
        "wind_gusts_10m_max",
        "uv_index_max",
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
        "rain_sum",
        "precipitation_sum",
        "precipitation_probability_max",
        "shortwave_radiation_sum",
    ],
)

# Build the modelling frame by left-joining everything onto the task table, so
# every task row survives even when it has no change order, risk or inspection.
# Only `passed` and `date` are taken from the inspection records; that `date`
# is then the key used to attach the weather columns.
# NOTE(review): a left join on task_id repeats a task row once per matching row
# in the right-hand table - confirm each table has at most one row per task.
# NOTE(review): weather_data stores dates as 'YYYY-MM-DD' strings and the CSVs
# are read without date parsing, so the join presumably relies on exact string
# equality - verify the inspection date format.
df = (
    tasks
    .merge(change_orders, on="task_id", how="left")
    .merge(risks, on="task_id", how="left")
    .merge(inspections.loc[:, ["task_id", "passed", "date"]], on="task_id", how="left")
    .merge(weather_data, on="date", how="left")
)

# Demo-only imputation of missing values (presumably the gaps left by the left
# joins - verify): no extra duration, no cost impact, no risk, and a missing
# inspection result is treated as passed (1).
df = df.assign(
    added_duration=df["added_duration"].fillna(0),
    cost_impact=df["cost_impact"].fillna(0),
    risk_score=df["risk_score"].fillna(0),
    passed=df["passed"].fillna(1),
)

# Synthetic training target - NOT an observed delay. In a real project this
# would be derived from actual vs. planned start/end dates.
# The target is the sum of four terms:
#   * the added duration itself,
#   * 3 per unit of risk score,
#   * 2 when the inspection did not pass (passed == 0),
#   * 2 when precipitation_sum exceeds 5 (missing precipitation counts as 0).
df["delay"] = (
    df["added_duration"]
    + df["risk_score"] * 3
    + (1 - df["passed"]) * 2
    + df["precipitation_sum"].fillna(0).gt(5).astype(int) * 2
)

# Integer-encode the categorical columns, one dedicated LabelEncoder per column
# so each fitted encoder stays available under its own name.

# Task type is encoded as-is.
le_task_type = LabelEncoder()
df["task_type_enc"] = le_task_type.fit_transform(df["task_type"])

# Change type: missing values are mapped to the literal label "None" first.
# NOTE(review): this encoded column is not in the feature list visible in this
# cell - confirm whether that is intentional.
le_change_type = LabelEncoder()
df["change_type_enc"] = le_change_type.fit_transform(df["change_type"].fillna("None"))

# Risk type: same "None" placeholder for missing values (same note applies).
le_risk_type = LabelEncoder()
df["risk_type_enc"] = le_risk_type.fit_transform(df["risk_type"].fillna("None"))

# Model inputs, in the column order handed to the estimator.
# NOTE(review): added_duration, risk_score, passed and precipitation_sum are the
# same columns the synthetic `delay` target is computed from.
features = [
    # task, change-order, risk and inspection columns
    "task_type_enc",
    "duration_days",
    "weather_sensitive",
    "added_duration",
    "cost_impact",
    "risk_score",
    "passed",
    # weather columns
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "uv_index_max",
    "temperature_2m_max",
    "temperature_2m_min",
    "rain_sum",
    "precipitation_sum",
    "precipitation_probability_max",
    "shortwave_radiation_sum",
]

# Any value still missing in the feature matrix (e.g. weather columns on rows
# with no weather match) is replaced by 0 here.
X = df.loc[:, features].fillna(0)
y = df["delay"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split and
# the forest reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Pull the held-out rows out of the full frame (matched by index label) and
# attach the model's prediction next to the synthetic target.
df_test = df.loc[X_test.index].assign(predicted_delay=model.predict(X_test))

# Final expression of the cell: renders the comparison table.
df_test.loc[:, ["task_id", "task_name", "delay", "predicted_delay"]]


Unnamed: 0,task_id,task_name,delay,predicted_delay
7,T8,Ground Floor Walls,0.0,0.63
1,T2,Soil Compaction,0.0,0.676
