Reading Task Schedule

In [1]:
import pandas as pd

# Read CSV files
tasks_df = pd.read_csv('data/tasks.csv')
change_orders_df = pd.read_csv('data/change_orders.csv')
risk_events_df = pd.read_csv('data/risk_events.csv')
inspections_df = pd.read_csv('data/inspection_records.csv')
weather_history_df = pd.read_csv('data/weather_history.csv')
weather_history_2024_df = pd.read_csv('data/weather_history_2024.csv')
# Display the first few rows of each for confirmation
# print("Tasks Data:")
# print(tasks_df.head(), "\n")

# print("Change Orders Data:")
# print(change_orders_df.head(), "\n")

# print("Risk Events Data:")
# print(risk_events_df.head(), "\n")

# print("Inspection Records Data:")
# print(inspections_df.head(), "\n") 

# print("Weather History Data:")
# print(weather_history_df.head(), "\n")


In [2]:
# Compute priority score
tasks_df['priority_score'] = tasks_df['duration_days'] + tasks_df['type_weight'] + tasks_df['weather_weight']

# Sort tasks by priority (descending)
# df_sorted = tasks_df.sort_values(by='priority_score', ascending=False)

# Show the priority list
print(tasks_df[['task_id', 'task_name', 'priority_score']])

  task_id               task_name  priority_score
0      T1              Excavation              15
1      T2         Soil Compaction              13
2      T3         Foundation Pour              18
3      T4  Basement Waterproofing              12
4      T5             Backfilling              11
5      T6            Slab Casting              13
6      T7     Ground Floor Column               8
7      T8      Ground Floor Walls               9
8      T9               Roof Slab              10


In [2]:
from datetime import timedelta

# Set project_start if not already defined
project_start = pd.to_datetime(tasks_df['start_date']).min()

# Step 1: Load CSVs
changes_df = change_orders_df

# Step 2: Apply changes based on change orders
for _, change in changes_df.iterrows():
    task_id = change['task_id']
    added_duration = change['added_duration']

    # Check if task exists
    if task_id in tasks_df['task_id'].values:
        # Update the duration
        tasks_df.loc[tasks_df['task_id'] == task_id, 'duration_days'] += added_duration

        # Update the cost impact
        tasks_df.loc[tasks_df['task_id'] == task_id, 'actual_cost'] += change['cost_impact']

        # Optional: mark that a change has been applied
        tasks_df.loc[tasks_df['task_id'] == task_id, 'change_applied'] = change['change_id']
        print(f"✅ Change order {change['change_id']} applied to task {task_id}. Duration increased by {added_duration} days.")
    else:
        print(f"Warning: Task {task_id} in change order not found in tasks list.")
#
#
#
#
#
#
# Function to compute start and end dates considering dependencies
def compute_schedule(df, start_date):
    schedule = {}
    for idx, row in df.iterrows():
        dep = row['depends_on']
        duration = row['duration_days']
        risk_score = None

        print(f"Processing task {row['task_id']} with duration {duration} days and dependency {dep}")
        
        # Apply risk score impact
        risk = risk_events_df[risk_events_df['task_id'] == row['task_id']]
        if not risk.empty:
            risk_score = float(risk.iloc[0]['risk_score'])
            if risk_score >= 0.7:
                duration += 3
            elif risk_score >= 0.5:
                duration += 2
            elif risk_score >= 0.3:
                duration += 1

        print(f"Adjusted duration for task {row['task_id']} is now {duration} days due to risk score impact of {risk_score}")

        # Apply inspection failures
        inspection = inspections_df[(inspections_df['task_id'] == row['task_id']) & (inspections_df['passed'] == 0)]
        if not inspection.empty:
            duration += 2  # inspection failed

        print(f"Adjusted duration for task {row['task_id']} is now {duration} days due to inspection failures")
        print("-------------------------------------")

        if pd.isna(dep):
            s_date = start_date
        else:
            s_date = schedule[dep]['end_date'] + timedelta(days=1)
        e_date = s_date + timedelta(days=duration - 1)
        schedule[row['task_id']] = {
            'start_date': s_date,
            'end_date': e_date,
            'risk_score': risk_score,
            'adjusted_duration': duration
        }
    return schedule
#
#
#
#
#
#
#
schedule_map = compute_schedule(tasks_df, project_start)
print("Schedule computed successfully.", schedule_map)

# Add schedule to tasks_df
tasks_df['start_date'] = tasks_df['task_id'].apply(lambda x: schedule_map[x]['start_date'])
tasks_df['end_date'] = tasks_df['task_id'].apply(lambda x: schedule_map[x]['end_date'])
tasks_df['risk_score'] = tasks_df['task_id'].apply(lambda x: schedule_map[x]['risk_score'])
tasks_df['adjusted_duration'] = tasks_df['task_id'].apply(lambda x: schedule_map[x]['adjusted_duration'])

# Step 3: Fill NaN in 'change_applied' with 'None'
tasks_df['change_applied'] = tasks_df['change_applied'].fillna('None')

# Step 4: Save updated tasks
tasks_df.to_csv('result/updated_tasks.csv', index=False)

# Step 5: Print result
print("✅ Updated task list with change orders applied:\n")
# print(tasks_df)


✅ Change order C1 applied to task T3. Duration increased by 2 days.
✅ Change order C2 applied to task T6. Duration increased by 1 days.
✅ Change order C3 applied to task T9. Duration increased by 2 days.
Processing task T1 with duration 7 days and dependency nan
Adjusted duration for task T1 is now 10 days due to risk score impact of 0.7
Adjusted duration for task T1 is now 10 days due to inspection failures
-------------------------------------
Processing task T2 with duration 6 days and dependency T1
Adjusted duration for task T2 is now 6 days due to risk score impact of None
Adjusted duration for task T2 is now 6 days due to inspection failures
-------------------------------------
Processing task T3 with duration 11 days and dependency T2
Adjusted duration for task T3 is now 13 days due to risk score impact of 0.6
Adjusted duration for task T3 is now 15 days due to inspection failures
-------------------------------------
Processing task T4 with duration 5 days and dependency T3
Ad

In [3]:
# Ensure date columns are datetime
weather_df = weather_history_2024_df
weather_df['date'] = pd.to_datetime(weather_df['time'])
# print(tasks_df['start_date'], weather_df['date'])

# Step 2: Aggregate by date
daily_df = weather_df.groupby(weather_df['date'].dt.date).agg({
    'temperature_2m (°C)': 'mean',
    'weather_code (wmo code)': lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0],
    'rain (mm)': 'sum',
    # 'snowfall (cm)': 'sum',
    'precipitation_mm': 'mean',
    # 'apparent_temperature (°C)': 'mean',
    # 'relative_humidity_2m (%)': 'mean',
    # 'precipitation_probability (%)': 'max',
    # 'showers (mm)': 'sum',
    # 'snow_depth (m)': 'max',
    # 'pressure_msl (hPa)': 'mean',
    # 'surface_pressure (hPa)': 'mean',
    # 'visibility (m)': 'min',
    # 'evapotranspiration (mm)': 'sum',
    # 'et0_fao_evapotranspiration (mm)': 'sum',
    'wind_speed_10m (km/h)': 'mean',
    # 'wind_speed_80m (km/h)': 'mean',
    # 'wind_speed_120m (km/h)': 'mean',
    # 'wind_speed_180m (km/h)': 'mean',
    # 'wind_gusts_10m (km/h)': 'mean',
    # 'temperature_80m (°C)': 'mean',
    # 'temperature_120m (°C)': 'mean',
    # 'temperature_180m (°C)': 'mean',
    # 'soil_temperature_0cm (°C)': 'mean',
    # 'soil_temperature_6cm (°C)': 'mean',
    # 'soil_temperature_18cm (°C)': 'mean',
    # 'soil_moisture_0_to_1cm (m³/m³)': 'mean',
    # 'soil_moisture_1_to_3cm (m³/m³)': 'mean',
    # 'soil_moisture_3_to_9cm (m³/m³)': 'mean',
    # 'cloud_cover (%)': 'mean',
    # 'soil_moisture_9_to_27cm (m³/m³)': 'mean'
}).reset_index().rename(columns={'date': 'date'})


# Rename columns to match the aggregation results
daily_df = daily_df.rename(columns={
    'temperature_2m (°C)': 'temp_mean',
    'weather_code (wmo code)': 'weather_code_mode',
    'rain (mm)': 'rain_sum',
    # 'snowfall (cm)': 'snowfall_sum',
    'precipitation_mm': 'precipitation_mm',
    # 'apparent_temperature (°C)': 'apparent_temp_mean',
    # 'relative_humidity_2m (%)': 'humidity_mean',
    # 'precipitation_probability (%)': 'precip_prob_max',
    # 'showers (mm)': 'showers_sum',
    # 'snow_depth (m)': 'snow_depth_max',
    # 'pressure_msl (hPa)': 'pressure_msl_mean',
    # 'surface_pressure (hPa)': 'surface_pressure_mean',
    # 'visibility (m)': 'visibility_min',
    # 'evapotranspiration (mm)': 'evapotranspiration_sum',
    # 'et0_fao_evapotranspiration (mm)': 'et0_evapotranspiration_sum',
    'wind_speed_10m (km/h)': 'wind_mean',
    # 'wind_speed_80m (km/h)': 'wind_80m_mean',
    # 'wind_speed_120m (km/h)': 'wind_120m_mean',
    # 'wind_speed_180m (km/h)': 'wind_180m_mean',
    # 'wind_gusts_10m (km/h)': 'wind_gusts_mean',
    # 'temperature_80m (°C)': 'temp_80m_mean',
    # 'temperature_120m (°C)': 'temp_120m_mean',
    # 'temperature_180m (°C)': 'temp_180m_mean',
    # 'soil_temperature_0cm (°C)': 'soil_temp_0cm_mean',
    # 'soil_temperature_6cm (°C)': 'soil_temp_6cm_mean',
    # 'soil_temperature_18cm (°C)': 'soil_temp_18cm_mean',
    # 'soil_moisture_0_to_1cm (m³/m³)': 'soil_moisture_0_1_mean',
    # 'soil_moisture_1_to_3cm (m³/m³)': 'soil_moisture_1_3_mean',
    # 'soil_moisture_3_to_9cm (m³/m³)': 'soil_moisture_3_9_mean',
    # 'cloud_cover (%)': 'cloud_cover_mean',
    # 'soil_moisture_9_to_27cm (m³/m³)': 'soil_moisture_9_27_mean'
})

# In real use, replace this with actual delay data
# Let's simulate a delay if rain > 10mm or wind > 15km/h
# Add more weather parameters that might cause construction delays
# Delay if: rain_sum > 10mm, wind_mean > 15km/h, precipitation_mm > 0.5,
# temp_mean < 2°C or temp_mean > 35°C, or weather_code_mode in severe codes (e.g., 80, 95, 99)
severe_weather_codes = [80, 95, 99]  # heavy rain, thunderstorms, etc.

daily_df['delay'] = (
    (daily_df['rain_sum'] > 10) |
    (daily_df['wind_mean'] > 15) |
    (daily_df['precipitation_mm'] > 0.5) |
    (daily_df['temp_mean'] < 2) |
    (daily_df['temp_mean'] > 35) |
    (daily_df['weather_code_mode'].isin(severe_weather_codes))
).astype(int)

print("Daily Weather Data with Delays:")
print(daily_df.head())

daily_df.to_csv('result/daily_weather_aggregated.csv', index=False)
#
#
#
#

Daily Weather Data with Delays:
         date  temp_mean  weather_code_mode  rain_sum  precipitation_mm  \
0  2024-01-01   6.216667                  3       0.4          0.025000   
1  2024-01-02   4.804167                 61       7.6          0.316667   
2  2024-01-03   9.212500                  3       4.1          0.175000   
3  2024-01-04   4.170833                  3       2.1          0.095833   
4  2024-01-05   0.541667                  3       1.3          0.120833   

   wind_mean  delay  
0   9.695833      0  
1   9.716667      0  
2  14.883333      0  
3  14.062500      0  
4  11.166667      1  


In [6]:
# Ensure date columns are datetime
# tasks_df = pd.read_csv('data/tasks.csv')
tasks_df = pd.read_csv('result/updated_tasks.csv')
daily_weather_df = pd.read_csv('result/daily_weather_aggregated.csv')

tasks_df['start_date'] = pd.to_datetime(tasks_df['start_date'])
tasks_df['end_date'] = pd.to_datetime(tasks_df['end_date'])
weather_df = weather_history_2024_df
weather_df['date'] = pd.to_datetime(weather_df['time'])
daily_df['date'] = pd.to_datetime(weather_df['time'])
# Ensure daily_weather_df['date'] is datetime for comparison
daily_weather_df['date'] = pd.to_datetime(daily_weather_df['date'])
# print(tasks_df['start_date'], weather_df['date'])

#
#
#
#
# Function to predict weather delays and adjust task durations based on weather conditions
def predict_weather_delays(tasks_df, daily_weather_df):
    tasks_df['weather_delay_days'] = 0
    for index, task in tasks_df.iterrows():
        if task['weather_sensitive'] == 1:
            task_weather = daily_weather_df[(daily_weather_df['date'] >= task['start_date']) & 
                                      (daily_weather_df['date'] <= task['end_date'])]
            # print(task_weather)
            # delay_days = (((task_weather['precipitation_mm'] >= 2).sum()) / task_weather['precipitation_mm'].count())
    #         print("hi",(task_weather['precipitation_mm'] >= 2).sum(),task_weather['precipitation_mm'].count())
            delay_days = task_weather['delay'].sum()
            print(f"Task {task['task_id']} weather delay: {delay_days} days due to precipitation, Rain, and Wind.")
            tasks_df.at[index, 'weather_delay_days'] = delay_days
    tasks_df['new_duration'] = tasks_df['duration_days'] + tasks_df['weather_delay_days']
    return tasks_df

# Function to recalculate schedule
def recalculate_schedule(tasks_df):
    tasks_df['new_start_date'] = pd.NaT
    tasks_df['new_end_date'] = pd.NaT
    for index, task in tasks_df.iterrows():
        if pd.isna(task['depends_on']):
            tasks_df.at[index, 'new_start_date'] = task['start_date']
        else:
            predecessor_end = tasks_df[tasks_df['task_id'] == task['depends_on']]['new_end_date'].iloc[0]
            tasks_df.at[index, 'new_start_date'] = predecessor_end + timedelta(days=1)
        tasks_df.at[index, 'new_end_date'] = tasks_df.at[index, 'new_start_date'] + timedelta(days=task['new_duration'] - 1)
    return tasks_df


# Apply weather delays
tasks_df = predict_weather_delays(tasks_df, daily_weather_df)

# Recalculate schedule
tasks_df = recalculate_schedule(tasks_df)

# Format dates for output
tasks_df['new_start_date'] = tasks_df['new_start_date'].dt.strftime('%Y-%m-%d')
tasks_df['new_end_date'] = tasks_df['new_end_date'].dt.strftime('%Y-%m-%d')

# Print updated schedule
print("Updated Project Schedule with Weather Delays:")
print(tasks_df[['task_id', 'task_name', 'new_start_date', 'new_end_date', 'new_duration']])

# Calculate total project delay
original_end = tasks_df['end_date'].max()
new_end = pd.to_datetime(tasks_df['new_end_date'].max())
total_delay = (new_end - original_end).days
print(f"\nTotal Project Delay due to Weather: {total_delay} days")


Task T1 weather delay: 2 days due to precipitation, Rain, and Wind.
Task T2 weather delay: 2 days due to precipitation, Rain, and Wind.
Task T3 weather delay: 1 days due to precipitation, Rain, and Wind.
Task T4 weather delay: 0 days due to precipitation, Rain, and Wind.
Task T5 weather delay: 1 days due to precipitation, Rain, and Wind.
Task T6 weather delay: 0 days due to precipitation, Rain, and Wind.
Updated Project Schedule with Weather Delays:
  task_id               task_name new_start_date new_end_date  new_duration
0      T1              Excavation     2024-05-01   2024-05-09             9
1      T2         Soil Compaction     2024-05-10   2024-05-17             8
2      T3         Foundation Pour     2024-05-18   2024-05-29            12
3      T4  Basement Waterproofing     2024-05-30   2024-06-03             5
4      T5             Backfilling     2024-06-04   2024-06-08             5
5      T6            Slab Casting     2024-06-09   2024-06-15             7
6      T7     

In [7]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

tasks_df = pd.read_csv('data/tasks_2025.csv')

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"
params = {
	"latitude": 52.52,
	"longitude": 13.41,
	"daily": ["sunset", "sunrise", "wind_speed_10m_max", "wind_gusts_10m_max", "uv_index_max", "weather_code", "temperature_2m_max", "temperature_2m_min", "apparent_temperature_max", "apparent_temperature_min", "daylight_duration", "sunshine_duration", "uv_index_clear_sky_max", "rain_sum", "showers_sum", "snowfall_sum", "precipitation_sum", "precipitation_hours", "precipitation_probability_max", "wind_direction_10m_dominant", "shortwave_radiation_sum", "et0_fao_evapotranspiration"],
	"timezone": "Europe/London"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_sunset = daily.Variables(0).ValuesInt64AsNumpy()
daily_sunrise = daily.Variables(1).ValuesInt64AsNumpy()
daily_wind_speed_10m_max = daily.Variables(2).ValuesAsNumpy()
daily_wind_gusts_10m_max = daily.Variables(3).ValuesAsNumpy()
daily_uv_index_max = daily.Variables(4).ValuesAsNumpy()
daily_weather_code = daily.Variables(5).ValuesAsNumpy()
daily_temperature_2m_max = daily.Variables(6).ValuesAsNumpy()
daily_temperature_2m_min = daily.Variables(7).ValuesAsNumpy()
# daily_apparent_temperature_max = daily.Variables(8).ValuesAsNumpy()
# daily_apparent_temperature_min = daily.Variables(9).ValuesAsNumpy()
# daily_daylight_duration = daily.Variables(10).ValuesAsNumpy()
# daily_sunshine_duration = daily.Variables(11).ValuesAsNumpy()
# daily_uv_index_clear_sky_max = daily.Variables(12).ValuesAsNumpy()
daily_rain_sum = daily.Variables(13).ValuesAsNumpy()
daily_showers_sum = daily.Variables(14).ValuesAsNumpy()
daily_snowfall_sum = daily.Variables(15).ValuesAsNumpy()
daily_precipitation_sum = daily.Variables(16).ValuesAsNumpy()
# daily_precipitation_hours = daily.Variables(17).ValuesAsNumpy()
# daily_precipitation_probability_max = daily.Variables(18).ValuesAsNumpy()
# daily_wind_direction_10m_dominant = daily.Variables(19).ValuesAsNumpy()
# daily_shortwave_radiation_sum = daily.Variables(20).ValuesAsNumpy()
# daily_et0_fao_evapotranspiration = daily.Variables(21).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}

daily_data["sunset"] = daily_sunset
daily_data["sunrise"] = daily_sunrise
daily_data["wind_speed_10m_max"] = daily_wind_speed_10m_max
daily_data["wind_gusts_10m_max"] = daily_wind_gusts_10m_max
daily_data["uv_index_max"] = daily_uv_index_max
daily_data["weather_code"] = daily_weather_code
daily_data["temperature_2m_max"] = daily_temperature_2m_max
daily_data["temperature_2m_min"] = daily_temperature_2m_min
# daily_data["apparent_temperature_max"] = daily_apparent_temperature_max
# daily_data["apparent_temperature_min"] = daily_apparent_temperature_min
# daily_data["daylight_duration"] = daily_daylight_duration
# daily_data["sunshine_duration"] = daily_sunshine_duration
# daily_data["uv_index_clear_sky_max"] = daily_uv_index_clear_sky_max
daily_data["rain_sum"] = daily_rain_sum
daily_data["showers_sum"] = daily_showers_sum
daily_data["snowfall_sum"] = daily_snowfall_sum
daily_data["precipitation_sum"] = daily_precipitation_sum
# daily_data["precipitation_hours"] = daily_precipitation_hours
# daily_data["precipitation_probability_max"] = daily_precipitation_probability_max
# daily_data["wind_direction_10m_dominant"] = daily_wind_direction_10m_dominant
# daily_data["shortwave_radiation_sum"] = daily_shortwave_radiation_sum
# daily_data["et0_fao_evapotranspiration"] = daily_et0_fao_evapotranspiration

daily_dataframe = pd.DataFrame(data = daily_data)
daily_dataframe['time'] = pd.to_datetime(daily_dataframe['date'])
# print(daily_dataframe)

def predict_weather_delays_live(tasks_df, weather_info_df):
    tasks_df['weather_delay_days'] = 0
    for index, task in tasks_df.iterrows():
        if task['weather_sensitive'] == 1:
            task_weather = weather_info_df[(weather_info_df['date'] >= task['start_date']) & 
                                      (weather_info_df['date'] <= task['end_date'])]
            # print(task_weather)
            delay_days = ((task_weather['precipitation_sum'] >= 2).sum())
            print("hi", task_weather['date'],'--------------------', delay_days)
            # print(f"Task {task['task_id']} weather delay: {delay_days} days due to precipitation, Rain, and Wind.")
            tasks_df.at[index, 'weather_delay_days'] = delay_days
    tasks_df['new_duration'] = tasks_df['duration_days'] + tasks_df['weather_delay_days']
    return tasks_df


# Function to recalculate schedule
def recalculate_schedule(tasks_df):
    tasks_df['new_start_date'] = pd.NaT
    tasks_df['new_end_date'] = pd.NaT
    for index, task in tasks_df.iterrows():
        if pd.isna(task['depends_on']):
            tasks_df.at[index, 'new_start_date'] = task['start_date']
        else:
            predecessor_end = tasks_df[tasks_df['task_id'] == task['depends_on']]['new_end_date'].iloc[0]
            tasks_df.at[index, 'new_start_date'] = predecessor_end + timedelta(days=1)
        tasks_df.at[index, 'new_end_date'] = tasks_df.at[index, 'new_start_date'] + timedelta(days=task['new_duration'] - 1)
    return tasks_df


# Apply weather delays
tasks_df = predict_weather_delays_live(tasks_df, daily_dataframe)

# Recalculate schedule
tasks_df = recalculate_schedule(tasks_df)

# Format dates for output
tasks_df['new_start_date'] = tasks_df['new_start_date'].dt.strftime('%Y-%m-%d')
tasks_df['new_end_date'] = tasks_df['new_end_date'].dt.strftime('%Y-%m-%d')

# Print updated schedule
print("Updated Project Schedule with Weather Delays:")
# print(tasks_df[['task_id', 'task_name','duration_days', 'new_start_date', 'new_end_date', 'new_duration']])
print(tasks_df[[ 'task_name','duration_days', 'new_start_date', 'new_end_date', 'new_duration']])

# # Calculate total project delay
# original_end = tasks_df['end_date'].max()
# new_end = pd.to_datetime(tasks_df['new_end_date'].max())
# original_end_ts = pd.to_datetime(original_end)
# total_delay = (new_end - original_end_ts).days
# print(f"\nTotal Project Delay due to Weather: {total_delay} days")


Coordinates 52.52000045776367°N 13.419998168945312°E
Elevation 38.0 m asl
Timezone b'Europe/London'b'GMT+1'
Timezone difference to GMT+0 3600 s
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
hi Series([], Name: date, dtype: datetime64[ns, UTC]) -------------------- 0
Updated Project Schedule with Weather Delays:
                task_name  duration_days new_start_date new_end_date  \
0              Excavation              7     2025-05-01   2025-05-07   
1         Soil Compaction              6     2025-05-08   2025-05-13   
2         Foundation Pour              9     2025-05-14   2025-05-22   
3  Basement Waterproofing              5     2025-05-23   20

In [7]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Generate dummy weather data for one month
def generate_dummy_weather_data(start_date='2025-05-01', days=31):
    data = []
    base_date = datetime.strptime(start_date, '%Y-%m-%d')
    for i in range(days):
        date = base_date + timedelta(days=i)
        row = {
            'date': date.strftime('%Y-%m-%d'),
            'temperature_max': round(np.random.uniform(10, 35), 1),
            'temperature_min': round(np.random.uniform(0, 15), 1),
            'rainfall': round(np.random.uniform(0, 20), 2),
            'windspeed': round(np.random.uniform(0, 25), 1),
            'humidity': round(np.random.uniform(30, 100), 1),
            'weather_code': random.choice([1, 2, 3, 45, 61, 95, 80]),
            'snowfall': round(np.random.uniform(0, 5), 2),
            'delayed': random.choice([0, 1])  # 0 = No Delay, 1 = Delay
        }
        data.append(row)
    return pd.DataFrame(data)




# Generate data
weather_df = generate_dummy_weather_data()
# weather_df = weather_history_2024_df

# Features and labels
features = ['temperature_max', 'temperature_min', 'rainfall', 'windspeed', 'humidity', 'weather_code', 'snowfall']
X = weather_df[features]
y = weather_df['delayed']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Output results
log_report = classification_report(y_test, y_pred_log, output_dict=True)
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)

weather_df.head(), log_report, rf_report


(         date  temperature_max  temperature_min  rainfall  windspeed  \
 0  2025-05-01             21.5              1.5      4.87        9.3   
 1  2025-05-02             13.3              2.5     17.74        2.0   
 2  2025-05-03             15.5             14.5      0.86       10.6   
 3  2025-05-04             28.5              0.1     12.08        8.3   
 4  2025-05-05             22.3              6.6      7.60       21.5   
 
    humidity  weather_code  snowfall  delayed  
 0      35.7            45      1.95        1  
 1      90.0            61      4.23        0  
 2      68.6             3      0.13        1  
 3      63.4            80      2.60        0  
 4      53.5            95      0.96        0  ,
 {'0': {'precision': 0.8333333333333334,
   'recall': 0.8333333333333334,
   'f1-score': 0.8333333333333334,
   'support': 6.0},
  '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0},
  'accuracy': 0.7142857142857143,
  'macro avg': {'precision': 0.41

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Generate dummy hourly weather data for January 2024
start_date = datetime(2024, 1, 1)
end_date = start_date + timedelta(days=31)
time_index = pd.date_range(start=start_date, end=end_date - timedelta(seconds=1), freq='H')
n_rows = len(time_index)

data = {
    'time': time_index,
    'temperature_2m (°C)': np.random.uniform(-5, 5, n_rows),
    'weather_code (wmo code)': np.random.randint(0, 100, n_rows),
    'rain (mm)': np.random.uniform(0, 5/12, n_rows),  # Adjusted for balance
    'snowfall (cm)': np.random.uniform(0, 1/12, n_rows),  # Adjusted for balance
    'precipitation (mm)': np.random.uniform(0, 5/12, n_rows),
    'apparent_temperature (°C)': np.random.uniform(-5, 5, n_rows),
    'relative_humidity_2m (%)': np.random.uniform(60, 100, n_rows),
    'precipitation_probability (%)': np.random.uniform(0, 100, n_rows),
    'showers (mm)': np.random.uniform(0, 3/12, n_rows),
    'snow_depth (m)': np.random.uniform(0, 0.09, n_rows),  # Adjusted to be < 0.1
    'pressure_msl (hPa)': np.random.uniform(990, 1010, n_rows),
    'surface_pressure (hPa)': np.random.uniform(990, 1010, n_rows),
    'visibility (m)': np.random.uniform(1000, 10000, n_rows),  # Adjusted to be > 1000
    'evapotranspiration (mm)': np.random.uniform(0, 2, n_rows),
    'et0_fao_evapotranspiration (mm)': np.random.uniform(0, 2, n_rows),
    'wind_speed_10m (km/h)': np.random.uniform(0, 40, n_rows),  # Adjusted to be < 50
    'wind_speed_80m (km/h)': np.random.uniform(0, 50, n_rows),
    'wind_speed_120m (km/h)': np.random.uniform(0, 60, n_rows),
    'wind_speed_180m (km/h)': np.random.uniform(0, 70, n_rows),
    'wind_gusts_10m (km/h)': np.random.uniform(0, 40, n_rows),
    'temperature_80m (°C)': np.random.uniform(-5, 5, n_rows),
    'temperature_120m (°C)': np.random.uniform(-5, 5, n_rows),
    'temperature_180m (°C)': np.random.uniform(-5, 5, n_rows),
    'soil_temperature_0cm (°C)': np.random.uniform(0, 5, n_rows),
    'soil_temperature_6cm (°C)': np.random.uniform(0, 5, n_rows),
    'soil_temperature_18cm (°C)': np.random.uniform(0, 5, n_rows),
    'soil_moisture_0_to_1cm (m³/m³)': np.random.uniform(0.2, 0.4, n_rows),
    'soil_moisture_1_to_3cm (m³/m³)': np.random.uniform(0.2, 0.4, n_rows),
    'soil_moisture_3_to_9cm (m³/m³)': np.random.uniform(0.2, 0.4, n_rows),
    'cloud_cover (%)': np.random.uniform(0, 100, n_rows),
    'soil_moisture_9_to_27cm (m³/m³)': np.random.uniform(0.2, 0.4, n_rows),
}

df = pd.DataFrame(data)
df.set_index('time', inplace=True)
# Step 2: Aggregate hourly data to daily data
aggregations = {
    'temperature_2m (°C)': 'mean',
    'weather_code (wmo code)': 'mean',
    'rain (mm)': 'sum',
    'snowfall (cm)': 'sum',
    'precipitation (mm)': 'sum',
    'apparent_temperature (°C)': 'mean',
    'relative_humidity_2m (%)': 'mean',
    'precipitation_probability (%)': 'mean',
    'showers (mm)': 'sum',
    'snow_depth (m)': 'max',
    'pressure_msl (hPa)': 'mean',
    'surface_pressure (hPa)': 'mean',
    'visibility (m)': 'min',
    'evapotranspiration (mm)': 'sum',
    'et0_fao_evapotranspiration (mm)': 'sum',
    'wind_speed_10m (km/h)': 'max',
    'wind_speed_80m (km/h)': 'max',
    'wind_speed_120m (km/h)': 'max',
    'wind_speed_180m (km/h)': 'max',
    'wind_gusts_10m (km/h)': 'max',
    'temperature_80m (°C)': 'mean',
    'temperature_120m (°C)': 'mean',
    'temperature_180m (°C)': 'mean',
    'soil_temperature_0cm (°C)': 'mean',
    'soil_temperature_6cm (°C)': 'mean',
    'soil_temperature_18cm (°C)': 'mean',
    'soil_moisture_0_to_1cm (m³/m³)': 'mean',
    'soil_moisture_1_to_3cm (m³/m³)': 'mean',
    'soil_moisture_3_to_9cm (m³/m³)': 'mean',
    'cloud_cover (%)': 'mean',
    'soil_moisture_9_to_27cm (m³/m³)': 'mean',
}

daily_df = df.resample('D').agg(aggregations)

# Step 3: Define the target variable 'suitable'
def is_suitable(row):
    if row['rain (mm)'] > 5 or \
       row['snowfall (cm)'] > 1 or \
       row['temperature_2m (°C)'] < 0 or row['temperature_2m (°C)'] > 35 or \
       row['wind_speed_10m (km/h)'] > 50 or \
       row['visibility (m)'] < 1000 or \
       row['snow_depth (m)'] > 0.1:
        return 0  # unsuitable
    else:
        return 1  # suitable

daily_df['suitable'] = daily_df.apply(is_suitable, axis=1)

# Step 4: Select features and target
features = [
    'temperature_2m (°C)',
    'rain (mm)',
    'snowfall (cm)',
    'precipitation (mm)',
    'relative_humidity_2m (%)',
    'wind_speed_10m (km/h)',
    'visibility (m)',
    'snow_depth (m)',
]

X = daily_df[features]
y = daily_df['suitable']

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

print("Logistic Regression Coefficients:")
for feature, coef in zip(features, logreg.coef_[0]):
    print(f"{feature}: {coef}")

# Step 7: Train random forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

print("Random Forest Feature Importances:")
for feature, importance in zip(features, rf.feature_importances_):
    print(f"{feature}: {importance}")

# Step 8: Identify days predicted as unsuitable
daily_df['predicted_suitable'] = rf.predict(X)
unsuitable_days = daily_df[daily_df['predicted_suitable'] == 0][['temperature_2m (°C)', 'rain (mm)', 'snowfall (cm)', 'wind_speed_10m (km/h)', 'visibility (m)', 'snow_depth (m)']]
print("\nDays Predicted as Unsuitable for Construction:")
print(unsuitable_days)

  time_index = pd.date_range(start=start_date, end=end_date - timedelta(seconds=1), freq='H')
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           0       0.83      0.83      0.83         6
           1       0.00      0.00      0.00         1

    accuracy                           0.71         7
   macro avg       0.42      0.42      0.42         7
weighted avg       0.71      0.71      0.71         7

Logistic Regression Coefficients:
temperature_2m (°C): 0.3989922259346297
rain (mm): -0.7476631074532596
snowfall (cm): -0.2753726825933657
precipitation (mm): -0.15641620445347068
relative_humidity_2m (%): -0.21689749694839813
wind_speed_10m (km/h): 0.3224632506847265
visibility (m): 0.004274748041129992
snow_depth (m): -7.073690999652993e-05
Random Forest Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       0.00      0.00      0.00         1

    accuracy                           0.86         7
   macro avg     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
# import seaborn as sns

# 1. Load the data
df = pd.read_csv('data/weather_history_2024.csv')

# 2. Select relevant features
selected_features = [
    'temperature_2m',
    'rain',
    'snowfall',
    'precipitation_probability',
    'relative_humidity_2m',
    'wind_speed_10m',
    'wind_gusts_10m',
    'visibility',
    'cloud_cover',
    'soil_moisture_0_to_1cm'
]

# 3. Drop rows with missing values (or you can impute them)
df = df[selected_features + ['delayed']].dropna()

# 4. Split into features and labels
X = df[selected_features]
y = df['delayed']

# 5. Normalize features (optional for tree models, good for LR)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 6. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# --- MODEL OPTION 1: Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# --- MODEL OPTION 2: Random Forest (More robust for non-linear features)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 7. Evaluation
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))
print("\nRandom Forest Results:")
print(classification_report(y_test, y_pred_rf))

# 8. Confusion Matrix for RF
# plt.figure(figsize=(6,4))
# sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Blues')
# plt.title('Random Forest Confusion Matrix')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.tight_layout()
# plt.show()


KeyError: "None of [Index(['temperature_2m', 'rain', 'snowfall', 'precipitation_probability',\n       'relative_humidity_2m', 'wind_speed_10m', 'wind_gusts_10m',\n       'visibility', 'cloud_cover', 'soil_moisture_0_to_1cm', 'delayed'],\n      dtype='object')] are in the [columns]"