In [1]:
# weather_fetch_safe.py
import requests
import pandas as pd
import os
from datetime import datetime, timedelta
import time

# Endpoint that the fetch loop below sends its GET requests to.
API_URL = "https://archive-api.open-meteo.com/v1/archive"
# Coordinates sent as the "latitude"/"longitude" request parameters.
# NOTE(review): confirm these point at the intended site — latitude 5.3173 with
# longitude 75.7139 may be a typo (e.g. a dropped leading digit); verify before
# trusting the downloaded series.
LATITUDE = 5.3173
LONGITUDE = 75.7139
# Output location; the directory is created on execution if it does not exist.
OUT_DIR = "data/raw"
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "weather_historical.csv")

# Date range to fetch. START_DATE is fixed; END_DATE is today's date at run time,
# so the fetched range (and the saved row count) changes from run to run.
# Adjust START_DATE if you really want "last 10 years".
START_DATE = "1998-01-01"
END_DATE = datetime.now().strftime("%Y-%m-%d")

# Daily variables requested from the API, joined into a single comma-separated
# string that is passed as the "daily" request parameter.
daily_vars = ",".join([
    "temperature_2m_max",
    "temperature_2m_min",
    "precipitation_sum",
    "rain_sum",
    "sunshine_duration"
])

def daterange_year_chunks(start_date_str, end_date_str):
    """Split an inclusive date range into calendar-year chunks.

    Args:
        start_date_str: First day of the range, as an ISO "YYYY-MM-DD" string.
        end_date_str: Last day of the range (inclusive), as an ISO string.

    Yields:
        (chunk_start, chunk_end) tuples of "YYYY-MM-DD" strings. Each chunk ends
        on December 31 of the chunk's year or on the overall end date, whichever
        comes first. Nothing is yielded when the start is after the end.

    Raises:
        ValueError: If either string cannot be parsed by datetime.fromisoformat.
    """
    start = datetime.fromisoformat(start_date_str)
    end = datetime.fromisoformat(end_date_str)
    cur = start
    # `<=` (not `<`) so that a chunk beginning exactly on the end date is still
    # emitted — otherwise a range ending on January 1, or a single-day range,
    # would silently lose its last day.
    while cur <= end:
        # chunk by one calendar year, clamped to the overall end date
        chunk_end = min(datetime(cur.year, 12, 31), end)
        yield cur.strftime("%Y-%m-%d"), chunk_end.strftime("%Y-%m-%d")
        cur = chunk_end + timedelta(days=1)

# Accumulates one dict per day across all yearly chunks.
rows = []
# Total number of request attempts per chunk before giving up.
MAX_ATTEMPTS = 4
# Keys that must be present in the response's 'daily' object before rows are built.
needed = ['time', 'temperature_2m_max', 'temperature_2m_min', 'precipitation_sum', 'rain_sum', 'sunshine_duration']
for chunk_start, chunk_end in daterange_year_chunks(START_DATE, END_DATE):
    params = {
        "latitude": LATITUDE,
        "longitude": LONGITUDE,
        "start_date": chunk_start,
        "end_date": chunk_end,
        "daily": daily_vars,
        "timezone": "auto"
    }
    # simple retry with exponential backoff: waits of 1s, 2s, 4s between attempts
    for attempt in range(MAX_ATTEMPTS):
        try:
            r = requests.get(API_URL, params=params, timeout=60)
            r.raise_for_status()
            payload = r.json()
            if 'daily' not in payload:
                raise ValueError("No 'daily' in API response: " + str(payload)[:200])
            d = payload['daily']
            # check presence of every series before indexing into it
            for key in needed:
                if key not in d:
                    raise KeyError(f"Missing key in daily: {key}")
            # Build this chunk's rows in a local list so that `rows` is only
            # extended once the whole chunk has been parsed successfully.
            chunk_rows = [
                {
                    'date': dt,
                    'temp_max_C': d['temperature_2m_max'][i],
                    'temp_min_C': d['temperature_2m_min'][i],
                    'precip_mm': d['precipitation_sum'][i],
                    'rain_mm': d['rain_sum'][i],
                    'sunshine_s': d['sunshine_duration'][i]
                }
                for i, dt in enumerate(d['time'])
            ]
        except (requests.RequestException, ValueError, KeyError) as e:
            if attempt == MAX_ATTEMPTS - 1:
                # Last attempt: do not announce or wait for a retry that will
                # never happen; fail immediately and keep the cause attached.
                print(f"Attempt {attempt+1} failed for {chunk_start} to {chunk_end}: {e}.")
                raise RuntimeError(f"Failed to fetch data for {chunk_start}..{chunk_end}") from e
            wait = 2 ** attempt
            print(f"Attempt {attempt+1} failed for {chunk_start} to {chunk_end}: {e}. Retrying in {wait}s.")
            time.sleep(wait)
        else:
            rows.extend(chunk_rows)
            time.sleep(0.2)  # be polite
            break

# Turn the fetched rows into a tidy frame in one pipeline:
#   parse dates -> order chronologically -> express sunshine in hours
#   (replacing the raw seconds column).
df = (
    pd.DataFrame(rows)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
    .assign(sunshine_h=lambda frame: frame['sunshine_s'] / 3600.0)
    .drop(columns=['sunshine_s'])
)

# Persist without the index column and report how many days were written.
df.to_csv(OUT_CSV, index=False)
print(f"Saved {len(df)} daily rows to {OUT_CSV}")


Saved 10179 daily rows to data/raw\weather_historical.csv


In [6]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Seed NumPy's global generator so the simulated records are reproducible.
np.random.seed(42)

# Where the simulated farm records are written.
RAW_DATA_PATH = 'data/raw'
os.makedirs(RAW_DATA_PATH, exist_ok=True)
OUTPUT_FILE = os.path.join(RAW_DATA_PATH, 'farm_records_simulated.csv')

# Equal number of records for each crop.
CROPS = ['Jowar', 'Paddy', 'Maize', 'Cotton']
N_RECORDS_PER_CROP = 2500
TOTAL_RECORDS = len(CROPS) * N_RECORDS_PER_CROP

# Planting dates are sampled from [SIM_START_DATE, SIM_END_DATE).
SIM_START_DATE = datetime(1998, 1, 1)
SIM_END_DATE = datetime(2024, 1, 1)
DATE_RANGE_DAYS = (SIM_END_DATE - SIM_START_DATE).days

# Crop labels in contiguous runs: all of the first crop, then the second, ...
crop_list = np.repeat(CROPS, N_RECORDS_PER_CROP)

# NOTE: the order of the random draws below determines the generated values
# under the fixed seed — keep it unchanged.
random_days = np.random.randint(0, DATE_RANGE_DAYS, size=TOTAL_RECORDS)
planting_dates = [
    SIM_START_DATE + timedelta(days=int(offset))
    for offset in random_days
]
# Growing season of 90 to 150 days (upper bound of randint is exclusive).
growth_days = np.random.randint(90, 151, size=TOTAL_RECORDS)
harvest_dates = [
    planted + timedelta(days=int(duration))
    for planted, duration in zip(planting_dates, growth_days)
]

# Input quantities, rounded to two decimals.
fertilizer = np.round(np.random.uniform(50, 200, size=TOTAL_RECORDS), 2)
pesticide_l_ha = np.round(np.random.uniform(0, 50, size=TOTAL_RECORDS), 2)  # pesticide in L/ha

# Extra categorical features.
soil_types = ['clay', 'sandy', 'loam']
varieties = ['A', 'B', 'C']
# Field ids 1..199, so the same field recurs across many records.
field_ids = np.random.randint(1, 200, size=TOTAL_RECORDS)

# Baseline yield per crop used by the yield generator.
base_yields = {'Jowar': 3000, 'Paddy': 6000, 'Maize': 5000, 'Cotton': 2500}

def generate_yield(crop, fert, pest_l, soil):
    """Simulate one yield value for a single record.

    The result is the crop's baseline from ``base_yields`` plus a linear
    fertilizer term (centered at 125), a linear pesticide term (pesticide in
    L/ha, centered at 10), a fixed soil adjustment, and Gaussian noise whose
    standard deviation is 12% of the baseline.

    Args:
        crop: Crop name; must be a key of ``base_yields`` and of the
            sensitivity tables below.
        fert: Fertilizer amount for the record.
        pest_l: Pesticide amount in L/ha.
        soil: One of 'clay', 'sandy', 'loam'.

    Returns:
        The simulated yield, floored at 0.0 and rounded to two decimals.

    Raises:
        KeyError: If ``crop`` or ``soil`` is not a known key.
    """
    fert_sensitivity = {'Jowar': 3.0, 'Paddy': 4.0, 'Maize': 4.5, 'Cotton': 2.5}
    pest_sensitivity = {'Jowar': 2.0, 'Paddy': 1.5, 'Maize': 2.2, 'Cotton': 1.0}
    soil_adjustment = {'clay': -100, 'sandy': -150, 'loam': 50}

    # Resolve every lookup before touching the random generator.
    baseline = base_yields[crop]
    fert_coef = fert_sensitivity[crop]
    pest_coef = pest_sensitivity[crop]
    soil_shift = soil_adjustment[soil]

    # One draw from the global NumPy generator per call.
    jitter = np.random.normal(0, baseline * 0.12)

    fert_term = fert_coef * (fert - 125)
    pest_term = pest_coef * (pest_l - 10)
    total = baseline + fert_term + pest_term + soil_shift + jitter
    return round(max(0.0, total), 2)

# Categorical draws come first, then the per-record yields: this order fixes
# how the seeded random stream is consumed.
soil_choice = np.random.choice(soil_types, TOTAL_RECORDS)
variety_choice = np.random.choice(varieties, TOTAL_RECORDS)

# One simulated yield per record, generated in record order.
yield_data = list(map(generate_yield, crop_list, fertilizer, pesticide_l_ha, soil_choice))

# Assemble the records, then shuffle them with a fixed seed and renumber the
# index so the crops are no longer grouped in contiguous runs.
df_simulated = (
    pd.DataFrame({
        'field_id': field_ids,
        'crop': crop_list,
        'variety': variety_choice,
        'soil_type': soil_choice,
        'planting_date': planting_dates,
        'harvest_date': harvest_dates,
        'fertilizer_kg_ha': fertilizer,
        'pesticide_l_ha': pesticide_l_ha,
        'yield_kg_ha': yield_data
    })
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the shuffled records and show a short preview.
df_simulated.to_csv(OUTPUT_FILE, index=False)
print(f"Saved {len(df_simulated)} records to {OUTPUT_FILE}")
print(df_simulated.head())


Saved 10000 records to data/raw\farm_records_simulated.csv
   field_id   crop variety soil_type planting_date harvest_date  \
0       185  Maize       B      clay    2021-11-16   2022-04-03   
1       138  Paddy       C      clay    2018-06-14   2018-10-14   
2       164  Jowar       A      loam    2008-06-25   2008-11-07   
3        46  Paddy       C      clay    2010-10-28   2011-03-10   
4       122  Paddy       B     sandy    2015-11-07   2016-03-26   

   fertilizer_kg_ha  pesticide_l_ha  yield_kg_ha  
0            192.18           43.10      4835.92  
1            176.73           43.03      5877.93  
2             60.87           32.08      3182.76  
3             94.32           11.61      5705.05  
4            150.60            0.91      4580.22  
