In [None]:
fire_df = pd.read_csv("nepal_fire_data_cleaned.csv", low_memory=False)
fire_df['acq_date'] = pd.to_datetime(fire_df['acq_date']0, format='%Y-%m-%d')

In [14]:
import pandas as pd

# Load all columns as strings temporarily to inspect
fire_df_raw = pd.read_csv("nepal_fire_data_cleaned.csv", dtype=str)

# Check column names
print(fire_df_raw.columns)

# Let's peek at column 10 (index 10)
fire_df_raw.iloc[:, 10].value_counts(dropna=False).head(20)


Index(['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date',
       'acq_time', 'satellite', 'instrument', 'confidence', 'version',
       'bright_t31', 'frp', 'daynight', 'type'],
      dtype='object')


version
2         358613
2.0NRT     19694
Name: count, dtype: int64

In [19]:
fire_df = fire_df_raw.drop(columns=['version'])


In [20]:
# If dropped:
fire_df = fire_df_raw.drop(columns=['version'])

# Or if cleaned:
fire_df = fire_df_raw.copy()

# Sanitize column names (optional good practice)
fire_df.columns = fire_df.columns.str.strip().str.lower().str.replace(' ', '_')

# Check
fire_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378307 entries, 0 to 378306
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   latitude    378307 non-null  object 
 1   longitude   378307 non-null  object 
 2   brightness  378307 non-null  object 
 3   scan        378307 non-null  object 
 4   track       378307 non-null  object 
 5   acq_date    378307 non-null  object 
 6   acq_time    378307 non-null  object 
 7   satellite   378307 non-null  object 
 8   instrument  378307 non-null  object 
 9   confidence  378307 non-null  object 
 10  version     378307 non-null  object 
 11  bright_t31  378307 non-null  object 
 12  frp         378307 non-null  float64
 13  daynight    378307 non-null  object 
 14  type        358613 non-null  object 
dtypes: float64(1), object(14)
memory usage: 43.3+ MB


In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load your filtered Nepal fire dataset
fire_df = pd.read_csv("nepal_fire_data_cleaned.csv")

# Parse date if not already
fire_df['acq_date'] = pd.to_datetime(fire_df['acq_date'])

# Define Nepal bounding box
min_lat, max_lat = 26.3, 30.5
min_lon, max_lon = 80.0, 89.5

# Step 1: Create random no-fire samples
def generate_no_fire_samples(n_samples, existing_fire_df):
    np.random.seed(42)

    # Sample dates based on real fire date distribution
    sampled_dates = np.random.choice(existing_fire_df['acq_date'], size=n_samples)

    # Generate random lat/lon in Nepal bounds
    random_lats = np.random.uniform(min_lat, max_lat, n_samples)
    random_lons = np.random.uniform(min_lon, max_lon, n_samples)

    # Assemble DataFrame
    no_fire_df = pd.DataFrame({
        'latitude': random_lats,
        'longitude': random_lons,
        'acq_date': sampled_dates
    })

    # Remove any accidental overlap with real fire points
    no_fire_df = no_fire_df.merge(
        existing_fire_df[['latitude', 'longitude', 'acq_date']],
        on=['latitude', 'longitude', 'acq_date'],
        how='left',
        indicator=True
    ).query('_merge == "left_only"').drop(columns=['_merge'])

    # Add label
    no_fire_df['fire_occurred'] = 0

    return no_fire_df

# Step 2: Label fire data as 1 and reduce columns
fire_df_reduced = fire_df[['latitude', 'longitude', 'acq_date']].copy()
fire_df_reduced['fire_occurred'] = 1

# Step 3: Generate same number of no-fire samples
no_fire_df = generate_no_fire_samples(len(fire_df_reduced), fire_df_reduced)

# Step 4: Combine both datasets
full_df = pd.concat([fire_df_reduced, no_fire_df], ignore_index=True).sample(frac=1).reset_index(drop=True)

# Save
full_df.to_csv("fire_vs_no_fire_locations.csv", index=False)

print(f"Combined dataset saved. Total samples: {len(full_df)}")
print(full_df.head())


  fire_df = pd.read_csv("nepal_fire_data_cleaned.csv")


Combined dataset saved. Total samples: 756614
    latitude  longitude   acq_date  fire_occurred
0  29.266180  80.475300 2018-05-28              1
1  28.237445  87.186904 2015-03-28              0
2  29.020737  86.023081 2024-05-07              0
3  29.712733  84.082472 2017-04-12              0
4  28.382866  81.633441 2014-03-30              0


### Testing the dataset first

In [23]:
#load and slit
import pandas as pd

# Load your raw fire+no-fire merged file
df = pd.read_csv("fire_vs_no_fire_locations.csv")

# Split by label
df_fire = df[df['fire_occurred'] == 1]
df_no_fire = df[df['fire_occurred'] == 0]

# Sample smaller balanced subset (adjust numbers if needed)
df_fire_sample = df_fire.sample(n=2500, random_state=42)
df_no_fire_sample = df_no_fire.sample(n=2500, random_state=42)

# Combine and shuffle
df_small = pd.concat([df_fire_sample, df_no_fire_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

# Save to use for enrichment
df_small.to_csv("fire_vs_no_fire_sample.csv", index=False)

print(f" Reduced sample dataset created. Shape: {df_small.shape}")


 Reduced sample dataset created. Shape: (5000, 4)


In [29]:
import pandas as pd
import requests
from tqdm import tqdm
import time
import math
from datetime import datetime

# Load dataset
df = pd.read_csv("fire_vs_no_fire_locations.csv")

# Convert date column and filter invalid dates
df['acq_date'] = pd.to_datetime(df['acq_date'], errors='coerce')
df = df.dropna(subset=['acq_date'])

# Remove rows with future dates (NASA POWER does not support them)
today = pd.to_datetime(datetime.utcnow().date())
df = df[df['acq_date'] < today].copy()

# Initialize new columns for enriched features
df['temperature'] = None
df['humidity'] = None
df['wind_speed'] = None
df['precipitation'] = None
df['elevation'] = None
df['vpd'] = None

# Function to fetch weather data from NASA POWER
def fetch_weather(lat, lon, date):
    url = "https://power.larc.nasa.gov/api/temporal/daily/point"
    params = {
        "parameters": "T2M,RH2M,WS2M,PRECTOTCORR",
        "community": "AG",
        "longitude": lon,
        "latitude": lat,
        "start": date.strftime('%Y%m%d'),
        "end": date.strftime('%Y%m%d'),
        "format": "JSON"
    }

    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        data = r.json()
        props = data["properties"]["parameter"]
        key = date.strftime('%Y%m%d')
        return {
            "temperature": props["T2M"].get(key),
            "humidity": props["RH2M"].get(key),
            "wind_speed": props["WS2M"].get(key),
            "precipitation": props["PRECTOTCORR"].get(key)
        }
    except Exception as e:
        print(f"Weather error for {lat},{lon},{date}: {e}")
        return None

# Function to fetch elevation from OpenTopoData
def fetch_elevation(lat, lon):
    url = f"https://api.opentopodata.org/v1/srtm90m?locations={lat},{lon}"
    try:
        r = requests.get(url)
        r.raise_for_status()
        return r.json()["results"][0]["elevation"]
    except Exception as e:
        print(f"Elevation error for {lat},{lon}: {e}")
        return None

# Function to compute VPD (Vapor Pressure Deficit)
def compute_vpd(temp, rh):
    try:
        es = 0.6108 * math.exp((17.27 * temp) / (temp + 237.3))  # Saturation vapor pressure
        ea = es * (rh / 100.0)                                   # Actual vapor pressure
        return round(es - ea, 3)
    except:
        return None

# Enrichment loop
for i, row in tqdm(df.iterrows(), total=len(df)):
    lat, lon, date = row['latitude'], row['longitude'], row['acq_date']

    weather = fetch_weather(lat, lon, date)
    if weather:
        df.at[i, 'temperature'] = weather['temperature']
        df.at[i, 'humidity'] = weather['humidity']
        df.at[i, 'wind_speed'] = weather['wind_speed']
        df.at[i, 'precipitation'] = weather['precipitation']
        df.at[i, 'vpd'] = compute_vpd(weather['temperature'], weather['humidity'])

    elevation = fetch_elevation(lat, lon)
    if elevation is not None:
        df.at[i, 'elevation'] = elevation

    time.sleep(0.6)  # Respect API rate limits

# Save enriched dataset
df.to_csv("fire_dataset_enriched.csv", index=False)
print("Enrichment complete. Saved as fire_dataset_enriched.csv.")


  today = pd.to_datetime(datetime.utcnow().date())
  0%|                                                                           | 2/756614 [00:08<918:35:06,  4.37s/it]


KeyboardInterrupt: 