In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# FireRisk

In [2]:
# loading the dataset
def loading_the_data(data_dir):
    """Build a DataFrame of file paths and their class labels.

    ``data_dir`` is expected to contain one sub-folder per class. Every entry
    found inside a sub-folder becomes one row, labelled with the name of that
    sub-folder.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Directory whose immediate sub-folders are the class folders.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``filepaths`` (``data_dir/<class>/<entry>``) and
        ``labels`` (the class folder name). Rows are ordered by class folder
        name, then by entry name.

    Raises
    ------
    OSError
        If ``data_dir`` or one of its class folders cannot be listed.
    """
    filepaths = []
    labels = []

    # Sort so the row order does not depend on the file system's listing order.
    for fold in sorted(os.listdir(data_dir)):
        foldpath = os.path.join(data_dir, fold)

        # Stray files next to the class folders (e.g. Thumbs.db, .DS_Store)
        # are not classes, and os.listdir() would raise on them.
        if not os.path.isdir(foldpath):
            continue

        for file in sorted(os.listdir(foldpath)):
            filepaths.append(os.path.join(foldpath, file))
            labels.append(fold)

    # Both lists have the same length, so they can be combined directly.
    return pd.DataFrame({'filepaths': filepaths, 'labels': labels})

In [3]:
# Windows-style relative path to the FireRisk training split.
# Every backslash is escaped: a bare '\F' is an invalid escape sequence that
# newer Python versions warn about. The resulting string is unchanged.
# NOTE(review): the name shadows the builtin `dir`; it is kept because the
# next cell passes it to loading_the_data().
dir = '..\\FireRisk\\train'

In [4]:
# Index the training split: one row per file, labelled with the name of the
# class folder it was found in.
train_df = loading_the_data(dir)
train_df

Unnamed: 0,filepaths,labels
0,G:\FireRisk\train\High\27032281_4_-103.4304412...,High
1,G:\FireRisk\train\High\27038991_4_-77.77273442...,High
2,G:\FireRisk\train\High\27040201_4_-73.83896834...,High
3,G:\FireRisk\train\High\27042071_4_-122.1662712...,High
4,G:\FireRisk\train\High\27042401_4_-121.1231610...,High
...,...,...
70326,G:\FireRisk\train\Water\35471591_7_-72.4088150...,Water
70327,G:\FireRisk\train\Water\35484351_7_-82.8478475...,Water
70328,G:\FireRisk\train\Water\35487101_7_-72.1588909...,Water
70329,G:\FireRisk\train\Water\35497331_7_-90.9452064...,Water


In [5]:
# Numeric code assigned to each class-folder name.
label_mapping = {
    "Very_Low": 1,
    "Low": 2,
    "Moderate": 3,
    "High": 4,
    "Very_High": 5,
    "Non-burnable": 6,
    "Water": 7
}

# Also map every code to itself, so re-running this cell on already-encoded
# labels is a no-op instead of turning the whole column into NaN.
_code_lookup = {**label_mapping, **{code: code for code in label_mapping.values()}}
encoded_labels = train_df['labels'].map(_code_lookup)

# Series.map yields NaN for values missing from the mapping; fail loudly
# rather than carrying unlabeled rows into the merge and the exported CSV.
unmapped = train_df.loc[encoded_labels.isna(), 'labels'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected class folder name(s): {sorted(map(str, unmapped))}")

train_df['labels'] = encoded_labels.astype('int64')
train_df

Unnamed: 0,filepaths,labels
0,G:\FireRisk\train\High\27032281_4_-103.4304412...,4
1,G:\FireRisk\train\High\27038991_4_-77.77273442...,4
2,G:\FireRisk\train\High\27040201_4_-73.83896834...,4
3,G:\FireRisk\train\High\27042071_4_-122.1662712...,4
4,G:\FireRisk\train\High\27042401_4_-121.1231610...,4
...,...,...
70326,G:\FireRisk\train\Water\35471591_7_-72.4088150...,7
70327,G:\FireRisk\train\Water\35484351_7_-82.8478475...,7
70328,G:\FireRisk\train\Water\35487101_7_-72.1588909...,7
70329,G:\FireRisk\train\Water\35497331_7_-90.9452064...,7


In [6]:
def extract_coordinates(filepath):
    """Parse the point id and coordinates encoded in an image file name.

    The file name (directories removed) is split on underscores. The
    fourth-from-last field is read as the point id and the last two fields as
    the x and y coordinates; the third-from-last field is not used here.

    Parameters
    ----------
    filepath : str
        Path to a file named like ``<pointid>_<field>_<xcoord>_<ycoord>.png``.
        Both backslash and forward slash are accepted as directory separators.

    Returns
    -------
    tuple of (int, float, float)
        ``(pointid, xcoord, ycoord)``.

    Raises
    ------
    IndexError
        If the file name has fewer than four underscore-separated fields.
    ValueError
        If a field cannot be converted to a number.
    """
    # Keep only the file name. Normalising the separator first means a folder
    # such as "Very_High" can never leak into the underscore split below,
    # whichever separator the path was built with.
    filename = filepath.replace("\\", "/").rsplit("/", 1)[-1]

    # Strip the extension only when it really is a suffix. The coordinate
    # fields contain dots themselves, so a generic splitext() is not safe.
    if filename.lower().endswith(".png"):
        filename = filename[:-len(".png")]

    parts = filename.split("_")
    pointid = parts[-4]
    xcoord = parts[-2]
    ycoord = parts[-1]

    return int(pointid), float(xcoord), float(ycoord)

In [7]:
# Parse every file name once and build the three columns in a single step.
# Constructing the frame from a list of tuples avoids creating one pd.Series
# per row (slow on tens of thousands of files) and keeps pointid integral
# instead of passing it through float64.
coords = pd.DataFrame(
    train_df['filepaths'].map(extract_coordinates).tolist(),
    columns=['pointid', 'xcoord', 'ycoord'],
    index=train_df.index,  # align with train_df even if its index is not 0..n-1
)
train_df[['pointid', 'xcoord', 'ycoord']] = coords
train_df['pointid'] = train_df['pointid'].astype(int)
train_df

Unnamed: 0,filepaths,labels,pointid,xcoord,ycoord
0,G:\FireRisk\train\High\27032281_4_-103.4304412...,4,27032281,-103.430441,44.280426
1,G:\FireRisk\train\High\27038991_4_-77.77273442...,4,27038991,-77.772734,43.225073
2,G:\FireRisk\train\High\27040201_4_-73.83896834...,4,27040201,-73.838968,42.608234
3,G:\FireRisk\train\High\27042071_4_-122.1662712...,4,27042071,-122.166271,41.842358
4,G:\FireRisk\train\High\27042401_4_-121.1231610...,4,27042401,-121.123161,42.054689
...,...,...,...,...,...
70326,G:\FireRisk\train\Water\35471591_7_-72.4088150...,7,35471591,-72.408815,40.992578
70327,G:\FireRisk\train\Water\35484351_7_-82.8478475...,7,35484351,-82.847848,42.499614
70328,G:\FireRisk\train\Water\35487101_7_-72.1588909...,7,35487101,-72.158891,40.942600
70329,G:\FireRisk\train\Water\35497331_7_-90.9452064...,7,35497331,-90.945206,43.063911


In [8]:
# Summary statistics of the coordinates parsed from the file names:
# latitude first, then longitude (the leading "\n" separates the two reports).
for heading, column in (
    ("Latitude (Y) Statistics:", 'ycoord'),
    ("\nLongitude (X) Statistics:", 'xcoord'),
):
    print(heading)
    print(train_df[column].describe())

Latitude (Y) Statistics:
count    70331.000000
mean        42.920221
std          0.947672
min         40.062187
25%         42.272775
50%         43.118342
75%         43.652187
max         44.487786
Name: ycoord, dtype: float64

Longitude (X) Statistics:
count    70331.000000
mean      -100.548428
std         14.575530
min       -124.437311
25%       -113.045916
50%       -101.339618
75%        -89.484487
max        -71.826773
Name: xcoord, dtype: float64


# GDAL

## WHP

In [9]:
# whp_df = pd.read_csv("G:/gdal/whp2020_cnt_conus_wgs84_filter.csv")
# whp_df

In [10]:
# print("Latitude (Y) Statistics:")
# print(whp_df['latitude'].describe())

# print("\nLongitude (X) Statistics:")
# print(whp_df['longitude'].describe())

## Round

In [11]:
# # whp_df['longitude'] = whp_df['longitude'].round(3)
# # whp_df['latitude'] = whp_df['latitude'].round(3)

# duplicates = whp_df[whp_df.duplicated(subset=['longitude', 'latitude'], keep=False)]
# print(f"Number of duplicate rows: {duplicates.shape[0]}")
# print(duplicates)
# whp_df

In [12]:
# # train_df['xcoord'] = train_df['xcoord'].round(3)
# # train_df['ycoord'] = train_df['ycoord'].round(3)

# duplicates = train_df[train_df.duplicated(subset=['xcoord', 'ycoord'], keep=False)]
# print(f"Number of duplicate rows: {duplicates.shape[0]}")
# print(duplicates)
# train_df

## Merge

In [13]:
# merged_df = pd.merge(train_df, whp_df, left_on=['xcoord', 'ycoord'], right_on=['longitude', 'latitude'], how='left')
# merged_df

In [14]:
# non_nan = merged_df.dropna(subset=['longitude', 'latitude'])
# print(non_nan.shape[0])
# non_nan

In [15]:
# mismatches = non_nan[non_nan['labels'] != non_nan['label']]

# print(f"Number of mismatches: {len(mismatches)}")
# mismatches

## Nearest Merge

In [16]:
# import geopandas as gpd
# from shapely.geometry import Point

# # del merged_df
# # Convert both DataFrames to GeoDataFrames
# train_gdf = gpd.GeoDataFrame(train_df, geometry=gpd.points_from_xy(train_df.xcoord, train_df.ycoord))
# whp_gdf = gpd.GeoDataFrame(whp_df, geometry=gpd.points_from_xy(whp_df.longitude, whp_df.latitude))

# # Set coordinate reference systems (CRS) if needed (assuming both are in WGS84 here)
# train_gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)
# whp_gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

# # Perform spatial join within a certain distance (0.0025 ~ 270m)
# tolerance = 0.0025
# merged_df = gpd.sjoin_nearest(train_gdf, whp_gdf, max_distance=tolerance, how='left')

In [17]:
# merged_df

In [18]:
# merged_df = merged_df.drop(columns=['geometry'])
# merged_df

In [19]:
# mismatches = merged_df[merged_df['labels'] != merged_df['label']]

# print(f"Number of mismatches: {len(mismatches)}")
# mismatches

In [20]:
# merged_df.to_csv("G:/gdal/conversion_cnt.csv", index=False)

# ArcGIS

## WHP

In [21]:
# Load the sampled WHP table (presumably exported from ArcGIS, per the section
# title and file name). It is joined to train_df on 'pointid' further down.
whp_df = pd.read_csv("../RasterT_whp20202_Sampled2_TableToExcel.csv")
whp_df

Unnamed: 0,OBJECTID,pointid,grid_code,POINT_X,POINT_Y
0,1,27032281,1237,-103.430452,44.280433
1,2,27032391,155,-103.058300,44.300728
2,3,27032501,109,-102.685932,44.319982
3,4,27032611,66,-102.313358,44.338195
4,5,27032721,26,-101.940589,44.355365
...,...,...,...,...,...
100689,100690,38108071,37,-106.074263,42.359962
100690,100691,38108181,179,-105.713264,42.387566
100691,100692,38108291,1810,-105.351984,42.414167
100692,100693,38108401,69,-104.990432,42.439764


In [22]:
# Summary statistics of the WHP sample coordinates: longitude first, then latitude.
for heading, column in (
    ("\nLongitude (X) Statistics:", 'POINT_X'),
    ("Latitude (Y) Statistics:", 'POINT_Y'),
):
    print(heading)
    print(whp_df[column].describe())


Longitude (X) Statistics:
count    100694.000000
mean        -98.649892
std          15.400127
min        -124.482654
25%        -111.977307
50%         -99.177558
75%         -86.255003
max         -69.867527
Name: POINT_X, dtype: float64
Latitude (Y) Statistics:
count    100694.000000
mean         42.677116
std           0.995592
min          39.688454
25%          41.965517
50%          42.799785
75%          43.447623
max          44.487794
Name: POINT_Y, dtype: float64


## Merge

In [23]:
# Attach the WHP sample columns to every training image via the shared point id.
# how='left' keeps every training row, matched or not.
merged_df = train_df.merge(whp_df, on='pointid', how='left')

# A left merge only adds rows when a key is repeated in whp_df; catch that here
# so duplicated training samples never reach the exported CSV.
assert len(merged_df) == len(train_df), (
    f"merge changed the row count from {len(train_df)} to {len(merged_df)}: "
    "whp_df contains duplicate pointid values"
)
merged_df

Unnamed: 0,filepaths,labels,pointid,xcoord,ycoord,OBJECTID,grid_code,POINT_X,POINT_Y
0,G:\FireRisk\train\High\27032281_4_-103.4304412...,4,27032281,-103.430441,44.280426,1,1237,-103.430452,44.280433
1,G:\FireRisk\train\High\27038991_4_-77.77273442...,4,27038991,-77.772734,43.225073,62,628,-77.772737,43.225082
2,G:\FireRisk\train\High\27040201_4_-73.83896834...,4,27040201,-73.838968,42.608234,73,718,-73.838970,42.608243
3,G:\FireRisk\train\High\27042071_4_-122.1662712...,4,27042071,-122.166271,41.842358,90,805,-122.166285,41.842363
4,G:\FireRisk\train\High\27042401_4_-121.1231610...,4,27042401,-121.123161,42.054689,93,1093,-121.123175,42.054694
...,...,...,...,...,...,...,...,...,...
70326,G:\FireRisk\train\Water\35471591_7_-72.4088150...,7,35471591,-72.408815,40.992578,76722,0,-72.408816,40.992586
70327,G:\FireRisk\train\Water\35484351_7_-82.8478475...,7,35484351,-82.847848,42.499614,76838,0,-82.847852,42.499623
70328,G:\FireRisk\train\Water\35487101_7_-72.1588909...,7,35487101,-72.158891,40.942600,76863,0,-72.158892,40.942609
70329,G:\FireRisk\train\Water\35497331_7_-90.9452064...,7,35497331,-90.945206,43.063911,76956,0,-90.945213,43.063919


In [24]:
# Keep only the training rows that found a WHP match.
# 'pointid' is the join key taken from train_df, so it is never NaN after the
# left merge and cannot be used for this check. 'grid_code' exists only in
# whp_df, so it is NaN for unmatched rows (assuming whp_df itself has no
# missing grid_code values).
non_nan = merged_df.dropna(subset=['grid_code'])
print(non_nan.shape[0])
non_nan

70331


Unnamed: 0,filepaths,labels,pointid,xcoord,ycoord,OBJECTID,grid_code,POINT_X,POINT_Y
0,G:\FireRisk\train\High\27032281_4_-103.4304412...,4,27032281,-103.430441,44.280426,1,1237,-103.430452,44.280433
1,G:\FireRisk\train\High\27038991_4_-77.77273442...,4,27038991,-77.772734,43.225073,62,628,-77.772737,43.225082
2,G:\FireRisk\train\High\27040201_4_-73.83896834...,4,27040201,-73.838968,42.608234,73,718,-73.838970,42.608243
3,G:\FireRisk\train\High\27042071_4_-122.1662712...,4,27042071,-122.166271,41.842358,90,805,-122.166285,41.842363
4,G:\FireRisk\train\High\27042401_4_-121.1231610...,4,27042401,-121.123161,42.054689,93,1093,-121.123175,42.054694
...,...,...,...,...,...,...,...,...,...
70326,G:\FireRisk\train\Water\35471591_7_-72.4088150...,7,35471591,-72.408815,40.992578,76722,0,-72.408816,40.992586
70327,G:\FireRisk\train\Water\35484351_7_-82.8478475...,7,35484351,-82.847848,42.499614,76838,0,-82.847852,42.499623
70328,G:\FireRisk\train\Water\35487101_7_-72.1588909...,7,35487101,-72.158891,40.942600,76863,0,-72.158892,40.942609
70329,G:\FireRisk\train\Water\35497331_7_-90.9452064...,7,35497331,-90.945206,43.063911,76956,0,-90.945213,43.063919


In [25]:
# Write the merged table to the parent directory as CSV; index=False leaves the
# DataFrame's row index out of the file.
merged_df.to_csv("../conversion_cnt.csv", index=False)