# Gather Data From Paths and Save
Collect the metadata (`.zattrs`) of each of the roughly 27,000 simulations and save it to a CSV file, so that we can then run exploratory data analysis (EDA) on it and plan the preprocessing needed for modelling.

## Setup

In [101]:
import s3fs
import zarr
import json
from tqdm import tqdm
import pandas as pd
import pickle
from pprint import pprint
import os, time
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()

True

In [102]:
# Object-store connection settings; the credentials come from the environment
# (populated by load_dotenv() in the imports cell).
endpoint = 'https://wifire-data.sdsc.edu:9000'
access_key = os.getenv("ACCESS_KEY")
secret_key = os.getenv("SECRET_KEY")

# NOTE(review): 'verify': False is forwarded to the underlying S3 client and
# presumably disables TLS certificate verification -- confirm this is intended.
fs = s3fs.S3FileSystem(
    key=access_key,
    secret=secret_key,
    client_kwargs=dict(endpoint_url=endpoint, verify=False),
    skip_instance_cache=False,
)

# Name of the zarr store inside every run directory, and the bucket prefix
# that holds the runs.
name = 'quicfire.zarr'
bucket = 'burnpro3d/d'

# Listing the bucket is the first call that actually talks to the endpoint.
root = list(fs.ls(bucket))

# Filled in later from paths.txt.
simulation_paths = []

# Only reached when the listing above returned without raising.
print("successfully authenticated")

successfully authenticated


## Gather Data into DataFrame

In [150]:
def read_paths(filename="paths.txt"):
    """Read the list of simulation paths, one path per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or an accidental empty line in the file does not turn
    into a bogus path.

    Args:
        filename: Text file to read; defaults to ``"paths.txt"`` in the
            current working directory.

    Returns:
        list[str]: The non-empty paths, in file order.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    with open(filename, "r") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]

In [226]:
# Maps each output column to a callable that pulls its value out of one
# simulation's parsed metadata dict. Dict order is the column order.
KEEP_ATTRIBUTES = {
    # These two are not read from the metadata: the extractors are
    # placeholders that always return None.
    'path': lambda d: None,
    'time_scraped': lambda d: None,
    # Every other column is a plain lookup of the same-named key; a missing
    # key raises KeyError. `attr=attr` binds the key per lambda.
    **{
        attr: (lambda d, attr=attr: d[attr])
        for attr in (
            'canopy_moisture',
            'dz',
            'extent',
            'extent_fmt',
            'fire_grid',
            'fuel',
            'ignition',
            'output',
            'resolution',
            'resolution_units',
            'run_binary',
            'run_end',
            'run_max_mem_rss_bytes',
            'run_start',
            'seed',
            'sim_time',
            'surface_moisture',
            'threads',
            'timestep',
            'topo',
            'wind_direction',
            'wind_speed',
        )
    },
}

def get_df_chunk(stop):
    """Scrape the ``.zattrs`` metadata for the next slice of simulations.

    The slice starts at the integer checkpoint stored in ``vars.txt`` and
    runs up to ``stop`` (exclusive). For every path in the slice the
    ``.zattrs`` JSON of its zarr store is read through the global ``fs`` and
    reduced to the columns named in ``KEEP_ATTRIBUTES``. Paths whose metadata
    cannot be read are reported and skipped. Afterwards the checkpoint in
    ``vars.txt`` is moved to the end of the slice.

    Args:
        stop: Exclusive end index into the global ``simulation_paths`` list.
            It is clamped to the number of available paths and never moves
            the checkpoint backwards.

    Returns:
        pandas.DataFrame: One row per successfully read path, columns in
        ``KEEP_ATTRIBUTES`` order. ``path`` and ``time_scraped`` are filled
        in here; attributes missing from the metadata are ``None``. Columns
        are kept as ``object`` dtype so values are stored exactly as parsed.

    Raises:
        OSError, ValueError: If ``vars.txt`` is missing or does not contain
            an integer.
    """
    with open("vars.txt", "r") as file:
        start = int(file.read())
        print("start " , start)

    # Never checkpoint past the paths that actually exist, nor behind the
    # current checkpoint: an oversized `stop` or a not-yet-loaded path list
    # must not make later runs skip paths that were never read.
    end = max(start, min(stop, len(simulation_paths)))

    # One timestamp for the whole batch (local name chosen so it does not
    # shadow the `time` module).
    scraped_at = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")

    rows = []
    for p in tqdm(simulation_paths[start:end]):
        try:
            with fs.open(p + '/' + name + '/.zattrs') as f:
                data = json.load(f)
        except FileNotFoundError:
            print("FileNotFound error on path {",p,"}")
            continue
        except Exception as exc:
            # Best-effort scrape: report what actually went wrong and move
            # on. KeyboardInterrupt / SystemExit are deliberately not caught
            # so a long batch can still be interrupted.
            print("Skipping path {", p, "} after", type(exc).__name__, ":", exc)
            continue

        row = {}
        for k, expr in KEEP_ATTRIBUTES.items():
            try:
                row[k] = expr(data)
            except KeyError:
                # Attribute absent from this run's metadata.
                row[k] = None
        row['path'] = p
        row['time_scraped'] = scraped_at
        rows.append(row)

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(rows, columns=list(KEEP_ATTRIBUTES.keys()), dtype=object)

    with open("vars.txt", "w") as f:
        f.write(str(end))
        print("\nRead from ", start, " to ", end,"\n")
    return df

# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [227]:
# Load the simulation paths listed in paths.txt; the last expression shows
# how many were read.
simulation_paths = read_paths()
len(simulation_paths)

26997

In [228]:
# Number of simulation paths to process per run of the download cell below.
BATCH_SIZE = 200

In [229]:

# Resume from the checkpoint stored in vars.txt and fetch the next batch.
with open("vars.txt", "r") as file:
    start = int(file.read())

tic = time.perf_counter()
path_count = start + BATCH_SIZE
df = get_df_chunk(path_count)
toc = time.perf_counter()

time_elapsed = f"{toc - tic:0.3f}"
print(f"Downloaded the dataframe chunk in {time_elapsed} seconds")

# Must be the last expression of the cell for the preview to be rendered.
df.head()


start  300


 76%|████████████████████████████▉         | 152/200 [01:10<00:36,  1.32it/s]

FileNotFound error on path { burnpro3d/d/05/33/run_0533bc87-5a75-4cdb-87e8-f96f8296ca9a }


100%|██████████████████████████████████████| 200/200 [01:27<00:00,  2.28it/s]


Read from  300  to  500 

Downloaded the dataframe chunk in 87.862 seconds





In [230]:
# Rows actually collected for this batch (paths that failed to load are skipped).
len(df)

199

In [231]:
print("raw length: ", len(df))

# Keep only runs that report their start, end and simulated time.
required_columns = ["run_end", "run_start", "sim_time"]
df = df.dropna(subset=required_columns)
print("length after dropna: ", len(df))

# Keep only single-threaded runs of the quicfire v5.2 release binary.
is_target_binary = df['run_binary'] == "/quicfire/quicfire-v5.2-ucsd-rel"
is_single_thread = df['threads'] == 1
df = df.loc[is_target_binary & is_single_thread]
print("length after filter version and single threading: ", len(df))

df.head()

raw length:  199
length after dropna:  160
length after filter version and single threading:  16


Unnamed: 0,path,time_scraped,canopy_moisture,dz,extent,extent_fmt,fire_grid,fuel,ignition,output,resolution,resolution_units,run_binary,run_end,run_max_mem_rss_bytes,run_start,seed,sim_time,surface_moisture,threads,timestep,topo,wind_direction,wind_speed
6,burnpro3d/d/03/61/run_0361425b-d728-4092-a153-...,2023-07-27T17:10:48.294526,0.4,,"[-2126863, 2649938, -2126015, 2648970]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2, 2, 1]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-06T11:34:45.390760-07:00,2659060,2023-06-06T10:24:59.120364-07:00,-1,4225,0.1,1,600,,90.0,15.0
8,burnpro3d/d/03/6a/run_036ade61-cbfa-46bc-b0ba-...,2023-07-27T17:10:48.294526,0.8,,"[-2022346.0, 1581420.0, -2020914.0, 1580036.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-14T15:18:48.475738-07:00,2470240,2023-06-14T12:24:11.387122-07:00,-1,11359,0.1,1,600,"{'topo_calcs': False, 'total_startup_iters': 500}",150.0,3.129
18,burnpro3d/d/03/83/run_03832093-083a-4e32-bd50-...,2023-07-27T17:10:48.294526,0.9,,"[-1959092.0, 1310426.0, -1958552.0, 1309880.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-23T14:36:17.260056-07:00,452140,2023-06-23T14:23:46.833300-07:00,-1,5223,0.15,1,600,"{'topo_calcs': False, 'total_startup_iters': 500}",90.0,4.47
31,burnpro3d/d/03/a6/run_03a6d907-755a-4fcf-94e2-...,2023-07-27T17:10:48.294526,0.9,,"[-2097480.0, 2037504.0, -2096674.0, 2036700.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-19T20:56:54.097372-07:00,4231776,2023-06-19T16:06:07.456839-07:00,-1,8543,0.11,1,600,"{'topo_calcs': True, 'total_startup_iters': 500}",225.0,3.129
41,burnpro3d/d/03/c9/run_03c9e618-341e-447b-be36-...,2023-07-27T17:10:48.294526,0.9,,"[-2022346.0, 1581420.0, -2020914.0, 1580036.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-23T17:19:10.728883-07:00,2428228,2023-06-23T14:18:23.649738-07:00,-1,13493,0.1,1,600,"{'topo_calcs': False, 'total_startup_iters': 500}",60.0,4.47


In [232]:
# Append this batch to the cumulative CSV. No header row is written, and the
# DataFrame index is written as the first column.
# NOTE(review): re-running this cell appends the same batch again, and the
# written index restarts for every batch -- confirm the downstream reader
# supplies column names and ignores or de-duplicates the first column.
df.to_csv("simulation_runs.csv", mode="a",header=False)

# Now read the data and do some stuff!