# Gather Data From Paths and Save
Collect the attributes of all ~27,000 simulation runs and save them to a CSV file that we can then use for exploratory data analysis (EDA) and for planning the preprocessing needed for models.

## Setup

In [101]:
import s3fs
import zarr
import json
from tqdm import tqdm
import pandas as pd
import pickle
from pprint import pprint
import os, time
from datetime import datetime
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# Presumably this is where ACCESS_KEY / SECRET_KEY come from — TODO confirm.
load_dotenv()

True

In [102]:
# Connection details for the S3-compatible object store; the credentials are
# taken from the environment rather than being written into the notebook.
endpoint = 'https://wifire-data.sdsc.edu:9000'
access_key, secret_key = os.getenv("ACCESS_KEY"), os.getenv("SECRET_KEY")

# NOTE(review): 'verify': False appears to switch off TLS certificate
# verification for this endpoint — confirm that this is intentional.
fs = s3fs.S3FileSystem(
    key=access_key,
    secret=secret_key,
    client_kwargs={'endpoint_url': endpoint, 'verify': False},
    skip_instance_cache=False,
)

# `name` is appended to each run path when its attributes are read;
# `bucket` is the prefix that gets listed below.
name = 'quicfire.zarr'
bucket = 'burnpro3d/d'

# If this listing raises, the confirmation message below is never printed.
root = list(fs.ls(bucket))

simulation_paths = []
print("successfully authenticated")

successfully authenticated


## Gather Data into DataFrame

In [150]:
def read_paths(filename="paths.txt"):
    """Read simulation paths from a text file, one path per line.

    Parameters
    ----------
    filename : str, optional
        File to read. Defaults to ``"paths.txt"`` (resolved against the
        current working directory), so existing ``read_paths()`` calls
        behave as before.

    Returns
    -------
    list of str
        The lines of the file in order, without line terminators. Blank
        lines are kept, so list positions keep matching file line numbers.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    """
    with open(filename, "r") as f:
        return f.read().splitlines()

In [219]:
def _attr(key):
    """Return an extractor that looks up ``key`` in a dict (KeyError if absent)."""
    return lambda d: d[key]


# Output columns, in order, each mapped to a function that extracts the
# column's value from a run's attribute dict. The first two extractors always
# return None; get_df_chunk overwrites those two positions with the run path
# and the scrape timestamp.
KEEP_ATTRIBUTES = {
    'path': lambda d: None,
    'time_scraped': lambda d: None,
}
KEEP_ATTRIBUTES.update(
    (key, _attr(key))
    for key in (
        'canopy_moisture',
        'dz',
        'extent',
        'extent_fmt',
        'fire_grid',
        'fuel',
        'ignition',
        'output',
        'resolution',
        'resolution_units',
        'run_binary',
        'run_end',
        'run_max_mem_rss_bytes',
        'run_start',
        'seed',
        'sim_time',
        'surface_moisture',
        'threads',
        'timestep',
        'topo',
        'wind_direction',
        'wind_speed',
    )
)

def get_df_chunk(stop):
    """Fetch run attributes for the next slice of ``simulation_paths``.

    The slice begins at the integer checkpoint stored in ``vars.txt`` (0 when
    that file does not exist yet) and ends just before ``stop``. For every
    path in the slice the run's ``.zattrs`` JSON is read through ``fs`` and
    the fields listed in ``KEEP_ATTRIBUTES`` are extracted; a field missing
    from the JSON becomes ``None``. Paths whose attributes cannot be read are
    reported and skipped. Once the loop has finished, ``stop`` is written
    back to ``vars.txt`` so the next call resumes where this one ended.

    Parameters
    ----------
    stop : int
        Exclusive end index into ``simulation_paths``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully read path, with the keys of
        ``KEEP_ATTRIBUTES`` as columns and a 0..n-1 index. Columns are kept
        as ``object`` dtype so values are stored exactly as parsed (integers
        are not turned into floats when a column also contains ``None``).
    """
    try:
        with open("vars.txt", "r") as file:
            start = int(file.read())
    except FileNotFoundError:
        # No checkpoint has been written yet: begin with the first path.
        start = 0
    print("start " , start)

    columns = list(KEEP_ATTRIBUTES)
    # One timestamp for the whole chunk, so every row of a call shares it.
    scraped_at = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")

    rows = []
    for p in tqdm(simulation_paths[start:stop]):
        try:
            with fs.open(p + '/' + name + '/.zattrs') as f:
                data = json.load(f)
        except FileNotFoundError:
            print("FileNotFound error on path {",p,"}")
            continue
        except Exception as err:
            # Stay best-effort for other per-path failures (network, invalid
            # JSON, ...) but report what actually happened. KeyboardInterrupt
            # is not an Exception subclass, so the loop can be interrupted.
            print("Skipping path {",p,"}:", type(err).__name__, err)
            continue

        row = {}
        for key, extract in KEEP_ATTRIBUTES.items():
            try:
                row[key] = extract(data)
            except KeyError:
                row[key] = None
        # These two columns do not come from the JSON attributes.
        row['path'] = p
        row['time_scraped'] = scraped_at
        rows.append(row)

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(rows, columns=columns, dtype=object)

    with open("vars.txt", "w") as f:
        f.write(str(stop))
    print("\nRead from ", start, " to ", stop,"\n")
    return df

# Lift the column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

In [220]:
# Load every simulation path from paths.txt; the count is shown as the cell output.
simulation_paths = read_paths()
len(simulation_paths)

26997

In [221]:
# Number of paths requested per download chunk; it is added to the checkpoint
# read from vars.txt to obtain the stop index passed to get_df_chunk.
BATCH_SIZE = 200

In [222]:

# Read the checkpoint so the stop index for this chunk can be derived from it.
with open("vars.txt", "r") as file:
    start = int(file.read())

tic = time.perf_counter()
path_count = start + BATCH_SIZE  # exclusive stop index for this chunk
df = get_df_chunk(path_count)
toc = time.perf_counter()

time_elapsed = f"{toc - tic:0.3f}"
print(f"Downloaded the dataframe chunk in {time_elapsed} seconds")

# Jupyter only renders the final expression of a cell, so the preview has to
# come last; placed mid-cell it was evaluated and discarded.
df.head()


start  200


100%|██████████████████████████████████████| 100/100 [00:39<00:00,  2.53it/s]


Read from  200  to  300 

Downloaded the dataframe chunk in 39.564 seconds





In [223]:
# Number of rows fetched in this chunk.
df.shape[0]

100

In [224]:
# Keep only rows that have all three of these fields, then restrict to
# single-threaded runs of one specific binary.
print("raw length: ",len(df))
required_fields = ["run_end", "run_start", "sim_time"]
df = df.dropna(subset=required_fields)
print("length after dropna: ",len(df))
uses_target_binary = df['run_binary'] == "/quicfire/quicfire-v5.2-ucsd-rel"
is_single_threaded = df['threads'] == 1
df = df[uses_target_binary & is_single_threaded]
print("length after filter version and single threading: ",len(df))
df.head()

raw length:  100
length after dropna:  87
length after filter version and single threading:  10


Unnamed: 0,path,time_scraped,canopy_moisture,dz,extent,extent_fmt,fire_grid,fuel,ignition,output,resolution,resolution_units,run_binary,run_end,run_max_mem_rss_bytes,run_start,seed,sim_time,surface_moisture,threads,timestep,topo,wind_direction,wind_speed
8,burnpro3d/d/02/5a/run_025a6ef4-0adc-4301-adf6-...,2023-07-27T17:07:09.509209,0.9,,"[-2097480.0, 2037504.0, -2096674.0, 2036700.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-19T22:30:33.046107-07:00,4232916,2023-06-19T15:55:24.866765-07:00,-1,8543,0.07,1,600,"{'topo_calcs': True, 'total_startup_iters': 500}",195.0,6.705
29,burnpro3d/d/02/90/run_0290190a-af45-4e51-bbde-...,2023-07-27T17:07:09.509209,0.3,,"[-2126863, 2649938, -2126015, 2648970]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2, 2, 1]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-06T11:09:08.159395-07:00,2659828,2023-06-06T10:25:22.577268-07:00,-1,4225,0.1,1,600,,180.0,20.0
32,burnpro3d/d/02/92/run_02928b61-8e6a-49cd-8008-...,2023-07-27T17:07:09.509209,0.3,,"[-2126863, 2649938, -2126015, 2648970]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2, 2, 1]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-05T15:38:16.067011-07:00,2658900,2023-06-05T15:09:28.440526-07:00,-1,4225,0.3,1,600,,120.0,5.0
34,burnpro3d/d/02/98/run_02988364-db45-43f4-88d1-...,2023-07-27T17:07:09.509209,1.0,,"[123782.0, 1223754.0, 124586.0, 1222950.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-07-21T14:13:24.126235-07:00,2461544,2023-07-21T13:15:57.836418-07:00,-1,5918,0.1,1,600,"{'topo_calcs': False, 'total_startup_iters': 500}",60.0,2.235
39,burnpro3d/d/02/a0/run_02a00526-849b-4dd3-8d26-...,2023-07-27T17:07:09.509209,0.9,,"[-1959092.0, 1310426.0, -1958552.0, 1309880.0]","[[x1, y1], [x2, y2]]",,,,"{'emissions': False, 'energy_atmos': False, 'f...","[2.0, 2.0, 1.0]",meters,/quicfire/quicfire-v5.2-ucsd-rel,2023-06-23T19:14:29.301921-07:00,452076,2023-06-23T18:36:59.287236-07:00,-1,19838,0.2,1,600,"{'topo_calcs': False, 'total_startup_iters': 500}",60.0,6.705


In [225]:
# Append this chunk to the accumulated CSV. The header row is written only
# when the file is missing or empty, so a fresh file gets column names and
# later appends do not repeat them.
# NOTE: re-running this cell appends the same rows again.
csv_path = "simulation_runs.csv"
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
df.to_csv(csv_path, mode="a", header=write_header)