In [49]:
import pandas as pd
from IPython.display import HTML, display
from IPython.display import Markdown as md
# Render the contents of welcome.md (looked up in the current working directory)
# as formatted Markdown in this cell's output.
# The file is read explicitly as UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open('welcome.md', 'r', encoding='utf-8') as f:
    s = f.read()
display(md(s))

## Welcome to the Open-FF Notebook Tools
This file is one of the tools developed to interact with Open-FF data. While it is a "coding notebook," it requires NO programming skills.  Prompts will guide you throughout the notebook, indicating what you need to do in each section.

These notebooks are designed to accomplish well-defined tasks.  More general exploration is probably easier with the [Data Browser](https://storage.googleapis.com/open-ff-browser/Open-FF_Catalog.html). On the other hand, more custom analysis will probably require downloading data sets targeted to your task.  (If you need assistance with the latter, feel free to contact us.)

This notebook is designed to be used in the [Google Colaboratory](https://colab.google/) service, a free platform that lets users run notebooks without needing to set up complicated "environments."  We will assume you are using this notebook on Colab and instructions are provided below.  


# Open-FF data set customizer
Use this notebook to convert Open-FF's full dataframe (that is in "parquet" format) into a different format and, if desired, filter the data to a smaller subset.

In [None]:
import pandas as pd
import requests 
# Location of the full Open-FF data frame (parquet format).
df_url = "https://storage.googleapis.com/open-ff-common/repos/current_repo/full_df.parquet"

# Request only the headers first so the download size can be reported before
# the (potentially slow) fetch. The timeout keeps the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() turns an HTTP
# error response into an exception here rather than a confusing failure later.
response = requests.head(df_url, allow_redirects=True, timeout=30)
response.raise_for_status()

# Content-Length is an optional header; only report the size when it is present.
remote_size = response.headers.get('Content-Length')
if remote_size is not None:
    print(f"Full size on remote disk: {int(remote_size):,} bytes")
else:
    print("Full size on remote disk: unknown (not reported by the server)")

print('Fetching it now, please be patient...')
df = pd.read_parquet(df_url)
print(f'The full Open-FF data frame shape (rows,cols): {df.shape}')

In [None]:
# make output location
# this code may overwrite files in the "sandbox" directory.  Make sure you want to do that
import os
tmp_dir = 'sandbox'
try:
    os.mkdir(tmp_dir)
except FileExistsError:
    # Only the "directory is already there" case is expected and harmless.
    # Any other failure (e.g. no write permission) is left to propagate so it
    # is not misreported as "already exists".
    print(f'{tmp_dir} already exists')

# Filter by state
Select the state(s) that you want in the output file.  The selection box contains only those states in the Open-FF data.  Use `shift`-click or `ctrl`-click to select more than one state.

In [None]:
# Imported in this cell because it is the first one that uses ipywidgets;
# without it a fresh "Restart & Run All" fails with NameError on `widgets`.
import ipywidgets as widgets

# Option list: every distinct state name in the data, sorted, with an
# 'All states' entry placed first to serve as the default selection.
# dropna() guards the sort: a missing state name (NaN), if any is present,
# cannot be ordered against text values.
st_lst = df.bgStateName.dropna().unique().tolist()
st_lst.sort()
st_lst.insert(0,'All states')
states = widgets.SelectMultiple(
    options=st_lst,
    value=['All states'],
    #rows=10,
    description='Select State(s)',
    disabled=False
)
# Leaving the widget as the last expression renders it in the cell output.
states

In [None]:
# Echo the current selection as a plain list so it can be checked before filtering.
[*states.value]

In [None]:
# Apply the state selection to the data frame.
# NOTE: `df` is overwritten, so the filter is cumulative across re-runs of this
# cell; re-run the loading cell to start again from the full data.
selected_states = list(states.value)
if not selected_states:
    # Nothing highlighted in the selection box: indexing the empty selection
    # would raise IndexError, so leave the data unfiltered instead.
    print('No state selected; keeping all states.')
elif 'All states' not in selected_states:
    df = df[df.bgStateName.isin(selected_states)]
print(f"The current filtered data frame's shape (rows,cols): {df.shape}")

# Select the output format

The current formats available are:
- "parquet" - a compressed, structured format for large files. (recommended if you have the ability to use it)
- "CSV" - a traditional, text-based format, standard input for spreadsheets. Can require 10x the storage and processing time of "parquet." This format does not keep Open-FF formatting, so you may need to specify, for example, which columns are text and which are numeric (e.g. APINumber should be text because of leading zeros).


In [None]:
import ipywidgets as widgets

# Toggle buttons for choosing the type of file that the save step writes.
format_type = widgets.ToggleButtons(
    description='Output type:',
    options=['parquet', 'CSV'],
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)
# Leaving the widget as the last expression renders it in the cell output.
format_type

In [None]:
# save the output file
# Writes the current data frame into tmp_dir in the format chosen with the
# "Output type" buttons, then reports the file's location and size.

if format_type.value == 'CSV':
    # make the CSV file
    outfn = os.path.join(tmp_dir, "custom.csv")
    df.to_csv(outfn)
elif format_type.value == 'parquet':
    outfn = os.path.join(tmp_dir, "custom.parquet")
    df.to_parquet(outfn)
else:
    # Fail loudly on an unexpected value: otherwise `outfn` would be undefined,
    # or left over from an earlier run, and the report below would describe
    # the wrong file.
    raise ValueError(f'Unsupported output format: {format_type.value!r}')

file_size = os.path.getsize(outfn)
# One summary line; the size was previously printed twice.
print(f'Output saved at: {outfn}, size: {file_size:,} bytes')
