In [19]:
import pandas as pd
import requests

In [2]:
backups = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_backups.csv")
datasets = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_datasets.csv")

In [3]:
datasets.columns = datasets.columns.str.lower()
datasets = datasets.fillna('')
datasets.head()

Unnamed: 0,dataset,notes,dataset_id,url,websites,organization,agency,last_modified
0,Billion-Dollar Weather and Climate Disasters,,1,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10
1,American Communities Survey (ACS),,3,https://www.census.gov/programs-surveys/acs,census.gov,Census Bureau,Department of Commerce,2025-03-03
2,BLS Downloads,,6,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-10
3,CDC FTP,,7,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-10
4,US Census Bureau FTP,,8,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-11


In [4]:
agencies = datasets.agency.value_counts().keys()
categories = categories = ['arts-culture-history','health-human-services',
'budget-finance','parks-recreation','economy','planning-zoning',
'education','public-safety','elections-politics','real-estate-land-records',
'environment','transportation','food','uncategorized']


In [5]:
agency_to_category = {
    'Department of Health and Human Services': 'Health / Human Services',
    'Department of Commerce': 'Economy',
    'Department of Housing and Urban Development': 'Real Estate / Land Records',
    'Department of Veterans Affairs': 'Health / Human Services',
    'National Endowment for the Humanities': 'Arts / Culture / History',
    'AmeriCorps': 'Public Safety',
    'Department of Education': 'Education',
    'Federal Mediation and Conciliation Service': 'Economy',
    'Department of Homeland Security': 'Public Safety',
    'Department of Energy': 'Environment',
    'National Labor Relations Board': 'Economy',
    'Environmental Protection Agency': 'Environment',
    'Consumer Financial Protection Bureau': 'Budget / Finance',
    'Federal Housing Finance Agency': 'Real Estate / Land Records',
    'Department of the Treasury': 'Budget / Finance',
    'Institute of Museum and Library Services': 'Arts / Culture / History',
    'Department of the Interior': 'Parks / Recreation',
    'General Services Administration': 'Economy',
    'Department of Labor': 'Economy',
    'U.S. Agency for International Development': 'Health / Human Services',
    'Department of Transportation': 'Transportation',
    'National Aeronautics and Space Administration': 'Environment',
    '': 'Uncategorized',
    'Department of Justice': 'Public Safety',
    'Department of the Interior, National Parks Service': 'Parks / Recreation',
    'Department of State': 'Elections / Politics',
    'National Science Foundation': 'Education',
    'Department of Health and Human Services, Department of Commerce': 'Health / Human Services',
    'Consumer Financial Protection Bureau, Federal Housing Finance Agency': 'Budget / Finance',
    'U.S. Department of Agriculture': 'Food',
    'Office of Management and Budget': 'Budget / Finance'
}

In [6]:
backups.columns = backups.columns.str.lower()
backups = backups.fillna('')
backups.head()

Unnamed: 0,dataset,dataset_id,status,url,source_website,organization,agency,download_date,size,maintainer,download_location,file_type,notes,metadata_available,metadata_url
0,Billion-Dollar Weather and Climate Disasters,1,Finished,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10,0.15,HD,https://dataverse.harvard.edu/dataset.xhtml?pe...,ZIP,,yes,https://dataverse.harvard.edu/dataset.xhtml?pe...
1,BLS Downloads,6,Finished,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-01,47.0,DRP,,,,,
2,CDC FTP,7,Finished,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-01,213.0,DRP,,,,,
3,US Census Bureau FTP,8,Finished,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-01,180.0,DRP,,,"Partial download, server is back online but co...",,
4,National Hurricane Center (NHC),9,Finished,https://www.nhc.noaa.gov/archive,nhc.noaa.gov,NOAA/National Hurricane Center,Department of Commerce,2025-02-06,61.0,DRP,,,,,


In [7]:
import re
def slugify(string):
    string = clean_text(string)
    # Remove special characters
    string = re.sub(r'[^\w\s-]', '', string)
    # Replace spaces with hyphens
    string = re.sub(r'\s+', '-', string)
    # Convert to lowercase
    string = string.lower()
    return string

def clean_text(string):
    # Remove URL prefixes like http:// or https://
    # string = re.sub(r'http[s]?://', '', string)
    # Remove escape strings like \n
    string = string.replace('\n', '').replace('\r', '').replace('\t', '')
    # Remove leading '-'
    string = re.sub(r'^-', '', string)
    # Replace ':' with '-'
    string = string.replace(':', '')
    return string

def get_dataset_categories(agency):
    return agency_to_category[agency]

In [8]:
def get_metadata_availability(dataset_id):
    """
    This function checks the metadata availability for dataset_id 432 in the backups dataframe.
    It returns "Yes" if metadata is available, "Under Review" if it needs review, and "No" otherwise.
    """
    md_avl = backups[backups.dataset_id == dataset_id].metadata_available.values
    if "yes" in md_avl:

        return "Yes",backups[backups.dataset_id == dataset_id].metadata_url.values[0]
    elif "needs review" in md_avl:
        return "Under Review",""
    else:
        return "No",""

def create_dataset_md(row):
    if row['organization'] == '':
      row['organization'] = 'Unknown'
    ## Defining the schema, filename and path
    schema = 'data_rescue_project'
    dataset_filename = slugify(row['dataset'])
    dataset_path = "_datasets"
    org_filename = slugify(row['organization'])
    org_path = "_organizations"

    ## Get backups for each dataset
    data_backups = backups[backups.dataset == row['dataset']]
    metadata_available, metadata_url = get_metadata_availability(row['dataset_id'])
    ## Creating the dataset markdown file
    ## Dataset-level information
    dataset_md = "---\n"
    dataset_md += f"schema: {schema} \n"
    dataset_md += f"title: {clean_text(row['dataset'])}\n"
    dataset_md += f"organization: {clean_text(row['organization'])}\n"
    dataset_md += f"agency: {clean_text(row['agency'])}\n"
    dataset_md += f"websites: {row['websites']}\n"
    dataset_md += f"data_source: {row['url']}\n"
    dataset_md += f"description: {clean_text(row['notes'])}\n"
    dataset_md += f"last_modified: {row['last_modified']}\n"
    ## Check if any backups have metadata available and populate
    dataset_md += f"metadata_available: {metadata_available}\n"
    dataset_md += f"metadata_url: {metadata_url}\n"
    dataset_md += f"category:\n"
    dataset_md += f"  - {get_dataset_category(clean_text(row['agency']))}\n"

    dataset_md += f"resources:\n"
    ## Resource-level information
    for index, backup_row in data_backups.iterrows():
      dataset_md += f"  - id: {index}\n"
      dataset_md += f"    url: {backup_row['download_location']}\n"
      dataset_md += f"    format: {clean_text(backup_row['file_type'])}\n"
      dataset_md += f"    status: {clean_text(backup_row['status'])}\n"
      dataset_md += f"    size: {backup_row['size']}\n"
      dataset_md += f"    download_date: {backup_row['download_date']}\n"
      dataset_md += f"    maintainer: {clean_text(backup_row['maintainer'])}\n"
      dataset_md += f"    notes: {clean_text(backup_row['notes'])}\n"
    dataset_md += "---\n"
      
    ## Writing the dataset markdown file
    with open(f'{dataset_path}/{dataset_filename}.md', 'w') as output:
      output.write(dataset_md)
    
    ## Creating the organization markdown file
    org_md = "---\n"
    org_md += f"title: {clean_text(row['organization'])} \n" 
    org_md += f"description: \n" 
    org_md += "---\n"

    ## Writing the organization markdown file
    with open(f'{org_path}/{org_filename}.md', 'w') as output:
      output.write(org_md)

In [9]:
datasets.shape

(731, 8)

In [13]:
datasets.apply(create_dataset_md, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
726    None
727    None
728    None
729    None
730    None
Length: 731, dtype: object

In [10]:
import os
def remove_files_os(dir_path):
    for filename in os.listdir(dir_path):
        file_path = os.path.join(dir_path, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)


In [11]:
remove_files_os('_datasets')

In [1]:
from python.create_markdowns import *

In [2]:
create_markdowns()

In [None]:
BASEROW_ACCESS_TOKEN = 

def stringify_arr_vals(arr):
    return ';'.join([i['value'] for i in arr])

def get_results_json(url):
    table = requests.get(
        url,
        headers={
            "Authorization": f"Token {BASEROW_ACCESS_TOKEN}"
        }
    )

    res = table.json()['results']
    if table.json()['next'] is not None:
        res.extend(get_results_json(table.json()['next']))

    return res

categories = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/732/?user_field_names=true"))[['Name', 'Active']]
organizations = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/638/?user_field_names=true"))[['Organizations', 'Categories']]
organizations['Categories'] = organizations['Categories'].apply(lambda x: stringify_arr_vals(x))
categories.to_csv("baserow_exports/datarescue_categories.csv", index=False)
organizations.to_csv("baserow_exports/datarescue_organizations.csv", index=False)

In [7]:
def slugify(string):
    string = clean_text(string)
    # Remove special characters
    string = re.sub(r'[^\w\s-]', '', string)
    # Replace spaces with hyphens
    string = re.sub(r'\s+', '-', string)
    # Convert to lowercase
    string = string.lower()
    # Remove leading '-'
    string = re.sub(r'^-', '', string)
    # Remove trailing '-'
    string = re.sub(r'-$', '', string)
    # Remove leading and trailing whitespace
    string = string.strip()
    # Remove multiple hyphens
    string = re.sub(r'-+', '-', string)
    return string

In [5]:
string =  'https://efile.fara.gov/ords/fara/f?p=1381:1:7299515917868:::::'
string = string.rstrip(':')
# # Remove leading and trailing ':'
# string = string.replace(':$', '')
# string = string.replace('^:', '')

In [6]:
string

'https://efile.fara.gov/ords/fara/f?p=1381:1:7299515917868'

In [6]:
categories = pd.read_csv("https://raw.githubusercontent.com/datarescueproject/portal/refs/heads/main/baserow_exports/datarescue_categories.csv")
categories['Active'] = categories['Active'].astype(str).str.lower()

In [7]:
categories

Unnamed: 0,Name,Active
0,Agriculture,True
1,Arts & Culture,True
2,Business & Economy,True
3,Climate & Environment,True
4,Education,True
5,Energy,True
6,Finance & Budget,True
7,Health & Healthcare,True
8,Housing & Community Development,True
9,Humanitarian & Disaster Relief,True


In [9]:
categories.apply(lambda x: slugify(x['Name']), axis=1)


NameError: name 'clean_text' is not defined

In [10]:
organizations

Unnamed: 0,id,order,Organizations,Websites,Agency,Count (Websites),Datasets,Count (Datasets),Categories
0,10447,1.00000000000000000000,American Battle Monuments Commission,"[{'id': 48123, 'value': 'abmc.gov', 'order': '...","[{'id': 17, 'value': 'American Battle Monument...",3,[],0,"[{'id': 2, 'value': 'Arts & Culture', 'order':..."
1,10448,2.00000000000000000000,Barry Goldwater Scholarship and Excellence in ...,"[{'id': 48126, 'value': 'goldwaterscholarship....","[{'id': 18, 'value': 'Barry Goldwater Scholars...",1,[],0,"[{'id': 5, 'value': 'Education', 'order': '5.0..."
2,10449,3.00000000000000000000,Consumer Financial Protection Bureau,"[{'id': 48127, 'value': 'beta.cfpb.gov', 'orde...","[{'id': 19, 'value': 'Consumer Financial Prote...",10,"[{'ids': {'database_table_639': 14, 'database_...",17,"[{'id': 3, 'value': 'Business & Economy', 'ord..."
3,10450,4.00000000000000000000,Delta Regional Authority,"[{'id': 48137, 'value': 'dra.gov', 'order': '1...","[{'id': 20, 'value': 'Delta Regional Authority...",1,[],0,"[{'id': 3, 'value': 'Business & Economy', 'ord..."
4,10451,5.00000000000000000000,Denali Commission,"[{'id': 48138, 'value': 'cf.denali.gov', 'orde...","[{'id': 21, 'value': 'Denali Commission', 'ord...",4,[],0,"[{'id': 3, 'value': 'Business & Economy', 'ord..."
...,...,...,...,...,...,...,...,...,...
416,10865,418.00000000000000000000,Federal Mediation and Conciliation Service,"[{'id': 57770, 'value': 'fmcs.gov', 'order': '...","[{'id': 90, 'value': 'Federal Mediation and Co...",1,"[{'ids': {'database_table_639': 748, 'database...",40,"[{'id': 13, 'value': 'Labor & Employment', 'or..."
417,10866,419.00000000000000000000,Health Resources and Services Administration,[],"[{'id': 91, 'value': 'Health Resources and Ser...",0,[],0,"[{'id': 8, 'value': 'Health & Healthcare', 'or..."
418,10867,420.00000000000000000000,Institute of International Education,"[{'id': 57773, 'value': 'opendoorsdata.org', '...","[{'id': 92, 'value': 'Institute of Internation...",1,"[{'ids': {'database_table_639': 911, 'database...",1,"[{'id': 5, 'value': 'Education', 'order': '5.0..."
419,10868,421.00000000000000000000,U.S. Patent and Trademark Office,[],"[{'id': 93, 'value': 'U.S. Patent and Trademar...",0,[],0,"[{'id': 3, 'value': 'Business & Economy', 'ord..."


In [11]:
def get_arr_vals(arr):
    return ';'.join([i['value'] for i in arr])

In [12]:
organizations = organizations[['Organizations','Categories']]

In [13]:
organizations.Categories = organizations.Categories.apply(lambda x: get_arr_vals(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organizations.Categories = organizations.Categories.apply(lambda x: get_arr_vals(x))


In [17]:
organizations[organizations['Organizations'] == 'Denali Commission']['Categories'].str.split(';')

4    [Business & Economy, Infrastructure]
Name: Categories, dtype: object

In [2]:
organizations

NameError: name 'organizations' is not defined

In [None]:
BASEROW_ACCESS_TOKEN = 


def stringify_arr_vals(arr):
    return ';'.join([i['value'] for i in arr])


def get_results_json(url):
    table = requests.get(
        url,
        headers={
            "Authorization": f"Token {BASEROW_ACCESS_TOKEN}"
        }
    )

    res = table.json()['results']
    if table.json()['next'] is not None:
        res.extend(get_results_json(table.json()['next']))

    return res


def get_arr_vals(arr, col):
    return ", ".join([str(x[col]) for x in arr])


def check_missing_vals(field, col="value"):
    if len(field) > 0:
        val = get_arr_vals(field, col=col)
    else:
        val = ""
    
    return val


def process_dataset_row(d):
    return {
        "dataset": d["Name"],
        "notes": d["Notes"],
        "dataset_id": d["id"],
        "url": d["URL"],
        "websites": get_arr_vals(d["Websites"], col="value"),
        "organization": get_arr_vals(d["Organization"], col="value"),
        "agency": get_arr_vals(d["Agency"], col="value"),
        "categories": d["Categories"],
        "last_modified": d["Last modified"]
    }


def process_backup_row(d):
    if len(d["Dataset"]) > 0:
        if d["Metadata Available"]:
            metadata_avl = d["Metadata Available"]["value"]
        else:
            metadata_avl = ""
        return {
            "dataset": check_missing_vals(d["Dataset"], col="value"),
            "dataset_id": check_missing_vals(d["Dataset"], col="id"),
            "status": d["Status"]["value"],
            "url": check_missing_vals(d["Dataset URL"], col="value"),
            "source_website": check_missing_vals(d["Website"], col="value"),
            "organization": check_missing_vals(d["Organization"], col="value"),
            "agency": check_missing_vals(d["Agency"], col="value"),
            "download_date": d["Backup date"],
            "size": d["Backup size"],
            "maintainer": get_arr_vals(d["Maintainer"], col="value"),
            "download_location": d["Backup location"],
            "file_type": get_arr_vals(d["File type"], col="value"),
            "notes": d["Notes"],
            "metadata_available": metadata_avl,
            "metadata_url": d["Metadata URL"]
        }
    else:
        return


dataset_table = get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/639/?user_field_names=true")
backups_table = get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/640/?user_field_names=true")
categories = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/732/?user_field_names=true"))[['Name', 'Active']]
organizations = pd.DataFrame(get_results_json("https://baserow.datarescueproject.org/api/database/rows/table/638/?user_field_names=true"))[['Organizations', 'Categories']]
organizations['Categories'] = organizations['Categories'].apply(lambda x: stringify_arr_vals(x))

rows = []
for row in backups_table:
    rows.append(process_backup_row(row))

rows = [row for row in rows if row is not None]
backups = pd.DataFrame(rows)

rows = []
for row in dataset_table:
    rows.append(process_dataset_row(row))

rows = [row for row in rows if row is not None]
datasets = pd.DataFrame(rows)

datasets.to_csv("baserow_exports/datarescue_datasets.csv", index=False)
backups.to_csv("baserow_exports/datarescue_backups.csv", index=False)
categories.to_csv("baserow_exports/datarescue_categories.csv", index=False)
organizations.to_csv("baserow_exports/datarescue_organizations.csv", index=False)

In [17]:
a = "[{'id': 4, 'value': 'Climate & Environment', 'order': '4.00000000000000000000'}, {'id': 8, 'value': 'Health & Healthcare', 'order': '8.00000000000000000000'}]"

if eval(a):
    cats = [b['value'] for b in eval(a)]
else:
    print("No categories found")


In [18]:
cats

['Climate & Environment', 'Health & Healthcare']