In [33]:
import os

import pandas as pd
import requests

In [34]:
# Locations of the two CSV exports this notebook reads.
backups_csv_url = "https://raw.githubusercontent.com/kaushik12/DataRescue_jkan/refs/heads/main/baserow_exports/datarescue_backups.csv"
datasets_csv_url = "https://raw.githubusercontent.com/kaushik12/DataRescue_jkan/refs/heads/main/baserow_exports/datarescue_datasets.csv"

# Load each export into its own DataFrame.
backups = pd.read_csv(backups_csv_url)
datasets = pd.read_csv(datasets_csv_url)

In [35]:
# Show the dtype pandas inferred for each column of the datasets table.
datasets.dtypes

dataset             object
notes               object
dataset_id           int64
url                 object
websites            object
organization        object
agency              object
last_modified       object
last_modified_by    object
dtype: object

In [37]:
# Lower-case every column name and turn missing values into empty strings,
# so the string helpers used later never receive NaN.
datasets = datasets.rename(columns=str.lower).fillna('')
datasets.head()

Unnamed: 0,dataset,notes,dataset_id,url,websites,organization,agency,last_modified,last_modified_by
0,Billion-Dollar Weather and Climate Disasters,,1,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10,Cataloger
1,American Communities Survey (ACS),,3,https://www.census.gov/programs-surveys/acs,census.gov,Census Bureau,Department of Commerce,2025-03-03,Cataloger
2,BLS Downloads,,6,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-10,Admin
3,CDC FTP,,7,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-10,Admin
4,US Census Bureau FTP,,8,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-11,Admin


In [59]:
# Distinct agency values of the datasets table, most frequent first.
agencies = datasets['agency'].value_counts().index

# Category slugs (not referenced by the other cells shown in this notebook).
categories = [
    'arts-culture-history',
    'health-human-services',
    'budget-finance',
    'parks-recreation',
    'economy',
    'planning-zoning',
    'education',
    'public-safety',
    'elections-politics',
    'real-estate-land-records',
    'environment',
    'transportation',
    'food',
    'uncategorized',
]


In [73]:
# Lookup table from a dataset's `agency` value to the category label that is
# written into the dataset's markdown file (read by get_dataset_categories).
# Keys must match the agency text exactly:
#   * '' is the value missing agencies take after `fillna('')`;
#   * rows listing several agencies use the whole comma-separated string as
#     one key, so each such combination needs its own entry.
agency_to_category = {
    'Department of Health and Human Services': 'Health / Human Services',
    'Department of Commerce': 'Economy',
    'Department of Housing and Urban Development': 'Real Estate / Land Records',
    'Department of Veterans Affairs': 'Health / Human Services',
    'National Endowment for the Humanities': 'Arts / Culture / History',
    'AmeriCorps': 'Public Safety',
    'Department of Education': 'Education',
    'Federal Mediation and Conciliation Service': 'Economy',
    'Department of Homeland Security': 'Public Safety',
    'Department of Energy': 'Environment',
    'National Labor Relations Board': 'Economy',
    'Environmental Protection Agency': 'Environment',
    'Consumer Financial Protection Bureau': 'Budget / Finance',
    'Federal Housing Finance Agency': 'Real Estate / Land Records',
    'Department of the Treasury': 'Budget / Finance',
    'Institute of Museum and Library Services': 'Arts / Culture / History',
    'Department of the Interior': 'Parks / Recreation',
    'General Services Administration': 'Economy',
    'Department of Labor': 'Economy',
    'U.S. Agency for International Development': 'Health / Human Services',
    'Department of Transportation': 'Transportation',
    'National Aeronautics and Space Administration': 'Environment',
    '': 'Uncategorized',
    'Department of Justice': 'Public Safety',
    'Department of the Interior, National Parks Service': 'Parks / Recreation',
    'Department of State': 'Elections / Politics',
    'National Science Foundation': 'Education',
    'Department of Health and Human Services, Department of Commerce': 'Health / Human Services',
    'Consumer Financial Protection Bureau, Federal Housing Finance Agency': 'Budget / Finance',
    'U.S. Department of Agriculture': 'Food',
    'Office of Management and Budget': 'Budget / Finance'
}

In [38]:
# Lower-case every column name and turn missing values into empty strings,
# mirroring the clean-up applied to the datasets table.
backups = backups.rename(columns=str.lower).fillna('')
backups.head()

Unnamed: 0,dataset,dataset_id,status,url,source_website,organization,agency,download_date,size,maintainer,download_location,file_type,notes,metadata_available,metadata_url
0,Billion-Dollar Weather and Climate Disasters,1,Finished,https://www.ncei.noaa.gov/access/billions/mapping,ncei.noaa.gov,National Oceanic and Atmospheric Administration,Department of Commerce,2025-02-10,0.15,HD,https://dataverse.harvard.edu/dataset.xhtml?pe...,ZIP,,yes,https://dataverse.harvard.edu/dataset.xhtml?pe...
1,BLS Downloads,6,Finished,https://download.bls.gov,download.bls.gov,Bureau of Labor Statistics,Department of Labor,2025-02-01,47.0,DRP,,,,,
2,CDC FTP,7,Finished,https://ftp.cdc.gov/,ftp.cdc.gov,Centers for Disease Control and Prevention,Department of Health and Human Services,2025-02-01,213.0,DRP,,,,,
3,US Census Bureau FTP,8,Finished,ftp://ftp.census.gov,census.gov,Census Bureau,Department of Commerce,2025-02-01,180.0,DRP,,,"Partial download, server is back online but co...",,
4,National Hurricane Center (NHC),9,Finished,https://www.nhc.noaa.gov/archive,nhc.noaa.gov,NOAA/National Hurricane Center,Department of Commerce,2025-02-06,61.0,DRP,,,,,


In [70]:
import re
def slugify(string):
    """Turn free text into a lowercase, hyphen-separated slug.

    The text is first passed through ``clean_text``; then every character
    that is not a word character, whitespace or a hyphen is dropped, each run
    of whitespace becomes a single hyphen, and the result is lower-cased.

    Args:
        string: Text to convert.

    Returns:
        The slug as a string.
    """
    cleaned = clean_text(string)
    # Keep only word characters, whitespace and hyphens.
    without_specials = re.sub(r'[^\w\s-]', '', cleaned)
    # Collapse each whitespace run into one hyphen.
    hyphenated = re.sub(r'\s+', '-', without_specials)
    return hyphenated.lower()

def clean_text(string):
    """Strip characters that should not appear in a single-line text value.

    In order: newline, carriage-return and tab characters are deleted, one
    leading '-' (if present) is removed, and finally every ':' is deleted.
    URL prefixes such as ``https://`` are not treated specially.

    Args:
        string: Text to clean.

    Returns:
        The cleaned string.
    """
    # Delete line breaks and tabs (they are removed, not replaced by spaces).
    flattened = string.translate(str.maketrans('', '', '\n\r\t'))
    # Remove a single hyphen at the very start of the text.
    unprefixed = re.sub(r'^-', '', flattened)
    # Colons are deleted outright, after the leading-hyphen step.
    return unprefixed.replace(':', '')

def get_dataset_categories(agency, mapping=None, default='Uncategorized'):
    """Return the category label for an agency name.

    Args:
        agency: Agency name to look up.
        mapping: Optional dict of agency name -> category label. When omitted,
            the notebook-level ``agency_to_category`` dict is used.
        default: Label returned when ``agency`` has no entry in the mapping.

    Returns:
        The mapped category label, or ``default`` for an unmapped agency.
    """
    if mapping is None:
        mapping = agency_to_category
    # The agency values come from a CSV fetched at run time, so a value that is
    # not in the table yet should not abort the whole run with a KeyError.
    return mapping.get(agency, default)

In [79]:
def get_metadata_availability(dataset_id, backups_df=None):
    """Report whether any backup of a dataset has metadata, and where.

    Args:
        dataset_id: Value compared against the ``dataset_id`` column.
        backups_df: Optional DataFrame with ``dataset_id``,
            ``metadata_available`` and ``metadata_url`` columns. When omitted,
            the notebook-level ``backups`` DataFrame is used.

    Returns:
        A ``(status, url)`` tuple:

        * ``("Yes", url)`` when at least one backup row of the dataset has
          ``metadata_available == "yes"``; ``url`` is the first non-empty
          ``metadata_url`` among those rows, or ``""`` if all are empty;
        * ``("Under Review", "")`` when no row says "yes" but at least one
          says "needs review";
        * ``("No", "")`` otherwise, including when the dataset has no backups.
    """
    if backups_df is None:
        backups_df = backups

    # Filter once and reuse the result for both the status and the URL.
    dataset_backups = backups_df[backups_df.dataset_id == dataset_id]
    availability = dataset_backups.metadata_available

    with_metadata = dataset_backups[availability == "yes"]
    if not with_metadata.empty:
        # Take the URL from a row that actually reports metadata; the first
        # backup row of the dataset may be a different row with no URL.
        urls = [url for url in with_metadata.metadata_url.tolist() if url]
        return "Yes", (urls[0] if urls else "")
    if (availability == "needs review").any():
        return "Under Review", ""
    return "No", ""

def create_dataset_md(row):
    """Write the markdown files for one dataset row and its organization.

    Two files are produced, each a block of ``key: value`` lines delimited by
    ``---`` lines:

    * ``_datasets/<slug of dataset name>.md`` with the dataset fields, the
      metadata availability, the category and one ``resources`` entry per
      row of ``backups`` that has the same ``dataset_id``;
    * ``_organizations/<slug of organization>.md`` with the organization
      title and an empty description.

    Existing files with the same names are overwritten, and the two output
    directories are created if they do not exist yet.

    Args:
        row: A row of ``datasets`` (as passed by ``DataFrame.apply(axis=1)``),
            read through the keys ``dataset``, ``dataset_id``,
            ``organization``, ``agency``, ``websites``, ``url``, ``notes``,
            ``last_modified`` and ``last_modified_by``.

    Returns:
        None.
    """
    # Use a local value rather than assigning into `row`, so the object
    # handed in by `apply` is never modified.
    organization = row['organization'] if row['organization'] != '' else 'Unknown'

    ## Defining the schema, filename and path
    schema = 'data_rescue_project'
    dataset_path = "_datasets"
    org_path = "_organizations"
    dataset_filename = slugify(row['dataset'])
    org_filename = slugify(organization)

    ## Get backups for each dataset
    # Match on dataset_id, the same key get_metadata_availability uses, so the
    # resources list and the metadata fields describe the same backup rows.
    data_backups = backups[backups.dataset_id == row['dataset_id']]
    metadata_available, metadata_url = get_metadata_availability(row['dataset_id'])

    ## Dataset-level information
    # Trailing spaces inside some literals are kept so the generated files
    # stay byte-identical to earlier runs.
    lines = [
        "---",
        f"schema: {schema} ",
        f"title: {clean_text(row['dataset'])}",
        f"organization: {clean_text(organization)}",
        f"agency: {clean_text(row['agency'])}",
        f"websites: {row['websites']}",
        f"data_source: {row['url']}",
        f"description: {clean_text(row['notes'])}",
        f"last_modified: {row['last_modified']}",
        f"last_modified_by: {row['last_modified_by']}",
        f"metadata_available: {metadata_available}",
        f"metadata_url: {metadata_url}",
        "category:",
        # The helper is defined as get_dataset_categories (plural).
        f"  - {get_dataset_categories(clean_text(row['agency']))}",
        "resources:",
    ]

    ## Resource-level information
    for index, backup_row in data_backups.iterrows():
        lines.extend([
            f"  - id: {index}",
            f"    url: {backup_row['download_location']}",
            f"    format: {clean_text(backup_row['file_type'])}",
            f"    status: {clean_text(backup_row['status'])}",
            f"    size: {backup_row['size']}",
            f"    download_date: {backup_row['download_date']}",
            f"    maintainer: {clean_text(backup_row['maintainer'])}",
            f"    notes: {clean_text(backup_row['notes'])}",
        ])
    lines.append("---")
    dataset_md = "\n".join(lines) + "\n"

    ## Writing the dataset markdown file
    os.makedirs(dataset_path, exist_ok=True)
    # Explicit UTF-8 so non-ASCII names do not depend on the platform default.
    with open(f'{dataset_path}/{dataset_filename}.md', 'w', encoding='utf-8') as output:
        output.write(dataset_md)

    ## Creating the organization markdown file
    org_md = (
        "---\n"
        f"title: {clean_text(organization)} \n"
        "description: \n"
        "---\n"
    )

    ## Writing the organization markdown file
    os.makedirs(org_path, exist_ok=True)
    with open(f'{org_path}/{org_filename}.md', 'w', encoding='utf-8') as output:
        output.write(org_md)

In [75]:
# (rows, columns) of the datasets table.
datasets.shape

(708, 9)

In [81]:
# Run create_dataset_md once per dataset row to write the markdown files.
# The call is made for its side effects: the function returns nothing, so the
# displayed Series contains only None.
datasets.apply(create_dataset_md, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
703    None
704    None
705    None
706    None
707    None
Length: 708, dtype: object