In [7]:
import requests
from pathlib import Path

# fetch_and_cache builds the path to a data directory and makes sure that directory exists
# (required before files can be written into it). It then downloads the file from its URL,
# unless a previously downloaded copy is already cached there.

def fetch_and_cache(data_url, file, data_dir="data", force=False, timeout=30):
    """
    Download a URL into a local cache directory and return the cached file's path.

    If the file is already cached and `force` is false, nothing is downloaded.
    A response with an HTTP error status raises instead of being written to the
    cache, so an error page is never mistaken for the real file later on, and a
    failed forced re-download leaves the previously cached copy untouched.

    data_url: the web address to download
    file: the file in which to save the results.
    data_dir: (default="data") the location to save the data; it is created,
        together with any missing parent directories, if it does not exist
    force: if true the file is always re-downloaded
    timeout: (default=30) seconds to wait on the server before giving up

    return: The pathlib.Path to the file.
    raises: whatever `requests.get` raises on connection problems/timeouts, and
        whatever `raise_for_status()` raises for a 4xx/5xx response.
    """
    data_dir = Path(data_dir)
    # parents=True so a nested location such as "cache/raw" works on first use.
    data_dir.mkdir(parents=True, exist_ok=True)
    file_path = data_dir / Path(file)

    if force or not file_path.exists():
        print('Downloading...', end=' ')
        resp = requests.get(data_url, timeout=timeout)
        # Fail loudly on an error status rather than caching the error page.
        resp.raise_for_status()
        # Only now that good content is in hand, delete the stale copy
        # (force=True case) and write the new one.
        if file_path.exists():
            file_path.unlink()
        with file_path.open('wb') as f:
            f.write(resp.content)
        print('Done!')
    else:
        import time
        # st_mtime is when the contents were last written, i.e. the download
        # time; st_ctime is the metadata-change time on Unix.
        downloaded = time.ctime(file_path.stat().st_mtime)
        print("Using cached version downloaded at", downloaded)
    return file_path

In [9]:
# Address of the names-by-state zip archive; it is downloaded once and then
# served from the local cache on later runs.
data_url = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'
namesbystate_path = fetch_and_cache(data_url=data_url, file='namesbystate.zip')

Using cached version downloaded at Mon May 26 15:28:02 2025


In [11]:
import zipfile
# Open the cached archive read-only, using the path returned by the download step.
zf = zipfile.ZipFile(namesbystate_path, mode='r')

# Column names applied to the header-less files stored in the archive.
column_labels = [
    'State',
    'Sex',
    'Year',
    'Name',
    'Count',
]

def load_dataframe_from_zip(zf, f):
    """
    Read one header-less CSV member of an open zip archive into a DataFrame.

    zf: the archive object; its `open` method is used to read the member
    f: the member to read, passed straight through to `zf.open`

    return: a pandas DataFrame whose columns are named by the module-level
            `column_labels` list.
    """
    with zf.open(f) as member:
        # header=None: the first line is data, not column names.
        frame = pd.read_csv(member, names=column_labels, header=None)
    return frame

# the following line will call load_dataframe_from_zip for every txt file in the zip file and return a list of pd dataframes (one per state)
# Load every .TXT member of the archive (one per state), in filename order,
# into its own DataFrame.
states = [
    load_dataframe_from_zip(zf, f)
    for f in sorted(zf.filelist, key=lambda x: x.filename)
    if f.filename.endswith('.TXT')
]

# Stack all per-state frames with a single concat call: concatenating inside a
# loop re-copies the accumulated rows on every iteration. ignore_index=True
# replaces the repeated per-file row labels with one unique 0..n-1 index.
baby_names = pd.concat(states, ignore_index=True)

BadZipFile: File is not a zip file