In [21]:
import pprint
import json
import requests
import os

import pandas as pd
import gzip


In [15]:
# Locate the repository root by walking up from the current working
# directory until a directory named `repo_name` is reached.
repo_name = 'Fed_IT_Employment'
root = os.getcwd()
while os.path.basename(root) != repo_name:
    parent = os.path.dirname(root)
    # os.path.dirname() of the filesystem root returns the root itself, so
    # without this check the loop would spin forever whenever the notebook
    # is started from outside the repository.
    if parent == root:
        raise FileNotFoundError(
            f"Could not find a directory named '{repo_name}' at or above {os.getcwd()}"
        )
    root = parent

# Raw data directory
rdir = os.path.join(root, 'data', 'raw_data')

# Processed data directory
pdir = os.path.join(root, 'data', 'processed_data')

print(f"Base directory: {root}\nRaw data directory: {rdir}")

Base directory: /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment
Raw data directory: /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment/data/raw_data


## Set up IPUMS API

In [3]:
# IPUMS API settings: the data collection to query and the API version
collection = 'usa'
version = '2'

# The API key lives in api_keys.json at the repository root, not in the notebook
api_keys_path = os.path.join(root, 'api_keys.json')
with open(api_keys_path) as f:
    api_keys = json.load(f)
IPUMS_api_key = api_keys['IPUMS_api_key']

# Endpoint used both to list existing extracts (GET) and to submit new ones (POST)
url = f'https://api.ipums.org/extracts?collection={collection}&version={version}'

# Headers sent with every request: the API key plus a JSON content type
headers = {}
headers['Authorization'] = f'{IPUMS_api_key}'
headers['Content-Type'] = 'application/json'

# Check past extract requests for this collection

In [4]:

# Ask the API for the extracts already submitted for this collection
response = requests.get(url, headers=headers)

if response.status_code != 200:
    # Any non-200 status is reported together with the raw response body
    print(f"Failed to retrieve extracts. Status code: {response.status_code}")
    print("Response:", response.text)
else:
    data = response.json()
    # Unwrap the payload when the extracts are nested under a 'data' key
    data = data['data'] if 'data' in data else data
    print("Extracts:")
    print(f"...Type: {type(data)}")
    print(f"...Length: {len(data)}")
    pprint.pprint(data)

Extracts:
...Type: <class 'list'>
...Length: 4
[{'downloadLinks': {'basicCodebook': {'bytes': 57446,
                                      'sha256': 'e4170ae42103dbdbad82d473d6ec10d93d5d71f37ad764985040251cb7c496ee',
                                      'url': 'https://api.ipums.org/downloads/usa/api/v1/extracts/2351804/usa_00004.cbk'},
                    'data': {'bytes': 221741052,
                             'sha256': '3bfad877e53cd1fa8177d3153467535a018ccd930844e376c0809bbe4b798725',
                             'url': 'https://api.ipums.org/downloads/usa/api/v1/extracts/2351804/usa_00004.dat.gz'},
                    'ddiCodebook': {'bytes': 152907,
                                    'sha256': '1747b3677594d8cb843472b9cb883b37e169fe39103471f3b332af40e0f3bd66',
                                    'url': 'https://api.ipums.org/downloads/usa/api/v1/extracts/2351804/usa_00004.xml'},
                    'rCommandFile': {'bytes': 406,
                                     'sha256': '

# Make an extract request

In [5]:

# NOTE: this cell POSTs a new extract request each time it is run.

# One ACS sample every five years: us2005a, us2010a, us2015a, us2020a
acs_samples = {f'us{y}a': {} for y in range(2005, 2021, 5)}

# Extract definition: person-level rectangular file in fixed-width format.
# Each variable appears exactly once (the original listed STATEFIP twice;
# Python silently keeps only the last duplicate key in a dict literal).
payload = {
    "description": "ACS Extract Attempt",
    "dataStructure": {
        "rectangular": {
            "on": "P"
        }
    },
    "dataFormat": "fixed_width",
    "samples": acs_samples,
    "variables": {
        "AGE": {},
        "SEX": {},
        "RACE": {},
        "STATEFIP": {},
        "EDUC": {},
        "OCC2010": {},
        "INDNAICS": {},
        "INCTOT": {},
        "SPMPOV": {},
        "OFFPOV": {},
        "DISABWRK": {},
        "PERWT": {}
    }
}

response = requests.post(url, headers=headers, json=payload)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    # The extract number identifies this request in later status checks
    extract_number = data['number']
    print("Extract number:", extract_number)
    print("Status:", data['status'])
    print("Download Links:", data.get('downloadLinks', {}))
else:
    print("Failed to create extract. Status code:", response.status_code)
    print("Response:", response.text)


Extract number: 5
Status: queued
Download Links: {}


# Check the status of the request; if several extracts exist, select the most recent one

In [12]:
# Fetch the extracts for this collection and report the status of the latest one.
# On success this cell sets `out` (the selected extract's record) and
# `extract_number`, both of which later cells read.
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    out = response.json()

    if 'data' in out:
        extracts = out['data']
        if not extracts:
            raise ValueError("No extracts found for this collection.")

        # Select the extract with the highest number. Using max() also works
        # when the only extract is number 1; the previous manual scan started
        # at 1 with a strict '>' comparison and never selected that extract.
        out = max(extracts, key=lambda extract: extract['number'])
        extract_number = out['number']
        if len(extracts) > 1:
            print(f"Multiple extracts found. Using the latest extract: {extract_number}")
    else:
        # Response describes a single extract rather than a list
        extract_number = out['number']

    print(f"Extraction status for extract number {extract_number} is:\n{out['status']}\n(Options: `queued`, `started`, `produced` `canceled`, `failed` or `completed`)")
else:
    print("Failed to fetch extract details. Status code:", response.status_code)
    print("Response:", response.text)


Multiple extracts found. Using the latest extract: 5
Extraction status for extract number 5 is:
completed
(Options: `queued`, `started`, `produced` `canceled`, `failed` or `completed`)


# Get download link for data if ready

In [27]:
# Pull the download URLs out of the selected extract record (`out`).
# Optional links are looked up tolerantly so that one missing key does not
# abort the cell; a missing link is stored as None, which the download loop
# skips via its `if item["url"]` check.
download_links = out.get('downloadLinks', {})

if 'data' in download_links:
    data_url = download_links['data']['url']
    codebook_url = download_links.get('basicCodebook', {}).get('url')
    ddi_url = download_links.get('ddiCodebook', {}).get('url')
    # NOTE(review): 'dataFile' is not among the downloadLinks keys visible in
    # the (truncated) extract listing printed earlier in this notebook —
    # confirm whether the API ever returns it.
    dat_url = download_links.get('dataFile', {}).get('url')
    r_extract_script = download_links.get('rCommandFile', {}).get('url')
    print(f"Data Extract is Ready.\nDownload link: {data_url}")
else:
    # Reset the URLs so that stale values from an earlier run are not reused
    data_url = codebook_url = ddi_url = dat_url = r_extract_script = None
    print("Data Extract is not ready")

Data Extract is Ready.
Download link: https://api.ipums.org/downloads/usa/api/v1/extracts/2352689/usa_00005.dat.gz


# Download Data

In [28]:

# Download the codebook, DDI and data files for the selected extract.

# The download requests carry only the API key
headers = {
    'Authorization': f'{IPUMS_api_key}'
}

# All downloaded files go into <processed data dir>/IPUMS; create it once up front
ipums_dir = os.path.join(pdir, 'IPUMS')
os.makedirs(ipums_dir, exist_ok=True)

data_fname = os.path.join(ipums_dir, f'IPUMS_{collection}_v{version}_extract{extract_number}.gz')
cb_fname = os.path.join(ipums_dir, f'IPUMS_{collection}_v{version}_extract{extract_number}_codebook.cbk')
cb_fnmae = cb_fname  # misspelled original name, kept in case other cells still reference it
ddi_fname = os.path.join(ipums_dir, f'IPUMS_{collection}_v{version}_extract{extract_number}_ddi.xml')

# List of URLs and corresponding filenames
downloads = [
    {"url": codebook_url, "fname": cb_fname, "desc": "Codebook"},
    {"url": ddi_url, "fname": ddi_fname, "desc": "DDI"},
    {"url": data_url, "fname": data_fname, "desc": "Data"}
]

# Loop through each download item
for item in downloads:
    # Skip entries that have no URL
    if not item["url"]:
        continue

    # The context manager releases the streamed connection even when the
    # body is never read (e.g. on a non-200 response).
    with requests.get(item["url"], headers=headers, stream=True) as response:
        if response.status_code == 200:
            # Write the response content to disk in chunks
            with open(item["fname"], 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"{item['desc']} downloaded successfully to {item['fname']}")
        else:
            print(f"Failed to download the {item['desc']}. Status code: {response.status_code}")


Codebook downloaded successfully to /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment/data/processed_data/IPUMS/IPUMS_usa_v2_extract5_codebook.cbk
DDI downloaded successfully to /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment/data/processed_data/IPUMS/IPUMS_usa_v2_extract5_ddi.xml
R script downloaded successfully to https://api.ipums.org/downloads/usa/api/v1/extracts/2352689/usa_00005.R
File downloaded successfully to /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment/data/processed_data/IPUMS/IPUMS_usa_v2_extract5.gz


## Process the GZ file using the ipumsr R package
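The next cell runs the R script `process_IPUMS_gz_file.R`, which uses the DDI codebook to read the downloaded `.gz` data file and saves the data as a CSV file.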

In [63]:
# Prepare the three arguments for process_IPUMS_gz_file.R and run it.
# File name of the DDI codebook, without its directory
ddi_fname_no_path = os.path.basename(ddi_fname)

# File name of the downloaded data file, without its directory
data_fname_no_path = os.path.basename(data_fname)

# Directory holding both files, given relative to the repository root
pdir_after_root = os.path.join( 'data', 'processed_data','IPUMS')

# Run the R script through IPython's `!` shell escape; the `$name` tokens are
# replaced with the values of the Python variables above. Arguments, in order:
# data directory, DDI file name, data file name.
# NOTE(review): the values are not quoted on the command line, so a value
# containing spaces would presumably be split into several arguments — confirm.
!Rscript process_IPUMS_gz_file.R $pdir_after_root $ddi_fname_no_path $data_fname_no_path


[1] "Starting the process_IPUMS_gz_file.R script..."
[1] "Successfully changed to the root directory: "                                                               
[2] "/Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment"
Changing the working directory to: data/processed_data/IPUMS 
Working directory changed to: /Users/coltonlapp/Dropbox/My Mac (Coltons-MacBook-Pro.local)/Desktop/Work/USDC/publicwork/Fed_IT_Employment/data/processed_data/IPUMS 
Loading required package: ipumsr
package ‘ipumsr’ was built under R version 4.3.3 
[?25h[?25h[1] "Successfully loaded the ipumsr package"
[?25h[1] "Reading the DDI file..."
[?25h[?25h[1] "Reading the data file..."
[?25hUse of data from IPUMS USA is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.
[?25h[?25hSaving the data to a csv file: IPUMS_usa_v2_extract5.csv 
[?25h[?25h[?25h