In [1]:
from ipumspy import readers, ddi, IpumsApiClient, AggregateDataExtract, Dataset, DatasetMetadata
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import os
from matplotlib.ticker import FuncFormatter
import arcgis
from arcgis.gis import GIS
from arcgis.map import Map
from arcgis.raster import Raster
from arcgis.features import GeoAccessor
from arcgis.map.renderers import (
    ClassBreaksRenderer,
    ClassBreakInfo,
    UniqueValueRenderer,
    UniqueValueInfo,
    SizeInfoVisualVariable,
)

from arcgis.map.symbols import SimpleLineSymbolEsriSLS, SimpleFillSymbolEsriSFS

import geopandas as gpd
import glob
from zipfile import ZipFile
import earthaccess as ea
import requests
import pprint

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Show the installed arcgis version; the mapping cells in this notebook
# import from the arcgis.map module, so the version matters for reproducing them.

arcgis.__version__

'2.4.1'

In [2]:
# Load the IPUMS API key from the .env file and create the API client.

load_dotenv()

IPUMS_API_KEY = os.getenv("IPUMS_API_KEY")
# os.getenv returns None when the variable is absent; fail here with a clear
# message instead of building a client that cannot authenticate.
if not IPUMS_API_KEY:
    raise RuntimeError(
        "IPUMS_API_KEY is not set. Add it to your .env file or environment before running this notebook."
    )
ipums = IpumsApiClient(IPUMS_API_KEY)

In [None]:
# Describe the NHGIS extract to request from the IPUMS portal:
# tables H1 and H3 at block level for the 2010 and 2020 datasets,
# restricted to extent "720", plus the 2020 block shapefile.
# The shapefile name comes from the metadata-catalog query in the next cell.

housing_tables = ["H1", "H3"]

extract = AggregateDataExtract(
    collection="nhgis",
    description="Puerto Rico 2010-2020 vacancy",
    datasets=[
        Dataset(name=dataset_name, data_tables=list(housing_tables), geog_levels=["block"])
        for dataset_name in ("2010_SF1a", "2020_DHCa")
    ],
    geographic_extents=["720"],
    shapefiles=["720_block_2020_tl2020"],
)

In [30]:
# List the block-level shapefiles available for Puerto Rico so the right
# shapefile name can be put into the extract definition.

# NOTE(review): `ds` is not used by any other cell visible in this notebook;
# it is kept in case something outside this view relies on it.
ds = ipums.get_metadata(DatasetMetadata("nhgis", "2010_SF1a"))

# The loop variable is deliberately NOT called `shapefile`: that name holds the
# path of the downloaded shapefile archive in a later cell, and reusing it here
# would overwrite that path whenever this cell is run afterwards.
for page in ipums.get_metadata_catalog("nhgis", metadata_type="shapefiles"):
    for shp_meta in page["data"]:
        if shp_meta["extent"] == "Puerto Rico" and shp_meta["geographicLevel"] == "Block":
            print(f"Name: {shp_meta['name']} | Year: {shp_meta['year']}")

Name: 720_block_2000_tl2010 | Year: 2000
Name: 720_block_2010_tl2010 | Year: 2010
Name: 720_block_2010_tl2020 | Year: 2010
Name: 720_block_2020_tl2020 | Year: 2020


In [None]:
# Submit the extract request and report the ID assigned to it
ipums.submit_extract(extract)
print(f"Extract ID: {extract.extract_id}")

# Block until the extract has finished processing
ipums.wait_for_extract(extract)

# Download the extract into ./data under the current working directory.
# os.path.join is given the separate components so it actually builds the path
# (joining a single pre-formatted string is a no-op).
current = os.getcwd()
DOWNLOAD_DIR = os.path.join(current, "data")
os.makedirs(DOWNLOAD_DIR, exist_ok=True)  # make sure the target folder exists
ipums.download_extract(extract, download_dir=DOWNLOAD_DIR)

Extract ID: 13


In [3]:
# Locate the downloaded extract archives (CSV tables + shapefile) in ./data
# and record the member names inside each archive.

current = os.getcwd()
DOWNLOAD_DIR = os.path.join(current, "data")

# os.listdir returns entries in arbitrary order; sort so that the same archive
# is picked on every run when several extracts are present.
file_list = sorted(os.listdir(DOWNLOAD_DIR))
csv_zip = [f for f in file_list if f.endswith('_csv.zip')]
shape_zip = [f for f in file_list if f.endswith('_shape.zip')]

# Fail with an actionable message instead of a bare IndexError below.
if not csv_zip or not shape_zip:
    raise FileNotFoundError(
        f"Expected a '*_csv.zip' and a '*_shape.zip' in {DOWNLOAD_DIR}; "
        f"found csv={csv_zip!r}, shape={shape_zip!r}. Download the extract first."
    )

csv = os.path.join(DOWNLOAD_DIR, csv_zip[0])
shapefile = os.path.join(DOWNLOAD_DIR, shape_zip[0])

# Use context managers so the archive handles are closed after listing.
with ZipFile(csv) as csv_archive:
    csv_data = csv_archive.namelist()
with ZipFile(shapefile) as shape_archive:
    shape_data = shape_archive.namelist()

In [4]:
# Show the member names of the CSV archive, then of the shapefile archive
for members in (csv_data, shape_data):
    print(members)

['nhgis0013_csv/nhgis0013_ds258_2020_block.csv', 'nhgis0013_csv/nhgis0013_ds258_2020_block_codebook.txt', 'nhgis0013_csv/nhgis0013_ds172_2010_block.csv', 'nhgis0013_csv/nhgis0013_ds172_2010_block_codebook.txt']
['nhgis0013_shape/nhgis0013_shapefile_tl2020_720_block_2020.zip']


In [5]:
# The shapefile download is a zip inside a zip: open the outer archive, open
# its first member as a file object, treat that as the inner archive, and
# unpack the inner archive into the data folder.
with ZipFile(shapefile) as outer_archive, \
        outer_archive.open(shape_data[0]) as inner_stream, \
        ZipFile(inner_stream) as inner_archive:
    inner_archive.extractall(DOWNLOAD_DIR)
            

In [5]:
# Read the 2020 and 2010 block-level tables directly from the CSV archive.

def _block_csv_for_year(members, year):
    """Return the one archive member that holds the block table for `year`.

    Members are matched on the ``_<year>_block.csv`` suffix rather than on
    their position in the archive listing, so the result does not depend on
    the order in which the archive happens to store its files.

    Raises:
        ValueError: if zero or more than one member matches.
    """
    suffix = f"_{year}_block.csv"
    matches = [name for name in members if name.endswith(suffix)]
    if len(matches) != 1:
        raise ValueError(f"Expected exactly one '*{suffix}' member, found {matches!r}")
    return matches[0]


with ZipFile(csv) as z:
    with z.open(_block_csv_for_year(csv_data, 2020)) as f:
        df_2020 = pd.read_csv(f)
    with z.open(_block_csv_for_year(csv_data, 2010)) as f:
        df_2010 = pd.read_csv(f)

In [6]:
# Load the extracted block shapefile from ./data into a spatially enabled DataFrame.
# NOTE(review): sr=3857 is passed as the spatial reference — presumably to get
# Web Mercator coordinates; confirm against the from_featureclass documentation.
shp_df = pd.DataFrame.spatial.from_featureclass(location = "./data/PR_block_2020.shp", sr = 3857)

Use Earthdata to download the NASA-published dataset to a local folder.

In [None]:
# # Log in to Earthdata
# # Store your Earthdata credentials (EARTHDATA_USERNAME / EARTHDATA_PASSWORD) in the same .env file as the IPUMS API key
# auth = ea.login(strategy= "environment")

## Use earthaccess to search for the LECZ dataset

# url = "https://cmr.earthdata.nasa.gov/search/collections?concept_id[]=C123456-LPDAAC_ECS"

# query = ea.search_datasets(keyword = "LECZ")
# for collection in query[:10]:
#     pprint.pprint(collection.summary(), sort_dicts=True, indent=4)
#     print('')  # Add a space between collections for readability

# #Download the data through the Earthdata
# Query = (
#     ea.granule_query()
#     .short_name("CIESIN_SEDAC_LECZ_URPLAEV3")
#     .debug(True)
# )

# print(f"Granule hits: {Query.hits()}")

# # Get the first 10 granules
# granules = Query.get(10)

# #Check if the granules are cloud hosted
# granules[1].cloud_hosted
# download_file = ea.download(granules[1], local_path="./data")

#Load data to the notebook

# tiff = [f for f in file_list if f.endswith('-geotiff.zip')]

# tiff_path = os.path.join(f"{DOWNLOAD_DIR}/{tiff[0]}")


# with ZipFile(tiff_path) as z:
#     z.extractall(DOWNLOAD_DIR)

In [8]:
# Compare (rows, columns) of the 2010 table, the 2020 table and the block shapefile
df_2010.shape, df_2020.shape, shp_df.shape

((77189, 62), (41987, 63), (41320, 19))

In [None]:
# Preview the first rows of the 2020 block table
df_2020.head()

Unnamed: 0,GISJOIN,YEAR,STUSAB,GEOID,GEOCODE,REGIONA,DIVISIONA,STATE,STATEA,COUNTY,...,NAME,FUNCSTAT,INTPTLAT,INTPTLON,LSADC,UGA,U9V001,U9X001,U9X002,U9X003
0,G72000109563001000,2020,PR,1000000US720019563001000,720019563001000,9,0,Puerto Rico,72,Adjuntas Municipio,...,Block 1000,S,18.231874,-66.771935,BK,99999,68,68,60,8
1,G72000109563001001,2020,PR,1000000US720019563001001,720019563001001,9,0,Puerto Rico,72,Adjuntas Municipio,...,Block 1001,S,18.220806,-66.776888,BK,99999,46,46,45,1
2,G72000109563001002,2020,PR,1000000US720019563001002,720019563001002,9,0,Puerto Rico,72,Adjuntas Municipio,...,Block 1002,S,18.221548,-66.782006,BK,99999,22,22,11,11
3,G72000109563001003,2020,PR,1000000US720019563001003,720019563001003,9,0,Puerto Rico,72,Adjuntas Municipio,...,Block 1003,S,18.218566,-66.764885,BK,99999,43,43,32,11
4,G72000109563001004,2020,PR,1000000US720019563001004,720019563001004,9,0,Puerto Rico,72,Adjuntas Municipio,...,Block 1004,S,18.213141,-66.761705,BK,99999,31,31,31,0


In [7]:
# Give the NHGIS table codes human-readable column names and attach the 2020
# housing counts to the published block geometries.
# The codes below are quoted from the codebook downloaded with the IPUMS extract.

'''    Table 1:     Housing Units
    Universe:    Housing units
    Source code: H1
    NHGIS code:  U9V
        U9V001:      Total
 
    Table 2:     Occupancy Status
    Universe:    Housing units
    Source code: H3
    NHGIS code:  U9X
        U9X001:      Total
        U9X002:      Occupied
        U9X003:      Vacant
'''

# Fetch the published layer of Puerto Rico 2020 blocks (anonymous connection)
# and convert it to a spatially enabled DataFrame for the analysis.
gis = GIS()
pr_block_item = gis.content.get("1322372408e744b7a384cda121be8814")
layer = pr_block_item.layers[0]
sedf = pd.DataFrame.spatial.from_layer(layer)

# NHGIS code -> readable column name
rename = dict(
    U9V001="Total_Housing",
    U9X001="Total_Housing_Units",
    U9X002="Occupied",
    U9X003="Vacant",
)

# NOTE(review): the codes above are the ones shown in the 2020 table preview;
# confirm the 2010 table uses the same codes — rename ignores missing keys, so
# it would otherwise leave df_2010 unchanged without any warning.
df_2010 = df_2010.rename(columns=rename)
df_2020 = df_2020.rename(columns=rename)

# Join the renamed housing columns onto the geometries by GISJOIN; the inner
# join keeps only blocks present in both frames.
housing_columns = ["GISJOIN", *rename.values()]
pr_sedf = sedf.merge(df_2020[housing_columns], on="GISJOIN", how="inner")
pr_sedf.head()

Unnamed: 0,FID,GISJOIN,STATEFP20,COUNTYFP20,TRACTCE20,BLOCKCE20,GEOID20,NAME20,MTFCC20,UR20,...,INTPTLON20,Shape_Leng,Shape_Area,Shape__Area,Shape__Length,SHAPE,Total_Housing,Total_Housing_Units,Occupied,Vacant
0,565,G72000109568002006,72,1,956800,2006,720019568002006,Block 2006,G5040,,...,-66.6900772,135.247925,563.07519,626.948242,145.165955,"{""rings"": [[[-7423910.24981173, 2054668.560003...",0,0,0,0
1,1117,G72000504001005010,72,5,400100,5010,720054001005010,Block 5010,G5040,,...,-67.105963,141.549201,652.547341,729.442383,148.74611,"{""rings"": [[[-7470173.97718421, 2095228.498668...",0,0,0,0
2,1517,G72000304305012005,72,3,430501,2005,720034305012005,Block 2005,G5040,,...,-67.1840525,89.912674,363.460529,405.625977,94.094232,"{""rings"": [[[-7478881.94680194, 2078500.902794...",0,0,0,0
3,1621,G72001108104002000,72,11,810400,2000,720118104002000,Block 2000,G5040,,...,-67.1368104,82.521448,420.691524,469.150391,87.039985,"{""rings"": [[[-7473620.09539111, 2070934.047226...",1,1,0,1
4,1841,G72001303001022012,72,13,300102,2012,720133001022012,Block 2012,G5040,,...,-66.6293854,137.197785,662.782054,740.449219,141.883715,"{""rings"": [[[-7417117.31034027, 2089267.526464...",0,0,0,0


In [11]:
# Search for the Low Elevation Coastal Zones imagery layer and show every hit
lecz_layer = gis.content.search("Low Elevation Coastal Zones derived from MERIT-DEM", item_type="Imagery Layer")
for item in lecz_layer:
    display(item)

# The second hit is the one used for the map.
# NOTE(review): picking by position depends on the search result order —
# presumably chosen from the items displayed above; confirm it is still correct.
if len(lecz_layer) < 2:
    raise LookupError(
        f"Expected at least 2 LECZ imagery layers in the search results, got {len(lecz_layer)}."
    )
lecz = lecz_layer[1]

In [60]:
# Ask smart mapping for a colour class-breaks renderer on the "Vacant" field
# of layer 0 of map `m1`.
# NOTE(review): `m1` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel (Restart & Run All). Define and
# display the map it refers to before this cell, or remove the cell.
renderer_manager = m1.content.renderer(0)
smart_mapping_manager = renderer_manager.smart_mapping()
smart_mapping_manager.class_breaks_renderer(
    break_type="color",
    field="Vacant",
)

In [None]:
# Build the class-breaks renderer used to symbolise the block polygons.

def _solid_fill(rgb):
    """Return a solid esriSFS fill symbol in colour `rgb` with a thin grey outline."""
    return SimpleFillSymbolEsriSFS(
        **{
            "type": "esriSFS",
            "style": "esriSFSSolid",
            "color": rgb,
            "outline": {"color": [153, 153, 153, 255], "width": 0.5},
        }
    )


# One row per class: (upper bound, legend label, fill colour as RGB)
_class_specs = [
    (5, "< 5units", [242, 164, 61]),
    (100, "< 100units", [255, 251, 0]),
    (1692, "< 1692 units", [252, 89, 61]),
]

class_break_infos = [
    ClassBreakInfo(class_max_value=upper, label=label, symbol=_solid_fill(rgb))
    for upper, label, rgb in _class_specs
]

# Classify on "Vacant": the break values are vacancy counts — the top break
# (1692) is exactly pr_sedf["Vacant"].max(). This cell previously classified on
# "Total_Housing", whose values are not bounded by these vacancy-based breaks.
class_break_renderer = ClassBreaksRenderer(
    field="Vacant",
    class_break_infos=class_break_infos,
    normalizationType= None,
)

In [13]:
# Create the Puerto Rico map, add the block polygons drawn with the
# class-breaks renderer, then add the LECZ imagery at 50% opacity.
m2 = Map("Puerto Rico")

block_drawing_info = {"renderer": class_break_renderer.dict()}
m2.content.add(pr_sedf, drawing_info=block_drawing_info)

lecz_options = {"opacity": 0.5}
m2.content.add(lecz, options=lecz_options)

In [15]:
# Turn on the legend for map m2
m2.legend.enabled = True

In [14]:
# Display the map widget
m2

Map(center=[2063620.7690671808, -7399290.049500733], extent={'xmin': -7498920.993760712, 'ymin': 1958993.94058…

In [25]:
# Largest vacant-unit count in any block; this value (1692) is the upper bound
# of the top class break in the class-breaks renderer.
pr_sedf["Vacant"].max()

1692