In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
from rasterio import features as rf
%matplotlib inline

# input: single-band GeoTIFF opened with rasterio below
ztrax_data_filepath = 'data/ZTRAX-vintage/FBUY.tif'
# input: census tracts shapefile, read with geopandas below
tracts_shapefile_path = 'data/tracts_shapefile'
# output: CSV of per-tract year statistics written by the final cell
output_filepath = 'data/ztrax_years.csv'

In [2]:
%%time
# Open the raster and report its band count, width, height, and bounds.
# The dataset handle is intentionally left open: the following cell reads the
# CRS, transform, and band data from `tif_file`.
# NOTE(review): nothing shown in this notebook ever closes the handle.
tif_file = rasterio.open(ztrax_data_filepath)

# list that a later cell fills with the vectorized raster features
results = []

raster_dims = (tif_file.count, tif_file.width, tif_file.height)
print(*raster_dims)
print(tif_file.bounds)

1 18459 11615
BoundingBox(left=-2356398.7592963483, bottom=269249.2873629322, right=2258351.2407036517, top=3172999.287362932)
Wall time: 40 ms


In [3]:
# Capture the georeferencing needed to vectorize the raster later on: the
# coordinate reference system and the pixel-to-coordinate transform.
tif_crs = tif_file.crs
affine = tif_file.meta.get('transform')

# Read band 1 as explicit 32-bit integers. A plain `int` cast has a
# platform-dependent width (32-bit on some platforms, 64-bit on others), and
# this array is later passed to rasterio.features.shapes, which only accepts a
# limited set of dtypes (int32 among them). The values seen in this raster
# (0, 1, and years up to 2015) fit comfortably in int32.
band = tif_file.read(1).astype(np.int32)

In [4]:
# `df` is kept because the next cell derives the year mask from it.
df = pd.DataFrame(band)

# Tabulate how many raster cells hold each distinct value, ordered by value.
# Counting on the flattened array avoids DataFrame.stack(), which would first
# build a (row, column) MultiIndex entry for every cell of the raster.
pd.Series(band.ravel()).value_counts().sort_index()

0       201820910
1         2614442
1810        22134
1815         4240
1820        11283
1825         6400
1830        14694
1835         6934
1840        19710
1845         6591
1850        48652
1855         9478
1860        35420
1865        15854
1870        40911
1875        28480
1880        82445
1885        29181
1890       105996
1895        34845
1900       379354
1905        92383
1910       160151
1915        96940
1920       259127
1925       144943
1930       261376
1935       130569
1940       314064
1945       177341
1950       446286
1955       353382
1960       487316
1965       446839
1970       545807
1975       674462
1980       770543
1985       552698
1990       630729
1995       608398
2000       735502
2005       628228
2010       393989
2015       152258
dtype: int64

In [5]:
# Boolean array selecting cells whose value is a year. The value counts above
# show the only values at or below 1800 are 0 and 1 (presumably no-data or
# placeholder codes - confirm against the raster's documentation).
# Comparing the ndarray directly gives the same array as going through `df`.
mask = band > 1800

In [6]:
%%time
# rf.shapes yields its results lazily, so this call does not vectorize anything
# yet (hence the millisecond wall time); shapes are produced on iteration.
# NOTE(review): `shapes` is not consumed by any later cell shown here - the
# next cell calls rf.shapes again - so this cell looks like a leftover.
shapes = rf.shapes(band, mask=mask, transform=affine)

Wall time: 1.03 ms


In [7]:
%%time
# Vectorize the masked raster into GeoJSON-like feature dicts, each carrying
# the raster value of its region as the `date` property.
# The list is rebuilt from scratch here (instead of appending to a list created
# in an earlier cell) so that re-running this cell cannot duplicate features.
results = [
    {'properties': {'date': date}, 'geometry': geometry}
    for geometry, date in rf.shapes(band, mask=mask, transform=affine)
]
len(results)

Wall time: 1min 30s


7751549

In [8]:
%%time
# Build a GeoDataFrame from the vectorized features, tagging it with the
# raster's CRS at construction time, then report its dimensions.
gdf_tif = gpd.GeoDataFrame.from_features(results, crs=tif_crs)
print(gdf_tif.shape)

(7751549, 2)
Wall time: 55.8 s


In [9]:
# make sure the year column has an integer dtype
gdf_tif['date'] = gdf_tif['date'].astype(int)

# Preview a few rows. A fixed random_state makes the preview reproducible
# across runs, matching the seeded sample taken later in this notebook.
gdf_tif.sample(5, random_state=0)

Unnamed: 0,geometry,date
7519793,"POLYGON ((70351.24070365168 734249.2873629322,...",1960
4610879,"POLYGON ((1676851.240703652 1563499.287362932,...",1960
1410542,POLYGON ((-57148.75929634832 2282749.287362932...,1890
3819832,"POLYGON ((1307851.240703652 1771499.287362932,...",1920
4837536,"POLYGON ((1057601.240703652 1519249.287362932,...",1935


In [10]:
%%time
# Load the census tracts shapefile into a GeoDataFrame and show its
# (rows, columns) shape.
tracts = gpd.read_file(tracts_shapefile_path)
tracts.shape

Wall time: 8.22 s


(74133, 13)

In [11]:
%%time
# Reproject the tracts into the raster's CRS when the two differ, printing
# whether the CRSs match both before and after. The guard makes the cell safe
# to re-run: once the CRSs agree, no further reprojection happens.
crs_match = tracts.crs == gdf_tif.crs
print(crs_match)
if not crs_match:
    tracts = tracts.to_crs(gdf_tif.crs)
print(tracts.crs == gdf_tif.crs)

False
True
Wall time: 17.7 s


In [12]:
# Keep just the tract identifier and geometry so the spatial join does not
# carry along the tracts' other attribute columns.
join_columns = ['GEOID', 'geometry']
gdf_tracts = tracts[join_columns]

# report the size of both sides of the upcoming join
for frame in (gdf_tracts, gdf_tif):
    print(frame.shape)

(74133, 2)
(7751549, 2)


In [13]:
%%time
# Spatially join each raster-derived polygon to every tract it intersects,
# attaching the tract GEOID. how='left' keeps polygons that match no tract.
# The join predicate is left at its default, 'intersects': the keyword that
# selects it was renamed from `op` to `predicate` across geopandas releases,
# so omitting it keeps this call working on both older and newer versions.
gdf = gpd.sjoin(gdf_tif, gdf_tracts, how='left')
print(gdf.shape)

(8970239, 4)
Wall time: 18min 46s


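Some grid-cell polygons overlap with multiple census tracts, and most census tracts overlap with multiple grid-cell polygons. (The raster bounds and dimensions printed above imply cells of 250 × 250 CRS units — presumably metres — rather than 1 sq km.) Group by census tract, then calculate summary statistics of the year (minimum, median, mean, and standard deviation) across the polygons each tract intersects.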

In [14]:
# Group the joined polygons' years by tract, then summarize how many polygons
# each tract intersects (the median tract intersects 44).
groups = gdf.groupby('GEOID')['date']
polygons_per_tract = groups.count()
polygons_per_tract.describe().round(2)

count    68758.00
mean       130.46
std        201.05
min          1.00
25%         20.00
50%         44.00
75%        135.00
max       1906.00
Name: date, dtype: float64

In [15]:
# Per-tract summary statistics of the year across the polygons each tract
# intersects: minimum, median, mean, and standard deviation. The standard
# deviation comes out as NaN for a tract with only one polygon.
tract_year_min, tract_year_median, tract_year_mean, tract_year_std = (
    groups.agg(stat) for stat in ('min', 'median', 'mean', 'std')
)

In [16]:
# Pair each output column name with its per-tract statistic, then assemble one
# table with a row per tract: the Series are stacked as rows labelled by
# column name and the result is transposed.
stats_by_column = {
    'year_min': tract_year_min,
    'year_median': tract_year_median,
    'year_mean': tract_year_mean,
    'year_std': tract_year_std,
}
year_cols = list(stats_by_column.keys())
data = list(stats_by_column.values())
tract_year = pd.DataFrame(data, index=year_cols).T

In [17]:
# year_mean_std is the mean year minus one standard deviation
mean_minus_std = tract_year['year_mean'].sub(tract_year['year_std'])
tract_year['year_mean_std'] = mean_minus_std
tract_year.describe()

Unnamed: 0,year_min,year_median,year_mean,year_std,year_mean_std
count,68758.0,68758.0,68758.0,68330.0,68330.0
mean,1883.123346,1950.962506,1948.955848,26.642609,1922.285771
std,44.983157,31.340781,29.136767,11.813116,33.192354
min,1810.0,1810.0,1810.0,0.0,1769.614178
25%,1850.0,1930.0,1930.9375,18.177004,1900.336879
50%,1890.0,1957.5,1953.75,25.597764,1924.554178
75%,1910.0,1975.0,1970.899943,33.811506,1946.335204
max,2015.0,2015.0,2015.0,137.885822,2010.446582


In [18]:
# Preview ten tracts with every statistic truncated to a whole year.
# - random_state=0 keeps the preview reproducible without reseeding NumPy's
#   global random generator.
# - The nullable Int64 dtype is used because year_std (and so year_mean_std)
#   is NaN for some tracts; a plain int cast raises if such a tract is sampled.
#   Truncating first keeps the same toward-zero rounding a plain int cast
#   applies.
np.trunc(tract_year.sample(10, random_state=0)).astype('Int64')

Unnamed: 0_level_0,year_min,year_median,year_mean,year_std,year_mean_std
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17031803604,1895,1967,1958,27,1931
6031001402,1900,1930,1933,25,1908
49049000801,1885,1945,1945,25,1920
45041000101,1870,1980,1978,26,1952
41071030702,1860,1940,1942,37,1904
6075031302,1905,1912,1918,15,1903
42071014000,1810,1965,1942,52,1890
6109005100,1855,1975,1973,26,1946
29183311801,1840,1975,1971,30,1940
47011010200,1900,1955,1955,26,1928


## Merge years with tract indicators then save to disk

In [19]:
# Wrap the per-tract statistics in the DataFrame that gets saved below and
# preview its first rows.
# NOTE(review): the section heading mentions merging with tract indicators,
# but no other data is joined in the cells shown here - confirm whether a
# merge step is missing or the heading is stale.
ztrax_years = pd.DataFrame(tract_year)
ztrax_years.head()

Unnamed: 0_level_0,year_min,year_median,year_mean,year_std,year_mean_std
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001020100,1825.0,1955.0,1949.056604,37.576039,1911.480565
1001020200,1830.0,1927.5,1923.362069,41.795114,1881.566955
1001020300,1865.0,1955.0,1951.858974,31.355823,1920.503151
1001020400,1870.0,1965.0,1965.703125,23.022887,1942.680238
1001020500,1890.0,1990.0,1984.791667,21.484313,1963.307354


In [20]:
# Write the per-tract year statistics to CSV as UTF-8; index=True keeps the
# GEOID index as the first column of the file.
ztrax_years.to_csv(output_filepath, index=True, encoding='utf-8')