# Hudson Valley Gentrification and Changes in Farming

## Notebook 2: USDA Agriculture Census data

In [59]:
import pandas as pd
import requests
import os
import geopandas as gpd

In [2]:
# NASS Quick Stats API key, read from the environment (None when the variable is unset).
api_key_nass = os.environ.get("QUICK_STATS_API_KEY")

In [60]:
# Load the ZCTA polygons stored in data/zctas_core.geojson into a GeoDataFrame.
gdf_zipcodes = gpd.read_file("data/zctas_core.geojson")

In [61]:
# Preview the first row to see which columns the file provides.
gdf_zipcodes.head(1)

Unnamed: 0,ZCTA5CE20,GEOID20,CLASSFP20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,geometry
0,12577,12577,B5,G6350,S,19999901,169531,41.4214832,-74.1168149,"POLYGON ((288257.82 142661.193, 288349.37 1426..."


In [62]:
# ZCTA codes as a NumPy array; used further down to filter the zip-code-level census rows.
zipcodes = gdf_zipcodes["ZCTA5CE20"].to_numpy()

In [63]:
# Display the ZCTA codes that were extracted.
zipcodes

array(['12577', '10953', '10930', '12158', '10990', '12054', '10950',
       '12143', '12529', '12586', '12589', '10980', '10509', '12545',
       '12543', '12582', '12729', '10933', '12518', '06757', '06068',
       '06784', '06069', '06039', '12123', '12169', '12427', '12015',
       '12423', '12444', '01254', '01237', '01258', '18458', '18405',
       '18461', '18417', '18415', '18435', '18336', '18340', '01267',
       '01230', '01201', '01266', '07422', '07827', '07456', '07418',
       '07462', '07461', '07421', '12590', '12062', '12168', '12018',
       '12468', '12482', '12042', '12569', '12424', '12473', '12553',
       '10969', '12460', '10975', '12538', '12138', '12153', '12061',
       '12603', '12413', '12051', '12480', '12439', '12485', '12463',
       '12581', '12734', '12448', '12787', '12428', '12726', '12762',
       '12751', '12421', '12513', '12173', '12195', '12498', '12490',
       '12457', '12477', '12791', '12741', '12776', '12733', '12481',
       '12494', '127

## Download Economics and Demographics sectors from Agriculture Census and identify relevant variables

In [None]:
# Quick Stats query settings shared by the cells below.
source_desc = "CENSUS"
sector_desc = [
    "Economics",
    "Demographics",
]
group_desc = [
    'EXPENSES',
    'FARMS & LAND & ASSETS',
    'INCOME',
    'PRODUCERS',
]
# Ordered columns that, taken together within one sector, identify a single variable.
group_cols = [
    "group_desc",
    "commodity_desc",
    "class_desc",
    "prodn_practice_desc",
    "statisticcat_desc",
    "unit_desc",
    "domain_desc",
    "domaincat_desc",
]

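### Check whether there are fewer than 50,000 results at each specified year/level/sector

The Quick Stats API will not return a query that matches more than 50,000 records, so each year/level/sector combination is counted first to confirm it can be downloaded in a single request.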

In [4]:
# Ask the API how many records each (year, aggregation level, sector) query would return
# for state FIPS code 36, collecting (year, level, sector, count) tuples.
num_results = []
for year in ["2017", "2022"]:
    for level in ["ZIP CODE", 'COUNTY']:
        for sector in sector_desc:
            response = requests.get(
                url='https://quickstats.nass.usda.gov/api/get_counts/',
                # The key travels with the other query parameters instead of being
                # formatted into the URL string.
                params={"key": api_key_nass,
                        "sector_desc": sector,
                        "year": year,
                        "state_fips_code": "36",
                        "agg_level_desc": level})
            # Fail here with the HTTP error rather than later with a KeyError on 'count'.
            response.raise_for_status()
            r = response.json()
            num_results.append((year, level, sector, r['count']))

In [5]:
# Display the (year, level, sector, count) tuples collected above.
num_results

[('2017', 'ZIP CODE', 'Economics', 24414),
 ('2017', 'ZIP CODE', 'Demographics', 19578),
 ('2017', 'COUNTY', 'Economics', 24958),
 ('2017', 'COUNTY', 'Demographics', 17316),
 ('2022', 'ZIP CODE', 'Economics', 23609),
 ('2022', 'ZIP CODE', 'Demographics', 18113),
 ('2022', 'COUNTY', 'Economics', 24697),
 ('2022', 'COUNTY', 'Demographics', 13755)]

### Perform API calls for specified year/aggregation/sector; store results and transform into dataframes

In [16]:
# Download every (year, aggregation level, sector) combination for state FIPS code 36.
# The parsed JSON is stored as ag_census_results[year][level][sector].
ag_census_results = {}
for year in ["2017", "2022"]:
    ag_census_results[year] = {}
    for level in ["zip code", 'county']:
        ag_census_results[year][level] = {}
        for sector in sector_desc:
            response = requests.get(
                url="https://quickstats.nass.usda.gov/api/api_GET/",
                # The key travels with the other query parameters instead of being
                # formatted into the URL string.
                params={"key": api_key_nass,
                        "sector_desc": sector,
                        "year": year,
                        "state_fips_code": "36",
                        "agg_level_desc": level})
            # Stop on an HTTP error instead of storing an error payload that only
            # breaks later when its 'data' key is read.
            response.raise_for_status()
            r = response.json()
            ag_census_results[year][level][sector] = r

In [17]:
# Attach a DataFrame, built from each response's 'data' records, under the 'df' key.
for year in ("2017", "2022"):
    for level in ("zip code", "county"):
        for sector in sector_desc:
            payload = ag_census_results[year][level][sector]
            payload['df'] = pd.DataFrame.from_dict(payload['data'])

In [18]:
# List the distinct group_desc values in the 2017 county-level Economics data.
ag_census_results['2017']['county']['Economics']['df'].group_desc.unique()

array(['ENERGY', 'EXPENSES', 'FARMS & LAND & ASSETS', 'INCOME'],
      dtype=object)

In [24]:
# Distinct group_cols combinations in the 2017 county-level Economics data.
# Computed once up front; the groupby does not change between loop iterations.
econ2017_county_vars = (
    ag_census_results['2017']['county']['Economics']['df']
    .groupby(group_cols)
    .size()
    .reset_index()
    .drop(0, axis=1)  # drop the group-size column, keeping only the key columns
)
# For each identifying column, print its name followed by the values it takes.
for col in econ2017_county_vars.columns:
    print(col)
    print(econ2017_county_vars[col].unique())

group_desc
['ENERGY' 'EXPENSES' 'FARMS & LAND & ASSETS' 'INCOME']
commodity_desc
['ENERGY' 'AG SERVICES' 'ANIMAL TOTALS' 'CHEMICAL TOTALS' 'DEPRECIATION'
 'EXPENSE TOTALS' 'FEED' 'FERTILIZER TOTALS' 'FUELS' 'INTEREST' 'LABOR'
 'RENT' 'SEEDS' 'SEEDS & PLANTS TOTALS' 'SUPPLIES & REPAIRS' 'TAXES'
 'AG LAND' 'FARM OPERATIONS' 'GOVT PROGRAMS' 'LAND AREA'
 'MACHINERY TOTALS' 'MACHINERY, OTHER' 'PRACTICES' 'SELF PROPELLED'
 'TRACTORS' 'TRUCKS' 'CCC LOANS' 'COMMODITY TOTALS' 'INCOME, FARM-RELATED'
 'INCOME, NET CASH FARM']
class_desc
['RENEWABLE, HARVEST BIOMASS FOR PRODUCTION'
 'CUSTOM SERVICES FOR LIVESTOCK, INCL MEDICAL SUPPLIES & VETERINARY'
 'CUSTOMWORK' 'MACHINERY RENTAL' 'OTHER' 'UTILITIES' '(EXCL BREEDING)'
 'ALL CLASSES' 'BREEDING' 'OPERATING' 'OPERATING, PAID BY LANDLORD'
 'INCL LIME & SOIL CONDITIONERS' 'INCL LUBRICANTS' 'NON-REAL ESTATE'
 'REAL ESTATE' 'CONTRACT' 'HIRED' 'HIRED, GE 150 DAYS'
 'HIRED, LT 150 DAYS' 'MIGRANT' 'UNPAID' 'CASH, CROPLAND'
 'CASH, LAND & BUILDINGS' 'CASH, 

In [26]:
# Distinct group_cols combinations in the 2017 county-level Demographics data.
# Computed once up front; the groupby does not change between loop iterations.
demog2017_county_vars = (
    ag_census_results['2017']['county']['Demographics']['df']
    .groupby(group_cols)
    .size()
    .reset_index()
    .drop(0, axis=1)  # drop the group-size column, keeping only the key columns
)
# For each identifying column, print its name followed by the values it takes.
for col in demog2017_county_vars.columns:
    print(col)
    print(demog2017_county_vars[col].unique())

group_desc
['ANIMAL TOTALS' 'AQUACULTURE' 'CROP TOTALS' 'DAIRY' 'EXPENSES'
 'FARMS & LAND & ASSETS' 'FIELD CROPS' 'FRUIT & TREE NUTS' 'HORTICULTURE'
 'INCOME' 'LIVESTOCK' 'POULTRY' 'PRODUCERS' 'SPECIALTY' 'VEGETABLES']
commodity_desc
['ANIMAL TOTALS' 'AQUACULTURE TOTALS' 'CROP TOTALS' 'MILK'
 'EXPENSE TOTALS' 'LABOR' 'AG LAND' 'FARM OPERATIONS' 'INTERNET'
 'PRACTICES' 'BARLEY' 'CORN' 'FIELD CROPS, OTHER' 'GRAIN' 'HAY & HAYLAGE'
 'OATS' 'SOYBEANS' 'WHEAT' 'APPLES' 'FRUIT & TREE NUT TOTALS' 'GRAPES'
 'PEACHES' 'CUT CHRISTMAS TREES'
 'CUT CHRISTMAS TREES & SHORT TERM WOODY TREES' 'HORTICULTURE TOTALS'
 'NURSERY TOTALS' 'SOD' 'COMMODITY TOTALS' 'GOVT PROGRAMS'
 'INCOME, FARM-RELATED' 'INCOME, NET CASH FARM' 'CATTLE' 'GOATS' 'HOGS'
 'SHEEP' 'SHEEP & GOATS TOTALS' 'CHICKENS' 'POULTRY TOTALS' 'TURKEYS'
 'PRODUCERS' 'PRODUCERS, PRINCIPAL' 'EQUINE' 'SPECIALTY ANIMAL TOTALS'
 'POTATOES' 'SQUASH' 'SWEET CORN' 'VEGETABLE TOTALS']
class_desc
['INCL PRODUCTS' 'ALL CLASSES' 'OPERATING' 'HIRED'
 '(EXC

In [None]:
# save raw data to CSVs (one file per year / aggregation level / sector)
for year in ["2017", "2022"]:
    for level in ["zip code", 'county']:
        # "zip code" -> "zipcode". Built outside the f-string: reusing double quotes
        # inside an f-string expression is a SyntaxError before Python 3.12.
        level_slug = "".join(level.split(" "))
        for sector in sector_desc:
            raw_df = ag_census_results[year][level][sector]['df']
            raw_df.to_csv(f"data/ag_census_{level_slug}_{year}_{sector}.csv", index=False, encoding='utf-8')

In [None]:
# save variables to CSVs: the distinct group_cols combinations found in each raw frame
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs("data/ag_census_vars", exist_ok=True)
for year in ["2017", "2022"]:
    for level in ["zip code", 'county']:
        # "zip code" -> "zipcode". Built outside the f-string: reusing double quotes
        # inside an f-string expression is a SyntaxError before Python 3.12.
        level_slug = "".join(level.split(" "))
        for sector in sector_desc:
            # Group on the identifying columns, then drop the size column so only the keys remain.
            df = ag_census_results[year][level][sector]['df'].groupby(group_cols).size().reset_index().drop(0, axis=1)
            df.to_csv(f"data/ag_census_vars/ag_census_{level_slug}_{year}_{sector}_variables.csv", index=False, encoding='utf-8')

## Select relevant variables and prepare data for analysis

In [None]:
# Pull each raw frame out of the nested results dict: one per sector, year and level.
results_2017 = ag_census_results['2017']
results_2022 = ag_census_results['2022']

# Economics
econ2017_zipcode_df = results_2017['zip code']['Economics']['df']
econ2022_zipcode_df = results_2022['zip code']['Economics']['df']
econ2017_county_df = results_2017['county']['Economics']['df']
econ2022_county_df = results_2022['county']['Economics']['df']

# Demographics
demog2017_zipcode_df = results_2017['zip code']['Demographics']['df']
demog2022_zipcode_df = results_2022['zip code']['Demographics']['df']
demog2017_county_df = results_2017['county']['Demographics']['df']
demog2022_county_df = results_2022['county']['Demographics']['df']

In [368]:
# Zip-code Economics variables kept for the study; the same filter is applied to both census years.
econ_zipcode_filter = "group_desc in ['FARMS & LAND & ASSETS','INCOME'] and commodity_desc in ['AG LAND','COMMODITY TOTALS'] and class_desc in ['CROPLAND, HARVESTED','ALL CLASSES'] and prodn_practice_desc != 'ORGANIC'"

econ2017_zipcode_df_subset = econ2017_zipcode_df.query(econ_zipcode_filter).copy()
econ2022_zipcode_df_subset = econ2022_zipcode_df.query(econ_zipcode_filter).copy()

In [369]:
# Zip-code Demographics variables kept for the study; the same filter is applied to both census years.
demog_zipcode_filter = "domain_desc == 'TENURE' or prodn_practice_desc == 'PRIMARY OCCUPATION, FARMING'"

demog2017_zipcode_df_subset = demog2017_zipcode_df.query(demog_zipcode_filter).copy()
demog2022_zipcode_df_subset = demog2022_zipcode_df.query(demog_zipcode_filter).copy()

In [370]:
# Restrict each zip-code subset to rows whose zip_5 appears in `zipcodes`.
core_zip_filter = "zip_5 in @zipcodes"

econ2017_zipcode_df_core = econ2017_zipcode_df_subset.query(core_zip_filter).copy()
econ2022_zipcode_df_core = econ2022_zipcode_df_subset.query(core_zip_filter).copy()
demog2017_zipcode_df_core = demog2017_zipcode_df_subset.query(core_zip_filter).copy()
demog2022_zipcode_df_core = demog2022_zipcode_df_subset.query(core_zip_filter).copy()

In [371]:
# County Economics variables kept for the study; the same filter is applied to both census years.
econ_county_filter = "(((group_desc == 'EXPENSES' or group_desc == 'FARMS & LAND & ASSETS') and (commodity_desc == 'LABOR' or commodity_desc == 'AG LAND' or commodity_desc == 'LAND AREA') and unit_desc != 'OPERATIONS' and (domain_desc == 'TOTAL' or domain_desc == 'LABOR') and (class_desc == 'HIRED' or class_desc == 'CROPLAND' or class_desc == 'INCL BUILDINGS' or class_desc == 'INCL NON-AG' or class_desc == 'CONTRACT') and unit_desc != '$ / OPERATION') or (group_desc == 'INCOME' and (commodity_desc == 'COMMODITY TOTALS' or commodity_desc == 'INCOME, NET CASH FARM') and unit_desc == '$' and domaincat_desc == 'NOT SPECIFIED')) and prodn_practice_desc == 'ALL PRODUCTION PRACTICES'"

econ2017_county_df_subset = econ2017_county_df.query(econ_county_filter).copy()
econ2022_county_df_subset = econ2022_county_df.query(econ_county_filter).copy()

In [372]:
# County Demographics variables kept for the study; the same filter is applied to both census years.
demog_county_filter = "(group_desc in ['FARMS & LAND & ASSETS','PRODUCERS'] and commodity_desc in ['AG LAND','FARM OPERATIONS','PRODUCERS'] and domain_desc in ['TENURE','TOTAL'] and unit_desc in ['PRODUCERS','ACRES','YEARS'] and class_desc in ['ALL CLASSES','(ALL)','CROPLAND, HARVESTED','AGE 25 TO 34','AGE 35 TO 44','AGE 45 TO 54','AGE 55 TO 64','AGE 65 TO 74','AGE GE 75','AGE LT 25'] and prodn_practice_desc in ['OWNED, IN FARMS','RENTED FROM OTHERS, IN FARMS','ALL PRODUCTION PRACTICES','PRIMARY OCCUPATION, FARMING','YEARS ON ANY OPERATION, 6 TO 10 YEARS','YEARS ON ANY OPERATION, GE 11 YEARS','YEARS ON ANY OPERATION, LT 11 YEARS','YEARS ON ANY OPERATION, LT 6 YEARS','YEARS ON PRESENT OPERATION, 3 TO 4 YEARS','YEARS ON PRESENT OPERATION, 5 TO 9 YEARS','YEARS ON PRESENT OPERATION, GE 10 YEARS','YEARS ON PRESENT OPERATION, LT 3 YEARS'])"

demog2017_county_df_subset = demog2017_county_df.query(demog_county_filter).copy()
demog2022_county_df_subset = demog2022_county_df.query(demog_county_filter).copy()

In [376]:
# Show the distinct PRODUCERS-group variables that survived the 2017 county Demographics filter.
demog2017_county_df_subset[group_cols].query("group_desc == 'PRODUCERS'").drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
5505,PRODUCERS,PRODUCERS,(ALL),ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5688,PRODUCERS,PRODUCERS,AGE 25 TO 34,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5745,PRODUCERS,PRODUCERS,AGE 35 TO 44,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5806,PRODUCERS,PRODUCERS,AGE 45 TO 54,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5865,PRODUCERS,PRODUCERS,AGE 55 TO 64,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5925,PRODUCERS,PRODUCERS,AGE 65 TO 74,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5984,PRODUCERS,PRODUCERS,AGE GE 75,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6218,PRODUCERS,PRODUCERS,AGE LT 25,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6270,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED
7147,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"YEARS ON ANY OPERATION, AVG",YEARS,TOTAL,NOT SPECIFIED


In [375]:
# Show the distinct PRODUCERS-group variables that survived the 2022 county Demographics filter.
demog2022_county_df_subset[group_cols].query("group_desc == 'PRODUCERS'").drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
5701,PRODUCERS,PRODUCERS,(ALL),ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5887,PRODUCERS,PRODUCERS,AGE 25 TO 34,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5945,PRODUCERS,PRODUCERS,AGE 35 TO 44,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6007,PRODUCERS,PRODUCERS,AGE 45 TO 54,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6066,PRODUCERS,PRODUCERS,AGE 55 TO 64,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6126,PRODUCERS,PRODUCERS,AGE 65 TO 74,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6188,PRODUCERS,PRODUCERS,AGE GE 75,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6247,PRODUCERS,PRODUCERS,AGE LT 25,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
6480,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED
7359,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED


In [None]:
# Stack the Economics and Demographics subsets into one frame per level and year,
# renumbering the rows from 0 in each combined frame.
county2017_df = pd.concat([econ2017_county_df_subset, demog2017_county_df_subset], ignore_index=True)
county2022_df = pd.concat([econ2022_county_df_subset, demog2022_county_df_subset], ignore_index=True)
zipcode2017_df = pd.concat([econ2017_zipcode_df_core, demog2017_zipcode_df_core], ignore_index=True)
zipcode2022_df = pd.concat([econ2022_zipcode_df_core, demog2022_zipcode_df_core], ignore_index=True)

Cast 'Value' and 'CV (%)' columns as numbers, and replace text placeholders with approximate values.

From the [NASS QuickStats glossary](https://quickstats.nass.usda.gov/src/glossary.pdf):
* CV (%) = Coefficient of variation = Ratio of the standard error to the estimate, expressed as a percent.
* (D) = Withheld to avoid disclosing data for individual operations.
* (H) = Coefficient of variation or generalized coefficient of variation is greater than or equal to 99.95 percent or the standard error is greater than or equal to 99.95 percent of the mean.
* (L) = Coefficient of variation or generalized coefficient of variation is less than 0.05 percent or the standard error is less than 0.05 percent of the mean.

In [None]:
# Numeric stand-ins for the text codes that can appear in 'CV (%)'.
# '(D)' (withheld) is deliberately absent, so it is coerced to NaN below.
cv_placeholder_values = {'(H)': 100, '(L)': 0}

# Convert 'Value' and 'CV (%)' to numbers in place on each combined frame.
for df in [county2017_df, county2022_df, zipcode2017_df, zipcode2022_df]:
    # Remove thousands separators, then coerce; anything non-numeric becomes NaN.
    # astype(str) keeps this cell re-runnable once 'Value' already holds floats.
    value_text = df['Value'].astype(str).str.replace(',', '', regex=False)
    df['Value'] = pd.to_numeric(value_text, errors='coerce')

    # On a re-run 'CV (%)' has already been replaced by 'CV_pct', so skip this step.
    if 'CV (%)' in df.columns:
        cv_raw = df['CV (%)']
        cv_numeric = pd.to_numeric(cv_raw, errors='coerce')
        # Fill the entries that failed to parse with their placeholder value, if they have one.
        df['CV_pct'] = cv_numeric.fillna(cv_raw.map(cv_placeholder_values))
        df.drop('CV (%)', axis=1, inplace=True)

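2022 Economics has 'LABOR: (1 TO 4 HIRED WORKERS)' instead of three separate groups ('LABOR: (1 HIRED WORKERS)', 'LABOR: (2 HIRED WORKERS)', 'LABOR: (3 TO 4 HIRED WORKERS)') in `domaincat_desc`. To make the 2017 data comparable, sum `Value` across those three groups within each county and average their coefficient of variation (stored in `CV_pct` after the previous step). Note that the mean of the three CVs is only an approximation of the CV of the combined estimate.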

In [None]:
# The three 2017 hired-worker brackets that 2022 publishes as one bracket.
small_crew_brackets = ['LABOR: (1 HIRED WORKERS)', 'LABOR: (2 HIRED WORKERS)', 'LABOR: (3 TO 4 HIRED WORKERS)']
is_small_crew = county2017_df['domaincat_desc'].isin(small_crew_brackets)
small_crew_by_county = county2017_df[is_small_crew].groupby('county_code')

# Per-county total and mean CV across the three brackets. Selecting the column before
# aggregating avoids summing or averaging every other column in the frame.
# NOTE(review): grouping by county_code alone assumes these brackets carry a single
# statistic per county — confirm.
# NOTE(review): a county whose three values are all NaN (withheld) sums to 0 — confirm
# that is acceptable.
workers2017 = small_crew_by_county['Value'].sum()
workers2017_cv = small_crew_by_county['CV_pct'].mean()

# Write the combined figures onto all three bracket rows and relabel them to the 2022
# bracket name; the rows then become duplicates of one another.
county_of_row = county2017_df.loc[is_small_crew, 'county_code']
county2017_df.loc[is_small_crew, 'Value'] = county_of_row.map(workers2017)
county2017_df.loc[is_small_crew, 'CV_pct'] = county_of_row.map(workers2017_cv)
county2017_df.loc[is_small_crew, 'domaincat_desc'] = 'LABOR: (1 TO 4 HIRED WORKERS)'


In [244]:
# Remove exact duplicate rows in place and report the row count before and after.
rows_before = len(county2017_df)
county2017_df.drop_duplicates(inplace=True)
rows_after = len(county2017_df)
print(rows_before)
print(rows_after)

3895
3784


Identify which variables are not repeated across years.

In [247]:
# Collect the four combined frames in one list.
dfs = [county2017_df,county2022_df,zipcode2017_df,zipcode2022_df]

In [None]:
# Set of distinct variable identifiers (tuples in group_cols order) per county-level year.
# drop_duplicates + itertuples replaces a per-row iterrows loop; a set ignores repeated
# additions anyway, so no membership test is needed.
county2017_vars = set(county2017_df[group_cols].drop_duplicates().itertuples(index=False, name=None))
county2022_vars = set(county2022_df[group_cols].drop_duplicates().itertuples(index=False, name=None))

In [313]:
# Number of county-level variables that appear in only one of the two years.
len(county2017_vars.symmetric_difference(county2022_vars))

22

In [None]:
# County-level variables present in 2022 but missing from 2017
county2022_vars - county2017_vars

{('PRODUCERS',
  'PRODUCERS',
  'ALL CLASSES',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED')}

In [None]:
# County-level variables present in 2017 but missing from 2022
county2017_vars - county2022_vars

{('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE 25 TO 34',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE 35 TO 44',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE 45 TO 54',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE 55 TO 64',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE 65 TO 74',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE GE 75',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED'),
 ('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'AGE LT 25',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  

In [298]:
# County-level variables available in both years.
county_vars = county2017_vars.intersection(county2022_vars)

In [311]:
# Count the county-level variables shared by both years.
len(county_vars)

43

In [303]:
# Set of distinct variable identifiers (tuples in group_cols order) per zip-code-level year.
# drop_duplicates + itertuples replaces a per-row iterrows loop; a set ignores repeated
# additions anyway, so no membership test is needed.
zipcode2017_vars = set(zipcode2017_df[group_cols].drop_duplicates().itertuples(index=False, name=None))
zipcode2022_vars = set(zipcode2022_df[group_cols].drop_duplicates().itertuples(index=False, name=None))

In [314]:
# Number of zip-code-level variables that appear in only one of the two years.
len(zipcode2017_vars.symmetric_difference(zipcode2022_vars))

0

In [308]:
# Zip-code-level variables available in both years.
zipcode_vars = zipcode2017_vars.intersection(zipcode2022_vars)

In [309]:
# Count the zip-code-level variables shared by both years.
len(zipcode_vars)

12

In [318]:
# Print the index label of every 2017 county row whose variable identifier equals this tuple.
target_var = (
    'PRODUCERS',
    'PRODUCERS',
    'ALL CLASSES',
    'ALL PRODUCTION PRACTICES',
    'PRODUCERS',
    'PRODUCERS',
    'TOTAL',
    'NOT SPECIFIED',
)
for row_label, record in county2017_df.iterrows():
    row_var = tuple(record[col] for col in group_cols)
    if row_var == target_var:
        print(row_label)

In [None]:
# Scratch note: a variable identifier in group_cols order, with commodity_desc 'PRODUCERS, PRINCIPAL'.
('PRODUCERS',
  'PRODUCERS, PRINCIPAL',
  'ALL CLASSES',
  'ALL PRODUCTION PRACTICES',
  'PRODUCERS',
  'PRODUCERS',
  'TOTAL',
  'NOT SPECIFIED')

In [325]:
# Display the identifying columns, in the order used to build the variable tuples.
group_cols

['group_desc',
 'commodity_desc',
 'class_desc',
 'prodn_practice_desc',
 'statisticcat_desc',
 'unit_desc',
 'domain_desc',
 'domaincat_desc']

In [320]:
# Distinct commodity_desc values in the combined 2022 county frame.
county2022_df.commodity_desc.unique()

array(['LABOR', 'AG LAND', 'LAND AREA', 'COMMODITY TOTALS',
       'INCOME, NET CASH FARM', 'FARM OPERATIONS', 'PRODUCERS'],
      dtype=object)

In [334]:
# Distinct commodity_desc values in the combined 2017 county frame.
county2017_df.commodity_desc.unique()

array(['LABOR', 'AG LAND', 'LAND AREA', 'COMMODITY TOTALS',
       'INCOME, NET CASH FARM', 'FARM OPERATIONS', 'PRODUCERS',
       'PRODUCERS, PRINCIPAL'], dtype=object)

In [333]:
# Distinct commodity_desc values containing "PRODUCERS" in the raw 2022 county Demographics data.
demog2022_county_df[demog2022_county_df.commodity_desc.str.contains("PRODUCERS")].commodity_desc.unique()

array(['PRODUCERS'], dtype=object)

In [324]:
# Count raw 2022 county Demographics rows whose commodity_desc contains "PRINCIPAL".
demog2022_county_df.commodity_desc.str.contains("PRINCIPAL").sum()

np.int64(0)

In [335]:
# Distinct 2022 county variables whose statisticcat_desc contains 'AGE, AVG'.
county2022_df[county2022_df.statisticcat_desc.str.contains('AGE, AVG')][group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
1779,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED


In [336]:
# Distinct 2017 county variables whose statisticcat_desc contains 'AGE, AVG'.
county2017_df[county2017_df.statisticcat_desc.str.contains('AGE, AVG')][group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
1879,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED
3057,PRODUCERS,"PRODUCERS, PRINCIPAL",ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED


In [337]:
# Distinct 2017 county variables whose commodity_desc contains 'PRODUCERS'
# (a substring match, so 'PRODUCERS, PRINCIPAL' rows are included as well).
county2017_df[county2017_df.commodity_desc.str.contains('PRODUCERS')][group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
1471,PRODUCERS,PRODUCERS,AGE 25 TO 34,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1528,PRODUCERS,PRODUCERS,AGE 35 TO 44,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1589,PRODUCERS,PRODUCERS,AGE 45 TO 54,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1648,PRODUCERS,PRODUCERS,AGE 55 TO 64,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1708,PRODUCERS,PRODUCERS,AGE 65 TO 74,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1767,PRODUCERS,PRODUCERS,AGE GE 75,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1827,PRODUCERS,PRODUCERS,AGE LT 25,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
1879,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"AGE, AVG",YEARS,TOTAL,NOT SPECIFIED
1940,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"YEARS ON ANY OPERATION, AVG",YEARS,TOTAL,NOT SPECIFIED
2001,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,"YEARS ON PRESENT OPERATION, AVG",YEARS,TOTAL,NOT SPECIFIED


In [339]:
# Number of distinct 2017 county variables with commodity_desc exactly 'PRODUCERS, PRINCIPAL'.
len(county2017_df.query("commodity_desc == 'PRODUCERS, PRINCIPAL'")[group_cols].drop_duplicates())

21

In [340]:
# Number of distinct 2017 county variables with commodity_desc exactly 'PRODUCERS'.
len(county2017_df.query("commodity_desc == 'PRODUCERS'")[group_cols].drop_duplicates())

20

In [None]:
# What 'PRODUCERS, PRINCIPAL' variable is in county2017 that isn't repeated in 'PRODUCERS'?
# Stack the distinct variables of both commodities, then drop every variable that occurs
# more than once when commodity_desc is ignored (keep=False removes all copies), leaving
# only the variables that exist under just one of the two commodities.
pd.concat([county2017_df.query("commodity_desc == 'PRODUCERS, PRINCIPAL'")[group_cols].drop_duplicates(),county2017_df.query("commodity_desc == 'PRODUCERS'")[group_cols].drop_duplicates()]).drop_duplicates(keep=False, subset=[col for col in group_cols if col != "commodity_desc"])

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
3118,PRODUCERS,"PRODUCERS, PRINCIPAL",ALL CLASSES,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED


In [352]:
# Distinct 2017 county variables giving the all-classes, all-practices PRODUCERS statistic.
county2017_df.query("group_desc == 'PRODUCERS' and statisticcat_desc == 'PRODUCERS' and class_desc == 'ALL CLASSES' and prodn_practice_desc == 'ALL PRODUCTION PRACTICES'")[group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
3118,PRODUCERS,"PRODUCERS, PRINCIPAL",ALL CLASSES,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED


In [347]:
# Distinct 2022 county variables giving the all-classes, all-practices PRODUCERS statistic.
county2022_df.query("group_desc == 'PRODUCERS' and statisticcat_desc == 'PRODUCERS' and class_desc == 'ALL CLASSES' and prodn_practice_desc == 'ALL PRODUCTION PRACTICES'")[group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
1841,PRODUCERS,PRODUCERS,ALL CLASSES,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED


In [356]:
# Distinct commodity_desc values containing "PRODUCER" in the raw 2017 county Demographics data.
demog2017_county_df[demog2017_county_df.commodity_desc.str.contains("PRODUCER")].commodity_desc.unique()

array(['PRODUCERS', 'PRODUCERS, PRINCIPAL'], dtype=object)

In [355]:
# Distinct commodity_desc values containing "PRODUCER" in the raw 2022 county Demographics data.
demog2022_county_df[demog2022_county_df.commodity_desc.str.contains("PRODUCER")].commodity_desc.unique()

array(['PRODUCERS'], dtype=object)

In [367]:
# Among 2017 "NUMBER OF PRODUCERS" rows, flag rows that repeat an earlier row on the listed
# columns (including Value), i.e. ignoring commodity_desc, class_desc and domaincat_desc.
# NOTE(review): `and` binds tighter than `or`, so the filter reads as
#   class_desc == '(ALL)' or (class_desc == 'ALL CLASSES' and prodn_practice_desc == ...)
# — confirm that this grouping is the intended one.
demog2017_county_df[demog2017_county_df.short_desc.str.contains("NUMBER OF PRODUCERS")].query("class_desc == '(ALL)' or class_desc == 'ALL CLASSES' and prodn_practice_desc == 'ALL PRODUCTION PRACTICES'")[group_cols + ['Value']].duplicated(subset=["group_desc", "prodn_practice_desc", "statisticcat_desc", "unit_desc", "domain_desc", "Value"])

5505     False
5506     False
5507     False
5508     False
5509     False
         ...  
13861    False
13862    False
13863    False
13864    False
13865    False
Length: 122, dtype: bool

In [362]:
# Distinct 2022 county Demographics variables whose short_desc contains "NUMBER OF PRODUCERS".
demog2022_county_df[demog2022_county_df.short_desc.str.contains("NUMBER OF PRODUCERS")][group_cols].drop_duplicates()

Unnamed: 0,group_desc,commodity_desc,class_desc,prodn_practice_desc,statisticcat_desc,unit_desc,domain_desc,domaincat_desc
5701,PRODUCERS,PRODUCERS,(ALL),ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5763,PRODUCERS,PRODUCERS,"(ALL), FEMALE",ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5825,PRODUCERS,PRODUCERS,"(ALL), MALE",ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5887,PRODUCERS,PRODUCERS,AGE 25 TO 34,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
5945,PRODUCERS,PRODUCERS,AGE 35 TO 44,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
...,...,...,...,...,...,...,...,...
13023,PRODUCERS,PRODUCERS,"MULTI-RACE, FEMALE",ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
13082,PRODUCERS,PRODUCERS,NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
13092,PRODUCERS,PRODUCERS,"NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER, FEMALE",ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
13219,PRODUCERS,PRODUCERS,WHITE,ALL PRODUCTION PRACTICES,PRODUCERS,PRODUCERS,TOTAL,NOT SPECIFIED
