# Aggregation

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import geopandas
import pysal

In [None]:
# Load the county employment CSV into a DataFrame.
df_county = pd.read_csv('ca_county_employment.csv')

In [None]:
# (rows, columns) of the full table.
df_county.shape

In [None]:
# Keep only the 2017 records. An explicit copy is taken because a 'region'
# column is added to this frame later on; without it the frame is a slice of
# df_county and that assignment triggers pandas' SettingWithCopyWarning.
df_county17 = df_county[df_county.year == 2017].copy()

In [None]:
# (rows, columns) remaining after the 2017 filter.
df_county17.shape

In [None]:
# Preview the first rows of the 2017 subset.
df_county17.head()

## Defining Regions
Let's work through an example in which we partition California into two regions. The first region (`ie`, the Inland Empire) is composed of Riverside and San Bernardino Counties, and the second region (`non_ie`) contains all other counties.

### Aggregation Profile

In [None]:
# The two named counties form the first region ...
ie = ["Riverside", "San Bernardino"]
# ... and every other distinct county in the 2017 data forms the second.
counties = pd.unique(df_county17.county)
non_ie = list(filter(lambda name: name not in ie, counties))


In [None]:
# Inspect the counties that are not in `ie`.
non_ie

In [None]:
# Position in this list is the region id: 0 -> ie, 1 -> non_ie.
regions = [ie, non_ie] # our list of region lists

In [None]:
# Rebind `counties` to the county column of the 2017 frame (one entry per row).
counties = df_county17.county

In [None]:
# Lookup table from county name to region id, where the id is the position
# of the county's list inside `regions`.
region_dict = {
    county: region_id
    for region_id, members in enumerate(regions)
    for county in members
}

In [None]:
# Inspect the county -> region id mapping.
region_dict

In [None]:
# Attach a 'region' column: look up every row's county in region_dict
# (a county missing from the mapping raises KeyError).
df_county17['region'] = list(map(region_dict.__getitem__, df_county17.county))

In [None]:
# Inspect the region id assigned to each row.
df_county17.region

In [None]:
# Rebuild the county -> region id mapping from `regions`.
region_dict = {}
for region_id, members in enumerate(regions):
    # Every county in this list gets the list's position as its id.
    region_dict.update(dict.fromkeys(members, region_id))

## Group By
We can use the dataframe's `groupby` method to create a GroupBy object that collects the rows belonging to each region.

In [None]:
# Split the 2017 rows into groups keyed by region id.
region_gb = df_county17.groupby(by='region')
# Show the leading rows of every group.
region_gb.head()

In [None]:
# Collapse each group into a single record, giving a dataframe with as many
# rows as there are regions. GroupBy.sum() always reduces over the rows of
# each group and has no `axis` parameter, so none is passed.
region_df = region_gb.sum()
region_df.head()

## Exercise 3 Part 4: EDD Regions

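We will repeat the aggregation using the official regional planning units published by the California Employment Development Department (EDD). The region definitions below were copied out of a PDF.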

In [None]:
# Regional planning unit definitions, taken as plain text from the first link at
# https://www.edd.ca.gov/jobs_and_training/pubs/wsd15-17.pdf
# Paragraphs are separated by blank lines; the "Counties Included" paragraphs
# list the counties that belong to each region.
edd_regions = """Regional Planning Units

    1. Coastal Region (4 Local Workforce Development Boards [Local Boards]): Monterey, San Luis Obispo, Santa Barbara, Santa Cruz

Counties Included (4): Monterey, Santa Cruz, Santa Barbara, San Luis Obispo 

Major City Populations in Region: Salinas, Santa Maria, Santa Barbara, Monterey, San Luis Obispo, Santa Cruz

    2. Middle Sierra (1 Local Board): Mother Lode

Counties Included (4): Amador, Calaveras, Mariposa, Tuolumne

Major City Populations in Region: Sonora, Angels City

    3. Humboldt (1 Local Board): Humboldt 

Counties Included (1): Humboldt 

Major City Populations in Region: Eureka

    4. North State (1 Local Board): NORTEC

Counties Included (11): Del Norte, Siskiyou, Modoc, Trinity, Shasta, Tehama, Butte, Nevada, Sierra, Plumas, Lassen

Major City Populations in Region: Redding, Chico, Paradise, Oroville, Truckee, Susanville

    5. Capital Region (4 Local Boards): Golden Sierra, North Central Counties Consortium, SETA, Yolo

Counties Included (9): Alpine, Sacramento, Yolo, Sutter, Colusa, Glenn, Yuba, Placer, El Dorado

Major City Populations in Region: Sacramento, Elk Grove, Roseville

    6. East Bay (4 Local Boards): Contra Costa County, Alameda, Richmond, Oakland

Counties Included (2): Contra Costa, Alameda

Major City Populations in Region: Oakland, Fremont, Concord, Berkeley, Richmond, Antioch

    7. North Bay (5 Local Boards): Marin, Napa-Lake, Sonoma, Solano, Mendocino

Counties Included (6): Marin, Napa, Lake, Sonoma, Solano, and Mendocino

Major City Populations in Region: Santa Rosa, Vallejo, Fairfield, San Rafael, Napa, Ukiah

    8. Bay-Peninsula (4 Local Boards): San Francisco, NOVA, San Jose, San Benito

Counties Included (4): San Francisco, San Mateo, Santa Clara, San Benito

Major City Populations in Region: San Jose, San Francisco, Sunnyvale, Santa Clara, Daly City, San Mateo, Palo Alto

    9. San Joaquin Valley and Associated Counties (8 Local Boards): Fresno, Kern-Inyo-Mono,  Kings, Madera, Merced, San Joaquin, Stanislaus, Tulare

Counties Included (10): Fresno, Kern, Inyo, Mono, Kings, Madera, Merced, San Joaquin, Stanislaus, Tulare

Major City Populations in Region: Fresno, Bakersfield, Stockton, Modesto, Visalia, Clovis, Merced	

    10. Southern Border (2 Local Boards): San Diego, Imperial

Counties Included (2): San Diego, Imperial

Major City Populations in Region: San Diego, Chula Vista, Oceanside, Escondido, Carlsbad, El Cajon

    11. Los Angeles Basin (7 Local Boards): Los Angeles City, Los Angeles County, Foothill, Southeast Los Angeles County, South Bay, Verdugo, Pacific Gateway 

Counties Included (1): Los Angeles 

Major City Populations in Region: Los Angeles, Long Beach, Santa Clarita, Glendale, Lancaster, Palmdale, Pomona, Torrance, Pasadena, El Monte, Downey, Inglewood, West Covina, Norwalk, Burbank, Carson, Compton, Santa Monica

    12. Orange (3 Local Boards): Santa Ana, Orange, Anaheim

Counties Included (1): Orange

Major City Populations in Region: Anaheim, Santa Ana, Irvine, Huntington Beach, Garden Grove, Orange, Fullerton, Costa Mesa, Mission Viejo

    13. Inland Empire (3 Local Boards): Riverside, San Bernardino County, San Bernardino City

Counties Included (2): Riverside, San Bernardino

Major City Populations in Region: Riverside, San Bernardino, Fontana, Moreno Valley, Rancho Cucamonga, Ontario, Corona, Victorville, Murrieta, Temecula, Rialto

    14. Ventura (1 Local Board): Ventura

Counties Included (1): Ventura

Major City Populations in Region: Oxnard, Thousand Oaks, Simi Valley, San Buenaventura
"""

In [None]:
# Inspect the raw text block.
edd_regions

In [None]:
# First pass over the text: split it into blank-line-separated paragraphs and
# echo only the ones that list a region's counties.
regions = {}
edd_lines = edd_regions.split("\n\n")
for line in edd_lines:
    if "Counties Included" not in line:
        continue
    print(line)

In [None]:
# Build `regions`: one list of county names per "Counties Included"
# paragraph, in the order the paragraphs appear in the text.
regions = []
edd_lines = edd_regions.split("\n\n")
for line in edd_lines:
    if "Counties Included" not in line:
        continue
    print(line)
    # Everything after the first colon is the comma-separated county list;
    # partition() does not fail if the list itself ever contains a colon.
    right = line.partition(":")[2]
    # Treat "and" as one more separator so that both "A, B, and C" and
    # "A and B" split into individual names instead of fusing two of them.
    right = right.replace(" and ", ", ")
    # Trim surrounding whitespace and drop the empty piece left by ", and".
    counties = [name.strip() for name in right.split(",") if name.strip()]
    regions.append(counties)
        
        

In [None]:
# Inspect the parsed county lists, one list per region.
regions

In [None]:
# Total number of county entries across all region lists.
n = sum(map(len, regions))

In [None]:
# Show the county count accumulated over all regions.
n

In [None]:
# County -> region id lookup for the EDD regions. Names are stripped of
# surrounding whitespace before being used as keys.
region_dict = {
    county.strip(): region_id
    for region_id, members in enumerate(regions)
    for county in members
}

In [None]:
# Inspect the county -> EDD region id mapping.
region_dict

In [None]:
# Overwrite the 'region' column with the id looked up for each row's county
# (a county missing from region_dict raises KeyError).
df_county17['region'] = list(map(region_dict.__getitem__, df_county17.county))

In [None]:
# Inspect the region id now assigned to each row.
df_county17.region

In [None]:
# Demonstrate that strip() removes the leading blank from a county name.
s = ' San Bernardino'
str.strip(s)

In [None]:
# Regroup the 2017 rows by their region id.
region_gb = df_county17.groupby('region')
# Not the last expression in the cell, so a notebook does not display it.
region_gb.head()

# Collapse each group into one row per region by summing. GroupBy.sum() has
# no `axis` parameter (it always reduces over the rows of each group), so
# none is passed.
region_df = region_gb.sum()



In [None]:
# Regions ordered by their summed 'Military' value, smallest first
# (sort_values sorts ascending by default).
region_df[['county', 'Military']].sort_values(by='Military')

In [None]:
# Regions ordered by their summed 'Arts' value, largest first.
region_df[['county', 'Arts']].sort_values(by='Arts',ascending=False)

In [None]:
# Regions ordered by their summed 'State' value, largest first.
region_df[['county', 'State']].sort_values(by='State',ascending=False)

## Mapping Aggregation 

In [None]:
# Read the shapefile into a GeoDataFrame.
odf = geopandas.read_file('azca.shp')

In [None]:
# Preview the first rows of the shapefile's attribute table.
odf.head()

In [None]:
# Keep only the rows whose state FIPS code (STATEFP) is '06', i.e. California.
ca_gdf = odf[odf.STATEFP=='06']

In [None]:
# Quick map of the selected geometries.
ca_gdf.plot()

In [None]:
# (rows, columns) of the California subset.
ca_gdf.shape

In [None]:
# Look again at the 2017 table before joining it to the geometries.
df_county17.head()

In [None]:
# Preview the join of the geometries with the 2017 table, matching the
# shapefile's NAME to the table's county (the result is not stored here).
ca_gdf.merge(df_county17, left_on='NAME', right_on='county')

In [None]:
# Store the join back into ca_gdf so every geometry carries the 2017 columns.
# merge() performs an inner join by default, so rows without a matching
# NAME/county pair are dropped.
ca_gdf = ca_gdf.merge(df_county17, left_on='NAME', right_on='county')

In [None]:
# Colour each county by its region id, treating the ids as categories
# rather than as a numeric scale.
ca_gdf.plot(column='region', categorical=True)

In [None]:
# Same categorical map, now with a legend of the region ids.
ca_gdf.plot(column='region', categorical=True, legend=True)

In [None]:
# Merge the geometries of all counties sharing a region id, giving one row
# per region.
regions_df = ca_gdf.dissolve(by='region')


In [None]:
# Quick map of the dissolved region geometries.
regions_df.plot()

In [None]:
# (rows, columns) of the dissolved frame.
regions_df.shape

In [None]:
# List the columns that remain after dissolving.
regions_df.columns

In [None]:
# Inspect the index of the dissolved frame (the `by='region'` key).
regions_df.index

In [None]:
# Copy the index values into a regular 'region' column so they can be
# passed to plot(column='region').
regions_df['region'] = regions_df.index.values

In [None]:
# Colour each dissolved region by its id. `categorical` is a boolean flag, so
# pass True rather than the string 'True', which only worked because any
# non-empty string is truthy.
regions_df.plot(column='region', categorical=True)

In [None]:
# Categorical region map with a legend, drawn with the 'inferno' colormap.
regions_df.plot(column='region', categorical=True, legend=True, cmap='inferno')

In [None]:
# Large square figure holding a single axes.
f, ax = plt.subplots(nrows=1, figsize=(12, 12))

# Fill each region with its own categorical colour; the legend goes in the
# lower-left corner.
ax = regions_df.plot(
    ax=ax,
    column='region',
    categorical=True,
    legend=True,
    cmap='tab20b',
    legend_kwds={'loc': 'lower left'},
)

# Hide the axis frame and ticks, add the title, and render.
ax.set_axis_off()
plt.title('California EDD Regions')
plt.show()



In [None]:
# Large square figure holding a single axes.
f, ax = plt.subplots(nrows=1, figsize=(12, 12))

# Base layer: regions filled with categorical colours, legend lower-left.
ax = regions_df.plot(
    ax=ax,
    column='region',
    categorical=True,
    legend=True,
    cmap='tab20b',
    legend_kwds={'loc': 'lower left'},
)

# Overlay: county outlines drawn in black on the same axes.
county_outlines = ca_gdf.geometry.boundary
county_outlines.plot(ax=ax, color=None, edgecolor='k', linewidth=2)

# Hide the axis frame and ticks, add the title, and render.
ax.set_axis_off()
plt.title('California EDD Regions')
plt.show()



In [None]:
# Large square figure holding a single axes.
f, ax = plt.subplots(nrows=1, figsize=(12, 12))

# Base layer: regions filled with categorical colours, legend lower-left.
ax = regions_df.plot(
    ax=ax,
    column='region',
    categorical=True,
    legend=True,
    cmap='tab20b',
    legend_kwds={'loc': 'lower left'},
)

# Thin black county outlines first, then thick black region outlines on top
# so the region borders stand out.
ca_gdf.geometry.boundary.plot(ax=ax, color=None, edgecolor='k', linewidth=1)
regions_df.geometry.boundary.plot(ax=ax, color=None, edgecolor='k', linewidth=4)

# Hide the axis frame and ticks, add the title, and render.
ax.set_axis_off()
plt.title('California EDD Regions')
plt.show()

