In [1]:
# Import dependencies pandas,
    # requests, gmaps, census, and finally config's census_key and google_key
import pandas as pd
import requests
import gmaps
from census import Census

from config import (census_key, google_key)


In [2]:
# Declare a variable "c" and set it to the census with census_key.
    # https://github.com/datamade/census
    # We're going to use the default year 2016, however feel free to use another year.
c = Census(census_key, year=2016)

In [3]:
# Run a census search to retrieve data on estimate of male, female, population, and unemployment count for each zip code.
    # https://api.census.gov/data/2013/acs5/variables.html
census_data = c.acs5.get(("B01001_002E", "B01001_026E", "B01003_001E", "B23025_005E"), {'for': 'zip code tabulation area:*'})

# Show the output of census_data
census_data


[{'B01001_002E': 8059.0,
  'B01001_026E': 9364.0,
  'B01003_001E': 17423.0,
  'B23025_005E': 479.0,
  'zip code tabulation area': '01001'},
 {'B01001_002E': 14536.0,
  'B01001_026E': 15434.0,
  'B01003_001E': 29970.0,
  'B23025_005E': 1271.0,
  'zip code tabulation area': '01002'},
 {'B01001_002E': 5694.0,
  'B01001_026E': 5602.0,
  'B01003_001E': 11296.0,
  'B23025_005E': 1021.0,
  'zip code tabulation area': '01003'},
 {'B01001_002E': 2798.0,
  'B01001_026E': 2430.0,
  'B01003_001E': 5228.0,
  'B23025_005E': 153.0,
  'zip code tabulation area': '01005'},
 {'B01001_002E': 7224.0,
  'B01001_026E': 7664.0,
  'B01003_001E': 14888.0,
  'B23025_005E': 543.0,
  'zip code tabulation area': '01007'},
 {'B01001_002E': 632.0,
  'B01001_026E': 562.0,
  'B01003_001E': 1194.0,
  'B23025_005E': 63.0,
  'zip code tabulation area': '01008'},
 {'B01001_002E': 116.0,
  'B01001_026E': 121.0,
  'B01003_001E': 237.0,
  'B23025_005E': 0.0,
  'zip code tabulation area': '01009'},
 {'B01001_002E': 1901.0,
  

In [4]:
# Create a variable census_pd and set it to a dataframe made with the census_data's list of dictionaries
census_pd = pd.DataFrame(census_data)

# Rename census_pd with appropriate columns "Male", "Female", "Population", "Unemployment Count", and "Zipcode"
census_pd = census_pd.rename(columns = {"B01001_002E": "Male", 
                                        "B01001_026E": "Female", 
                                        "B01003_001E" : "Population",
                                        "B23025_005E" : "Unemployment Count",
                                        "zip code tabulation area": "Zipcode"})

# Show the first 5 rows of census_pd
census_pd.head()

Unnamed: 0,Male,Female,Population,Unemployment Count,Zipcode
0,8059.0,9364.0,17423.0,479.0,1001
1,14536.0,15434.0,29970.0,1271.0,1002
2,5694.0,5602.0,11296.0,1021.0,1003
3,2798.0,2430.0,5228.0,153.0,1005
4,7224.0,7664.0,14888.0,543.0,1007


In [5]:
# Create a new variable calc_census_pd and set it to census_pd
calc_census_pd = census_pd

# Calculate the % of male to female ratio and add them as new columns Male % and Female %.
calc_census_pd['Male %'] = calc_census_pd['Male']/(calc_census_pd['Male']+calc_census_pd['Female'])
calc_census_pd['Female %'] = calc_census_pd['Female']/(calc_census_pd['Male']+calc_census_pd['Female'])

# Calculate the unemployment rate based on population
calc_census_pd['Unemp Rate'] = calc_census_pd['Unemployment Count']/calc_census_pd['Population']

# Show the first 5 rows of calc_census_pd
calc_census_pd.head()

Unnamed: 0,Male,Female,Population,Unemployment Count,Zipcode,Male %,Female %,Unemp Rate
0,8059.0,9364.0,17423.0,479.0,1001,0.46255,0.53745,0.027492
1,14536.0,15434.0,29970.0,1271.0,1002,0.485018,0.514982,0.042409
2,5694.0,5602.0,11296.0,1021.0,1003,0.504072,0.495928,0.090386
3,2798.0,2430.0,5228.0,153.0,1005,0.535195,0.464805,0.029265
4,7224.0,7664.0,14888.0,543.0,1007,0.485223,0.514777,0.036472


In [6]:
# Get the correlation coefficients of calc_census_pd
calc_census_pd.corr()

Unnamed: 0,Male,Female,Population,Unemployment Count,Male %,Female %,Unemp Rate
Male,1.0,0.994607,0.99859,0.899766,-0.086288,0.086288,0.073789
Female,0.994607,1.0,0.998711,0.907239,-0.119554,0.119554,0.07919
Population,0.99859,0.998711,1.0,0.904806,-0.103433,0.103433,0.076654
Unemployment Count,0.899766,0.907239,0.904806,1.0,-0.103998,0.103998,0.199782
Male %,-0.086288,-0.119554,-0.103433,-0.103998,1.0,-1.0,-0.033701
Female %,0.086288,0.119554,0.103433,0.103998,-1.0,1.0,0.033701
Unemp Rate,0.073789,0.07919,0.076654,0.199782,-0.033701,0.033701,1.0


### Critical Thinking: From the above correlation table. What does the unemployment rate tell you about its correlation with the number of males or females?

#### ANSWER: 
The number of male or females in a population has little to no effect on unemployment rate.

In [8]:
# Use the describe function to get a quick glance at calc_census_pd.
calc_census_pd.describe()

Unnamed: 0,Male,Female,Population,Unemployment Count,Male %,Female %,Unemp Rate
count,33120.0,33120.0,33120.0,33120.0,32799.0,32799.0,32799.0
mean,4783.803442,4940.605857,9724.4093,363.300876,0.502147,0.497853,0.033839
std,7028.455682,7349.589801,14358.657599,635.292863,0.067328,0.067328,0.03133
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,365.0,349.0,718.0,17.0,0.477735,0.481384,0.017982
50%,1407.0,1384.5,2807.5,86.0,0.495817,0.504183,0.029832
75%,6528.25,6717.25,13177.75,428.0,0.518616,0.522265,0.043116
max,60490.0,59237.0,115104.0,9698.0,1.0,1.0,1.0


### Do you see anything strange about male % or female % in the describe above?

#### ANSWER: 
Even though the average male and female per zip code are similiar and the 25/50/75 percentiles suggest fairly equal distribution per zipcode, there are some zipcodes with no males or females, and others with 100% male or female population.

In [10]:
# Create two variables called "male_zipcode_outliers" and "female_zipcode_outliers"
    # Set them to queries where male or female is are the outliers based on the described data from previous task.
    # Example: anything greater than 0.95 is an outlier
male_zipcode_outliers = calc_census_pd[calc_census_pd['Male %'] > 0.95]
female_zipcode_outliers = calc_census_pd[calc_census_pd['Female %'] > 0.95]

# Show all rows for either "male_zipcode_outliers" and "female_zipcode_outliers"
male_zipcode_outliers

Unnamed: 0,Male,Female,Population,Unemployment Count,Zipcode,Male %,Female %,Unemp Rate
36,12.0,0.0,12.0,0.0,01066,1.000000,0.000000,0.000000
405,192.0,0.0,192.0,0.0,02366,1.000000,0.000000,0.000000
1256,51.0,0.0,51.0,0.0,04944,1.000000,0.000000,0.000000
1580,98.0,0.0,98.0,0.0,06061,1.000000,0.000000,0.000000
1817,6.0,0.0,6.0,0.0,06856,1.000000,0.000000,0.000000
2069,42.0,0.0,42.0,0.0,07820,1.000000,0.000000,0.000000
2125,236.0,0.0,236.0,22.0,07939,1.000000,0.000000,0.093220
2271,1275.0,0.0,1275.0,0.0,08320,1.000000,0.000000,0.000000
2277,3519.0,119.0,3638.0,8.0,08327,0.967290,0.032710,0.002199
2566,131.0,0.0,131.0,0.0,10545,1.000000,0.000000,0.000000


### What is a possible cause of some outliers with larger populations?

Hint: Look up the zipcode for larger population of either male or female outliers. 
What information do these zipcodes have in common?

### ANSWER: 
Some of these are correctional facilities.

# Heatmap of population

In [12]:
# Create a variable "zip_lng_lat_data" and using pandas import the zip_codes_states.csv from Resources folder.
    # https://www.gaslampmedia.com/download-zip-code-latitude-longitude-city-state-county-csv/
    
    # HINT: When loading zipcodes they may turn into integers and lose their 0's. 
    # To correct this check out dtype in the documentation:
        # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
zip_lng_lat_data = pd.read_csv('Resources/zip_codes_states.csv', dtype='str')

# Show the first 5 rows of zip_lng_lat_data
zip_lng_lat_data.head()

Unnamed: 0,zip_code,latitude,longitude,city,state,county
0,501,40.922326,-72.637078,Holtsville,NY,Suffolk
1,544,40.922326,-72.637078,Holtsville,NY,Suffolk
2,601,18.165273,-66.722583,Adjuntas,PR,Adjuntas
3,602,18.393103,-67.180953,Aguada,PR,Aguada
4,603,18.455913,-67.14578,Aguadilla,PR,Aguadilla


In [13]:
# Get the longitude and latitude based calc_census_pd by merging them on their zip code columns.
merged_table = pd.merge(zip_lng_lat_data, calc_census_pd, how='right', left_on='zip_code', right_on='Zipcode')

# Show the first 5 rows of merged_table
merged_table.head()

Unnamed: 0,zip_code,latitude,longitude,city,state,county,Male,Female,Population,Unemployment Count,Zipcode,Male %,Female %,Unemp Rate
0,601,18.165273,-66.722583,Adjuntas,PR,Adjuntas,8704.0,9096.0,17800.0,2152.0,601,0.488989,0.511011,0.120899
1,602,18.393103,-67.180953,Aguada,PR,Aguada,19509.0,20207.0,39716.0,3116.0,602,0.491213,0.508787,0.078457
2,603,18.455913,-67.14578,Aguadilla,PR,Aguadilla,25208.0,26357.0,51565.0,3768.0,603,0.488859,0.511141,0.073073
3,606,18.172947,-66.944111,Maricao,PR,Maricao,3137.0,3183.0,6320.0,205.0,606,0.496361,0.503639,0.032437
4,610,18.288685,-67.139696,Anasco,PR,Anasco,13528.0,14448.0,27976.0,1587.0,610,0.483557,0.516443,0.056727


In [14]:
# Configure gmaps with API key
gmaps.configure(api_key=google_key)

In [15]:
# Define locations as a dataframe of latitude and longitude from merged_table.
    # HINT: You'll need to drop the NaN before storing into locations or population
locations = merged_table.dropna(axis="rows", how="any")[["latitude", "longitude"]].astype(float)

# Define population as the population from merged_table
    # HINT: You'll need to drop the NaN before storing into locations or population
population = merged_table.dropna(axis="rows", how="any")['Population'].astype(float)


In [16]:
# Create a population Heatmap layer
    # Note you may need to run the following in your terminal to show the gmaps figure.
        # jupyter nbextension enable --py --sys-prefix widgetsnbextension
        # jupyter nbextension enable --py --sys-prefix gmaps
fig = gmaps.figure()

# Recommended settings for heatmap layer: max_intensity=2000000 and point radius = 1
heat_layer = gmaps.heatmap_layer(locations, 
                                 weights=population, dissipating=False, 
                                 max_intensity=2000000, point_radius=1)

# Adjust heat_layer setting to help with heatmap dissipating on zoom
heat_layer.dissipating = False
heat_layer.max_intensity = 2000000
heat_layer.point_radius = 1
fig.add_layer(heat_layer)

fig



Figure(layout=FigureLayout(height='420px'))

### What is a downfall of using zip codes for mapping?

### ANSWER: 
https://www.census.gov/geo/reference/zctas.html
ZIP Code Tabulation Areas (ZCTAs) are generalized areal representations of United States Postal Service (USPS) ZIP Code service areas.

The USPS ZIP Codes identify the individual post office or metropolitan area delivery station associated with mailing addresses. USPS ZIP Codes are not areal features but a collection of mail delivery routes.

