In [1]:
import os
import matplotlib.pyplot as plot
import geopandas as gpd
import earthpy as et
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.options.display.max_rows = 4000

In [2]:
demographics = pd.read_csv("../datasets/ACSDP1Y2018.DP05_data_with_overlays_2020-09-15T163409.csv", encoding="latin1")
shapefile = gpd.read_file("../datasets/Shapefiles/tl_2018_us_cd116.shp")
vote_counts = pd.read_csv("../datasets/1976-2018-house2.csv", encoding="latin1")

In [3]:
demographics.head()

Unnamed: 0,GEO_ID,NAME,DP05_0001E,DP05_0001M,DP05_0001PE,DP05_0001PM,DP05_0002E,DP05_0002M,DP05_0002PE,DP05_0002PM,...,DP05_0087PE,DP05_0087PM,DP05_0088E,DP05_0088M,DP05_0088PE,DP05_0088PM,DP05_0089E,DP05_0089M,DP05_0089PE,DP05_0089PM
0,id,Geographic Area Name,Estimate!!SEX AND AGE!!Total population,Margin of Error!!SEX AND AGE!!Total population,Percent Estimate!!SEX AND AGE!!Total population,Percent Margin of Error!!SEX AND AGE!!Total po...,Estimate!!SEX AND AGE!!Total population!!Male,Margin of Error!!SEX AND AGE!!Total population...,Percent Estimate!!SEX AND AGE!!Total populatio...,Percent Margin of Error!!SEX AND AGE!!Total po...,...,"Percent Estimate!!CITIZEN, VOTING AGE POPULATI...","Percent Margin of Error!!CITIZEN, VOTING AGE P...","Estimate!!CITIZEN, VOTING AGE POPULATION!!Citi...","Margin of Error!!CITIZEN, VOTING AGE POPULATIO...","Percent Estimate!!CITIZEN, VOTING AGE POPULATI...","Percent Margin of Error!!CITIZEN, VOTING AGE P...","Estimate!!CITIZEN, VOTING AGE POPULATION!!Citi...","Margin of Error!!CITIZEN, VOTING AGE POPULATIO...","Percent Estimate!!CITIZEN, VOTING AGE POPULATI...","Percent Margin of Error!!CITIZEN, VOTING AGE P..."
1,5001600US0107,"Congressional District 7 (116th Congress), Ala...",660468,11651,660468,(X),310422,6656,47.0,0.7,...,504177,(X),230516,5232,45.7,0.6,273661,5749,54.3,0.6
2,5001600US0605,"Congressional District 5 (116th Congress), Cal...",730955,8907,730955,(X),359113,6509,49.1,0.6,...,508369,(X),242276,6539,47.7,0.7,266093,5746,52.3,0.7
3,5001600US0608,"Congressional District 8 (116th Congress), Cal...",717107,10856,717107,(X),361408,7256,50.4,0.8,...,477512,(X),240107,6199,50.3,0.9,237405,7155,49.7,0.9
4,5001600US0618,"Congressional District 18 (116th Congress), Ca...",750295,13343,750295,(X),375370,8630,50.0,0.7,...,491580,(X),239626,5593,48.7,0.6,251954,5864,51.3,0.6


In [4]:
# Subset the demographics dataframe
# By column name.
# We want to drop the M, PE, and PM values for each measurement
# and only keep the estimates.
deregexed = demographics.filter(axis=1, regex="^DP05_([0-9]*)E|GEO_ID|NAME$")

In [5]:
# Renaming indices by removing 
newIndices = deregexed.columns
def renameDemographicIndices(s):
    splitString = s.split("!!")
    splitString.reverse()
    splitString = splitString[0:-1]
    return "!!".join(splitString)
newColumns = [renameDemographicIndices(s) for s in newIndices]

# Just set the first two manually
newColumns[0] = "id"
newColumns[1] = "district"

In [6]:
deregexed.columns = newColumns
deregexed = deregexed.drop(labels=0)

In [7]:
# Dataframe with new labels
deregexed.head()

Unnamed: 0,id,district,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
1,5001600US0107,"Congressional District 7 (116th Congress), Ala...",660468,310422,350046,88.7,40762,41509,37440,53598,...,4988,206,380,8454,428,8026,332982,504177,230516,273661
2,5001600US0605,"Congressional District 5 (116th Congress), Cal...",730955,359113,371842,96.6,40865,38662,46157,42430,...,81035,2489,3083,26731,882,25849,286652,508369,242276,266093
3,5001600US0608,"Congressional District 8 (116th Congress), Cal...",717107,361408,355699,101.6,53385,58479,51137,47916,...,22060,2013,1114,20847,177,20670,305372,477512,240107,237405
4,5001600US0618,"Congressional District 18 (116th Congress), Ca...",750295,375370,374925,100.1,39537,45103,52495,43666,...,180316,427,2443,28848,1181,27667,290127,491580,239626,251954
5,5001600US0622,"Congressional District 22 (116th Congress), Ca...",771095,377846,393249,96.1,60473,62877,61579,58322,...,60785,659,2486,17473,1108,16365,257742,483064,228078,254986


In [8]:
# We want to have state IDs as a column. To do this, use district strings.
def extractStateNames(s):
    splitString = s.split(",")
    return splitString[-1].strip()


In [9]:
stateNames = [extractStateNames(s) for s in deregexed["district"]]
deregexed["state"] = stateNames

In [10]:
# To join this dataframe with the shapefile one,
# we need to join on state FIPS codes.
# Use a file to to do this.
state_dictionary = {}
with open("statefipscodes.txt") as fips:
    for line in fips:
        line = line.split(" ")
        state_dictionary[" ".join(line[:-1])] = line[-1].replace("\n", "").replace("\t", "")



In [11]:
def get_state_code(state_name):
    return state_dictionary[state_name]

deregexed["STATEFP"] = deregexed["state"].apply(get_state_code)
deregexed.head()

Unnamed: 0,id,district,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,state,STATEFP
1,5001600US0107,"Congressional District 7 (116th Congress), Ala...",660468,310422,350046,88.7,40762,41509,37440,53598,...,380,8454,428,8026,332982,504177,230516,273661,Alabama,1
2,5001600US0605,"Congressional District 5 (116th Congress), Cal...",730955,359113,371842,96.6,40865,38662,46157,42430,...,3083,26731,882,25849,286652,508369,242276,266093,California,6
3,5001600US0608,"Congressional District 8 (116th Congress), Cal...",717107,361408,355699,101.6,53385,58479,51137,47916,...,1114,20847,177,20670,305372,477512,240107,237405,California,6
4,5001600US0618,"Congressional District 18 (116th Congress), Ca...",750295,375370,374925,100.1,39537,45103,52495,43666,...,2443,28848,1181,27667,290127,491580,239626,251954,California,6
5,5001600US0622,"Congressional District 22 (116th Congress), Ca...",771095,377846,393249,96.1,60473,62877,61579,58322,...,2486,17473,1108,16365,257742,483064,228078,254986,California,6


In [12]:
# Strip state information from district names
def set_district_names(s):
    s = s.replace("(116th Congress)", "")
    split = s.split(",")
    return split[0].strip()

def set_district_num(s):
    split = s.split(" ")
    if s[-1] == ")":
        return 0
    return int(split[-1])
    


In [13]:
deregexed["district"] = deregexed["district"].apply(set_district_names)
deregexed["CD116FP"] = deregexed["district"].apply(set_district_num)

In [14]:
deregexed["STATEFP"] = deregexed["STATEFP"].apply(int)

In [15]:
shapefile["STATEFP"] = shapefile["STATEFP"].apply(int)

In [16]:
shapefile_filtered = shapefile[shapefile["CD116FP"] != "ZZ"]

In [17]:
shapefile_filtered["STATEFP"] = shapefile_filtered["STATEFP"].astype(int)
shapefile_filtered["CD116FP"] = shapefile_filtered["CD116FP"].astype(int)
deregexed["STATEFP"] = deregexed["STATEFP"].astype(int)
deregexed["CD116FP"] = deregexed["CD116FP"].astype(int)

joined_demo_shape = pd.merge(
     shapefile_filtered,
     deregexed,
     how="left",
     left_on=["STATEFP", "CD116FP"],
     right_on=["STATEFP", "CD116FP"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [21]:
vote_counts_district = pd.read_csv("../datasets/vote_counts_district.csv")

In [22]:
vote_counts_district.head()

Unnamed: 0,state,district,dem_votes,rep_votes,winner,wasted_republican,wasted_democrat
0,Alabama,1,89226,153228,r,76614.0,89226.0
1,Alabama,2,86931,138879,r,69439.5,86931.0
2,Alabama,3,83996,147770,r,73885.0,83996.0
3,Alabama,4,46492,184255,r,92127.5,46492.0
4,Alabama,5,101388,159063,r,79531.5,101388.0


In [23]:
# Rename district number to match previously joined dataset
vote_counts_district = vote_counts_district.rename(columns={"district": "CD116FP"})

In [24]:
vote_counts_districts_demos = pd.merge(
     joined_demo_shape,
     vote_counts_district,
     how="left",
     left_on=["state", "CD116FP"],
     right_on=["state", "CD116FP"]
)

In [25]:
vote_counts_districts_demos

Unnamed: 0,STATEFP,CD116FP,GEOID,NAMELSAD,LSAD,CDSESSN,MTFCC,FUNCSTAT,ALAND,AWATER,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,state,dem_votes,rep_votes,winner,wasted_republican,wasted_democrat
0,27,7,2707,Congressional District 7,C2,116,G5200,N,86580712408,5058381039,...,332937.0,495727.0,245413.0,250314.0,Minnesota,,,,,
1,27,2,2702,Congressional District 2,C2,116,G5200,N,6314464923,243358361,...,276721.0,511914.0,247867.0,264047.0,Minnesota,,,,,
2,27,6,2706,Congressional District 6,C2,116,G5200,N,7460634590,408723230,...,273868.0,525551.0,264594.0,260957.0,Minnesota,,,,,
3,27,8,2708,Congressional District 8,C2,116,G5200,N,72281499178,12448503736,...,380184.0,527541.0,266984.0,260557.0,Minnesota,,,,,
4,17,1,1701,Congressional District 1,C2,116,G5200,N,669256000,2067510,...,305477.0,524114.0,239139.0,284975.0,Illinois,189560.0,50960.0,d,50960.0,94780.0
5,17,18,1718,Congressional District 18,C2,116,G5200,N,27235396834,383133624,...,313677.0,531334.0,256746.0,274588.0,Illinois,95486.0,195927.0,r,97963.5,95486.0
6,17,7,1707,Congressional District 7,C2,116,G5200,N,161458448,5341167,...,353542.0,522767.0,245563.0,277204.0,Illinois,215746.0,30497.0,d,30497.0,107873.0
7,17,2,1702,Congressional District 2,C2,116,G5200,N,2798736458,25389182,...,295933.0,497132.0,220603.0,276529.0,Illinois,190684.0,44567.0,d,44567.0,95342.0
8,31,1,3101,Congressional District 1,C2,116,G5200,N,22995959831,249898247,...,277762.0,473676.0,236555.0,237121.0,Nebraska,93069.0,141712.0,r,70856.0,93069.0
9,6,36,636,Congressional District 36,C2,116,G5200,N,15312870769,201717828,...,382465.0,536333.0,262414.0,273919.0,California,122169.0,84839.0,d,84839.0,61084.5


In [26]:
vote_counts_districts_demos[vote_counts_districts_demos["state"] == "Louisiana"]

Unnamed: 0,STATEFP,CD116FP,GEOID,NAMELSAD,LSAD,CDSESSN,MTFCC,FUNCSTAT,ALAND,AWATER,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,state,dem_votes,rep_votes,winner,wasted_republican,wasted_democrat
38,22,4,2204,Congressional District 4,C2,116,G5200,N,32210761600,1106343072,...,354195,559310,272704,286606,Louisiana,72934.0,139326.0,r,69663.0,72934.0
39,22,2,2202,Congressional District 2,C2,116,G5200,N,3285174126,522407749,...,354621,587115,273178,313937,Louisiana,,,,,
40,22,6,2206,Congressional District 6,C2,116,G5200,N,10447504074,1012577737,...,342644,585015,276949,308066,Louisiana,76716.0,186553.0,r,93276.5,76716.0
103,22,1,2201,Congressional District 1,C2,116,G5200,N,10436304503,14398936017,...,343004,594231,282992,311239,Louisiana,71521.0,192555.0,r,96277.5,71521.0
104,22,3,2203,Congressional District 3,C2,116,G5200,N,18087979520,5594833099,...,346380,577604,275886,301718,Louisiana,74713.0,168263.0,r,84131.5,74713.0
105,22,5,2205,Congressional District 5,C2,116,G5200,N,37429870551,1118524221,...,335292,555804,274214,281590,Louisiana,67118.0,149018.0,r,74509.0,67118.0


In [27]:
vote_counts_districts_demos.to_csv("shapes_demographics_votes_10-4.csv")