In [1]:
!pip install census



In [2]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from config import api_key
#c = Census(api_key, year=2016)

In [3]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
def get_census(year):
    c = Census(api_key, year=year)
    census_data = c.acs5.get(("NAME", "B19013_001E", "B01003_001E", "B01002_001E",
                          "B19301_001E",
                          "B17001_002E"), 
                         {'for': 'county:*'})
                        # {'for': 'zip code tabulation area:*'})

# Convert to DataFrame
    census_pd = pd.DataFrame(census_data)

# Column Reordering
    census_pd = census_pd.rename(columns={"B01003_001E": "Population",
                                      "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                      "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                      "NAME": "Name", "county": "County"
                                      })

# Add in Poverty Rate (Poverty Count / Population)
    census_pd["Poverty Rate"] = 100 * \
        census_pd["Poverty Count"].astype(
            int) / census_pd["Population"].astype(int)

# Final DataFrame
    census_pd = census_pd[["Name","County", "Population", "Median Age", "Household Income",
                           "Per Capita Income", "Poverty Count", "Poverty Rate"]]

    census_pd['county_name'] = census_pd['Name'].str.replace(r" County,(.*)",'').str.upper()
    census_pd['state'] = census_pd['Name'].str.replace(r"(.*) County, ",'').str.upper()
# Visualize
    
    return census_pd

census_2016 = get_census(2016)
census_2016.head()
census_2016['year'] = 2016

census_2020 = get_census(2020)
census_2020['year'] = 2020
census_2020.head()
census = pd.concat([census_2016,census_2020])




In [106]:
census.dtypes

Name                  object
County                object
Population           float64
Median Age           float64
Household Income     float64
Per Capita Income    float64
Poverty Count        float64
Poverty Rate         float64
county_name           object
state                 object
year                   int64
dtype: object

In [110]:
census = census[census['Household Income']>0]
census

Unnamed: 0,Name,County,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,county_name,state,year
0,"Wright County, Missouri",229,18378.0,41.9,30581.0,18502.0,4560.0,24.812276,WRIGHT,MISSOURI,2016
1,"Clay County, Missouri",047,233135.0,36.8,63702.0,30531.0,20867.0,8.950608,CLAY,MISSOURI,2016
2,"Dent County, Missouri",065,15578.0,43.1,38020.0,19929.0,3426.0,21.992554,DENT,MISSOURI,2016
3,"Saline County, Missouri",195,23214.0,37.4,40645.0,20446.0,3748.0,16.145429,SALINE,MISSOURI,2016
4,"Worth County, Missouri",227,2061.0,47.9,44974.0,23292.0,262.0,12.712276,WORTH,MISSOURI,2016
...,...,...,...,...,...,...,...,...,...,...,...
3216,"Renville County, Minnesota",129,14572.0,44.0,58542.0,31243.0,1373.0,9.422180,RENVILLE,MINNESOTA,2020
3217,"Roseau County, Minnesota",135,15259.0,41.6,62304.0,31452.0,1133.0,7.425126,ROSEAU,MINNESOTA,2020
3218,"Sherburne County, Minnesota",141,96015.0,36.1,88671.0,36022.0,4953.0,5.158569,SHERBURNE,MINNESOTA,2020
3219,"Steele County, Minnesota",147,36710.0,39.2,68172.0,34648.0,2887.0,7.864342,STEELE,MINNESOTA,2020


In [111]:
census.to_csv("clean_census.csv", encoding="utf-8", index=False)

In [112]:
# # Save as a csv
# # Note to avoid any issues later, use encoding="utf-8"
winners_pd=pd.read_csv("countypres_2000-2020_with_winner.csv")
# winners_pd = winners_pd[((winners_pd['year'] == 2016) | (winners_pd['year'] == 2020)) & (winners_pd['state']== 'GEORGIA') & (winners_pd['county_name']== "WORTH")]
winners_pd

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode,Concat,Winner
0,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,AL GORE,DEMOCRAT,4942,17208,20220315,TOTAL,2000ALABAMAAUTAUGA,Republican
1,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,11993,17208,20220315,TOTAL,2000ALABAMAAUTAUGA,Republican
2,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,RALPH NADER,GREEN,160,17208,20220315,TOTAL,2000ALABAMAAUTAUGA,Republican
3,2000,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,OTHER,OTHER,113,17208,20220315,TOTAL,2000ALABAMAAUTAUGA,Republican
4,2000,ALABAMA,AL,BALDWIN,1003.0,US PRESIDENT,AL GORE,DEMOCRAT,13997,56480,20220315,TOTAL,2000ALABAMABALDWIN,Republican
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72612,2020,WYOMING,WY,WASHAKIE,56043.0,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,3245,4032,20220315,TOTAL,2020WYOMINGWASHAKIE,Republican
72613,2020,WYOMING,WY,WESTON,56045.0,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,360,3560,20220315,TOTAL,2020WYOMINGWESTON,Republican
72614,2020,WYOMING,WY,WESTON,56045.0,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,46,3560,20220315,TOTAL,2020WYOMINGWESTON,Republican
72615,2020,WYOMING,WY,WESTON,56045.0,US PRESIDENT,OTHER,OTHER,47,3560,20220315,TOTAL,2020WYOMINGWESTON,Republican


In [35]:
# dem_pd = winners_pd[winners_pd['party'] == 'DEMOCRAT']
# dem_pd
#  #& ((winners_pd['party'] == 'DEMOCRAT') | (winners_pd['party'] == 'REPUBLICAN') )


In [36]:
# rep_pd = winners_pd[winners_pd['party'] == 'REPUBLICAN']
# rep_pd


In [37]:
# merge_pd=dem_pd.merge(rep_pd, on = ['year','state', 'county_name'])
# merge_pd.head(100)

In [46]:
# merge_pd['winner'] = np.where(merge_pd['candidatevotes_x'] > merge_pd['candidatevotes_y'], merge_pd['party_x'], merge_pd['party_y'])
# merge_pd = merge_pd[['year','state','county_name','winner']]
# merge_pd

In [113]:
#Dave's code
winners_pd = winners_pd[((winners_pd['year'] == 2016) | (winners_pd['year'] == 2020))]
winners_pd = winners_pd[((winners_pd['party'] == 'DEMOCRAT') | (winners_pd['party'] == 'REPUBLICAN'))]
#winners_pd = winners_pd[((winners_pd['state']== 'GEORGIA') & (winners_pd['county_name']== "WORTH"))]
#winners_pd.head(100)
winners_pd=winners_pd.drop_duplicates(subset=['state_po','county_name', 'year'],keep='first')

In [114]:
winners_pd

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode,Concat,Winner
40517,2016,ALABAMA,AL,AUTAUGA,1001.0,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,5936,24973,20220315,TOTAL,2016ALABAMAAUTAUGA,Republican
40520,2016,ALABAMA,AL,BALDWIN,1003.0,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,18458,95215,20220315,TOTAL,2016ALABAMABALDWIN,Republican
40523,2016,ALABAMA,AL,BARBOUR,1005.0,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,4871,10469,20220315,TOTAL,2016ALABAMABARBOUR,Republican
40526,2016,ALABAMA,AL,BIBB,1007.0,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,1874,8819,20220315,TOTAL,2016ALABAMABIBB,Republican
40529,2016,ALABAMA,AL,BLOUNT,1009.0,US PRESIDENT,HILLARY CLINTON,DEMOCRAT,2156,25588,20220315,TOTAL,2016ALABAMABLOUNT,Republican
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72597,2020,WYOMING,WY,SWEETWATER,56037.0,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,3823,16698,20220315,TOTAL,2020WYOMINGSWEETWATER,Republican
72601,2020,WYOMING,WY,TETON,56039.0,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,9848,14787,20220315,TOTAL,2020WYOMINGTETON,Democrat
72605,2020,WYOMING,WY,UINTA,56041.0,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,1591,9459,20220315,TOTAL,2020WYOMINGUINTA,Republican
72609,2020,WYOMING,WY,WASHAKIE,56043.0,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,651,4032,20220315,TOTAL,2020WYOMINGWASHAKIE,Republican


In [115]:
cleaned_census_data = pd.merge(census, winners_pd, how='left', left_on=['year','state', 'county_name'], right_on=['year','state', 'county_name'])

cleaned_census_data = cleaned_census_data[['year','state','state_po','county_name','Population','Median Age', 'Household Income', 'Per Capita Income', 'Poverty Rate', 'Winner']]
cleaned_census_data

Unnamed: 0,year,state,state_po,county_name,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Winner
0,2016,MISSOURI,MO,WRIGHT,18378.0,41.9,30581.0,18502.0,24.812276,Republican
1,2016,MISSOURI,MO,CLAY,233135.0,36.8,63702.0,30531.0,8.950608,Republican
2,2016,MISSOURI,MO,DENT,15578.0,43.1,38020.0,19929.0,21.992554,Republican
3,2016,MISSOURI,MO,SALINE,23214.0,37.4,40645.0,20446.0,16.145429,Republican
4,2016,MISSOURI,MO,WORTH,2061.0,47.9,44974.0,23292.0,12.712276,Republican
...,...,...,...,...,...,...,...,...,...,...
6435,2020,MINNESOTA,MN,RENVILLE,14572.0,44.0,58542.0,31243.0,9.422180,Republican
6436,2020,MINNESOTA,MN,ROSEAU,15259.0,41.6,62304.0,31452.0,7.425126,Republican
6437,2020,MINNESOTA,MN,SHERBURNE,96015.0,36.1,88671.0,36022.0,5.158569,Republican
6438,2020,MINNESOTA,MN,STEELE,36710.0,39.2,68172.0,34648.0,7.864342,Republican


In [122]:
cleaned_census_data.dropna(subset =["state_po"],inplace=True)
# cleaned_census_data.drop_duplicates()
# cleaned_census_data

In [123]:
cleaned_census_data.groupby('year').sum()

Unnamed: 0_level_0,Population,Median Age,Household Income,Per Capita Income,Poverty Rate
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016,307582561.0,123750.4,143834484.0,74999435.0,47062.421839
2020,315412439.0,125295.5,165020057.0,87102240.0,41776.289741


In [121]:
cleaned_census_data.to_csv("clean_data.csv", encoding="utf-8", index=False)

In [124]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [126]:
X = cleaned_census_data.drop(['year','state', 'county_name','Winner'], axis=1)
X.head()

Unnamed: 0,state_po,Population,Median Age,Household Income,Per Capita Income,Poverty Rate
0,MO,18378.0,41.9,30581.0,18502.0,24.812276
1,MO,233135.0,36.8,63702.0,30531.0,8.950608
2,MO,15578.0,43.1,38020.0,19929.0,21.992554
3,MO,23214.0,37.4,40645.0,20446.0,16.145429
4,MO,2061.0,47.9,44974.0,23292.0,12.712276


In [128]:
y = cleaned_census_data["Winner"].values

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [130]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X_scaler = StandardScaler()
X_scaler.fit(X_train)

ValueError: could not convert string to float: 'KS'

In [131]:
# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'KS'

In [132]:
# Loop through different k values to find which has the highest accuracy.
# Note: We use only odd numbers because we don't want any ties.
train_scores = []
test_scores = []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 20, 2), train_scores, marker='o')
plt.plot(range(1, 20, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy score")
plt.show()

NameError: name 'X_train_scaled' is not defined

In [133]:
# Note that k: 11 seems to be the best choice for this dataset
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train)
print('k=11 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

NameError: name 'X_train_scaled' is not defined