# COVID-19

## DS100 Final Project

Gene Ho, Hannah Qi, Jasmine Wu

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from datetime import datetime
import datetime as dt

import sklearn.linear_model as lm
import sklearn

# import geopandas

from IPython.display import display, Markdown, Latex, Math

## Research Questions

<!-- OLD QUESTIONS: 1. How does the timing of the implementation of various social distancing measures (i.e. time to issue) impact the growth rate of cases?
2. Who would be considered as a highly susceptible population of COVID-19? What characteristics make people more susceptible to COVID-19? -->

1. How does the timing of the implementation of various social distancing measures (i.e. time to issue) predict a county's democrat to republican ratio?
2. How does the timing of the implementation of various social distancing measures (i.e. time to issue) predict a county's political party majority?

## Data Cleaning

In [None]:
# Load the abridged county table; the analysis frame `df` is built from its columns below.
counties = pd.read_csv("./covid-data/abridged_counties.csv")

In [None]:
# Preview the first rows of the county table.
counties.head()

In [None]:
# List the available column names before selecting a subset.
counties.columns

In [None]:
# Load the confirmed-case time series and preview the first ten rows.
confirmed_df = pd.read_csv("./covid-data/time_series_covid19_confirmed_US.csv")
confirmed_df.head(10)

In [None]:
# created a new table
df = counties[[
    'countyFIPS', 'STATEFP', 'COUNTYFP', 'CountyName', 'StateName', 'State',
    'lat', 'lon', 'POP_LATITUDE', 'PopulationEstimate2018', 'POP_LONGITUDE',
    'dem_to_rep_ratio', 'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym',
    'federal guidelines', 'foreign travel ban'
]]

In [None]:
df.head(5)

In [None]:
# State / territory name -> postal abbreviation.
# Source: https://gist.github.com/JeffPaine/3083347
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

# Invert the lookup (abbreviation -> full name) and use it to overwrite
# "State" from the values found in "StateName"; unmatched values become NaN.
abbrev_to_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}
df["State"] = df['StateName'].map(abbrev_to_state)

In [None]:
#a function to convert times from Gregorian ordinal date to pandas Timestamp
def convert_time(df, columns):
    """Return a copy of ``df`` with ordinal-date columns converted to Timestamps.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list or single column label
        Column(s) holding proleptic Gregorian ordinals (as accepted by
        ``pd.Timestamp.fromordinal``), possibly with missing values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each requested column holds ``pd.Timestamp``
        values; rows that were missing stay missing.

    Notes
    -----
    Columns that already hold datetimes are left untouched, so the function
    can be applied twice safely. The "already converted" check looks at the
    dtype and at the first non-missing value, and therefore does not depend on
    the index containing the label ``0`` or on the first row being present.
    """
    data = df.copy()

    # Accept a single label as well as a list of labels.
    column_list = columns if isinstance(columns, list) else [columns]

    for col in column_list:
        series = data[col]
        present = series.dropna()
        already_datetime = pd.api.types.is_datetime64_any_dtype(series) or (
            len(present) > 0 and isinstance(present.iloc[0], pd.Timestamp))
        if already_datetime:
            continue
        # Only non-missing ordinals are converted; assignment aligns on the
        # index, so rows dropped here come back as missing values.
        data[col] = present.astype(int).map(pd.Timestamp.fromordinal)
    return data

In [None]:
# Convert every policy-date column from ordinal numbers to Timestamps.
policy_date_columns = [
    'stay at home',
    '>50 gatherings',
    '>500 gatherings',
    'public schools',
    'restaurant dine-in',
    'entertainment/gym',
    'federal guidelines',
    'foreign travel ban',
]
df = convert_time(df, policy_date_columns)
df

In [None]:
#dropping NYC and Kansas City because they are cities not counties
df.drop([3242, 3243], inplace=True)
# df[df['CountyName'].isna()]


# NYC is **not** a county so we are expecting an empty df
df[df['CountyName'] == 'New York City']

In [None]:
df.head()

In [None]:
# from urllib.request import urlopen
# import json
# with urlopen(
#         'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
# ) as response:
#     counties = json.load(response)

In [None]:
# # county population visualization
# import plotly.express as px

# df['log_pop'] = np.log(df['PopulationEstimate2018'])

# fig = px.choropleth_mapbox(df,
#                            geojson=counties,
#                            locations='countyFIPS',
#                            color='log_pop',
#                            color_continuous_scale="Viridis",
#                            range_color=(0, 13),
#                            mapbox_style="white-bg",
#                            hover_name=df['CountyName'],
#                            zoom=3,
#                            center={
#                                "lat": 37.0902,
#                                "lon": -95.7129
#                            },
#                            opacity=0.5,
#                            labels={'log_pop': 'log(Population)'},
#                            title='Population of US Counties in 2018')

# fig.update_layout(margin={"r": 0, "l": 0, "b": 0})
# fig.show()

In [None]:
#creating a new column "new FIPS" of df's "countyFIPS" as integer
df = df[df['countyFIPS'].notna()]
# df.astype({'countyFIPS':'int32'}).dtypes
df['new FIPS'] = df['countyFIPS'].astype('int32')
df

In [None]:
# creating a new column "new FIPS" of confimed_df's "FIP" as integer

confirmed_df = confirmed_df.rename(columns={'Admin2': 'CountyName'})
confirmed_df = confirmed_df[confirmed_df['FIPS'].notna()]
confirmed_df["new FIPS"] = confirmed_df['FIPS'].astype('int32')
confirmed_df

In [None]:
# Inner-join the county table with the confirmed-case table on the integer FIPS key;
# only counties present in both tables are kept.
joint_table = df.merge(confirmed_df, how='inner', on='new FIPS')

In [None]:
# Simplified confirmed-case table: the FIPS key followed by one column per date.
# Columns are selected by label rather than by hard-coded position. The previous
# positional slice np.r_[20, 32:119] stopped at index 118 and so left out the
# "4/18/20" column (index 119 according to the notes below). Label slices
# include both end points.
# joint_table.columns.get_loc("new FIPS") # 20
# joint_table.columns.get_loc("4/18/20") # 119
# joint_table.columns.get_loc("1/22/20") # 32
date_columns = list(joint_table.loc[:, '1/22/20':'4/18/20'].columns)
another_attempt_first = joint_table[['new FIPS'] + date_columns]

another_attempt_first.head()

In [None]:
#function that catches each county's date of first case. if no date exists for that county, its first date will be recorded as 4/19/20
def first_case(df, no_case_date='4/19/20'):
    """Return, for every row, the label of the first column with a positive count.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 is an identifier and is never inspected as a count; the
        remaining columns are case counts whose labels are dates.
    no_case_date : str, optional
        Value recorded for a row without a first-case date. The default,
        '4/19/20', is the sentinel used by this notebook (one day after the
        last date column).

    Returns
    -------
    list
        Exactly one entry per row of ``df``, in row order.

    Notes
    -----
    Rows are addressed by position, so the frame may carry any index (the
    earlier version used index labels as positions and only worked with a
    default RangeIndex). A row whose last column equals 0 gets the sentinel
    without further scanning, as before; a row where no column is positive
    (e.g. all NaN) now also gets the sentinel instead of being skipped, which
    keeps the result aligned with the rows.
    """
    n_rows, n_cols = df.shape
    last_col = n_cols - 1
    first_dates = []
    for row_pos in range(n_rows):
        found = no_case_date
        if df.iloc[row_pos, last_col] != 0:
            for col_pos in range(1, n_cols):
                if df.iloc[row_pos, col_pos] > 0:
                    found = df.columns[col_pos]
                    break
        first_dates.append(found)
    return first_dates

In [None]:
#joint table with new column of each county's date of first case
joint_table["Date of First Case"] = pd.Series(
    first_case(another_attempt_first))

joint_table

In [None]:
# changing the format of date in the joint_table
joint_table["Date of First Case"] = pd.to_datetime(
    joint_table["Date of First Case"]).dt.strftime('%Y-%m-%d')
joint_table.tail()

In [None]:
#a new dataframe with more relevant columns
new_df = joint_table[[
    'countyFIPS', 'CountyName_x', 'StateName', 'State', 'dem_to_rep_ratio',
    'Date of First Case', 'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym'
]]
new_df

In [None]:
# print(new_df['dem_to_rep_ratio'].isna().sum()) # 26 counties without our outcome

new_df.dropna(subset=['dem_to_rep_ratio'], inplace=True)
#we had to drop all the rows that had NAN in the 'dem_to_rep_ratio' column
#some were counties from Alaska and Hawaii

In [None]:
# some counties did not have a date of order for 'stay at home', '>50 gatherings', '>500 gatherings', and 'entertainment/gym'
# assume that they implemented it after the last day of data collection
# impute with last date + 1 = April 19, 2020

# Work on an independent copy so the column assignments below do not write
# into a slice of an earlier frame.
new_df = new_df.copy()

april_19 = pd.to_datetime("2020-04-19")

var_columns = [
    'Date of First Case', 'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym'
]

# Coerce every date column to datetime and keep the result (it was previously
# computed and discarded). Values that cannot be parsed become NaT.
new_df[var_columns] = new_df[var_columns].apply(pd.to_datetime, errors='coerce')

# Impute only the date columns. A frame-wide fillna would also place a
# timestamp into unrelated columns that happen to contain missing values.
new_df[var_columns] = new_df[var_columns].fillna(april_19)

In [None]:
# Count the remaining missing values per column; the date columns should all
# show 0 after the imputation in the previous cell.
new_df.isna().sum()

In [None]:
# Display the imputed table.
new_df

In [None]:
# number of days between date of first case and order (order - date of first case)

for column in var_columns[1:]:
    new_df.loc[:, column] = pd.to_datetime(new_df.loc[:, column])
    new_df[column] = new_df[column] - new_df['Date of First Case']
    new_df[column] = new_df[column].dt.days
#     new_df[column] = new_df[column].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)

In [None]:
new_df.head()

In [None]:
# Final modelling table: identifiers, outcome, first-case date and the six
# order offsets, with the merge suffix removed from the county name column.
final_columns = [
    'countyFIPS', 'CountyName_x', 'StateName', 'dem_to_rep_ratio',
    'Date of First Case', 'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym'
]
final_df = new_df[final_columns].rename(columns={'CountyName_x': 'CountyName'})

print(len(final_df), "counties in our df")
final_df.head()

# negative values means county was proactive and implemented it before first case

## EDA

### Order of Orders

In [None]:
# Rank the six orders within each county from earliest to latest.
#
# The cell values are day offsets from the first case, so an ascending argsort
# puts the earliest order in position 0 (shown as "1st" in the plot below).
# The earlier version reversed the argsort with np.flip, which placed the
# *latest* order in position 0.
# kind='stable' makes ties (orders issued on the same day) resolve
# deterministically in the column order listed here.
order_columns = [
    'stay at home', '>50 gatherings', '>500 gatherings', 'public schools',
    'restaurant dine-in', 'entertainment/gym'
]

temp = final_df[order_columns].values.argsort(axis=1, kind='stable')

# Index a plain NumPy array of names with the 2-D rank matrix; indexing a
# pandas Index with a 2-D array is not supported by newer pandas versions.
ndf = pd.DataFrame(np.asarray(order_columns)[temp])

# final_df.columns

ndf.head()

In [None]:
# For each rank position (column of `ndf`), count how often each order appears.
# Building the frame from a dict of Series takes the union of all order names.
# Assigning column by column into an empty frame instead fixes the index to the
# orders seen in the first rank and silently drops any order that never
# appears there.
orders_order = pd.DataFrame(
    {col: ndf[col].value_counts() for col in ndf.columns})

In [None]:
orders_order = orders_order.fillna(0).rename(columns={
    0: 1,
    1: 2,
    2: 3,
    3: 4,
    4: 5,
    5: 6
})

orders_order = orders_order.reset_index().melt('index',
                                               var_name='cols',
                                               value_name='vals')

In [None]:
orders_order.head()

In [None]:
# Grouped bar chart: for each rank position (x), how many counties had each
# order (hue) at that position.
# sns.distplot(orders_order.columns, hist_kws={"weights":orders_order.values()})

# not sure if this way makes sense
# can potentially swap "cols" and "index"
order_bar = sns.catplot(x="cols",
                        y="vals",
                        hue='index',
                        data=orders_order,
                        kind="bar",
                        orient="v")

# order_bar.set_xticklabels(rotation=90)

plt.title("Frequency of Order of COVID-19 Orders")

order_bar.set_xticklabels(['1st', '2nd', '3rd', '4th', '5th', '6th'])
plt.xlabel("Order Number")
plt.ylabel("Count")
# The trailing semicolon suppresses the text repr of the last call; a
# semicolon on a line of its own is not valid Python.
order_bar._legend.set_title("Order");

### Outcome Variable: Democrat to Republican Ratio

In [None]:
# Summary statistics for the outcome, printed as: min, median, mean, max.

ratio_summary = final_df['dem_to_rep_ratio']
print(ratio_summary.min(), ratio_summary.median(), ratio_summary.mean(),
      ratio_summary.max())

In [None]:
# Outliers of the raw ratio using the 1.5 * IQR rule.

ratio_values = final_df['dem_to_rep_ratio']

# first and third quartiles (25th / 75th percentiles)
Q1, Q3 = np.percentile(ratio_values, [25, 75])
# interquartile range
dem_rep_iqr = Q3 - Q1
# fences: anything at or beyond these values is treated as an outlier
outliers_below_Q1 = Q1 - 1.5 * dem_rep_iqr
outliers_above_Q3 = Q3 + 1.5 * dem_rep_iqr

# outlier counties
# (final_df[final_df['dem_to_rep_ratio'] <= outliers_below_Q1]).sort_values(
#     by='dem_to_rep_ratio')
# Author's note: no counties fall below the lower fence.
above_upper_fence = ratio_values >= outliers_above_Q3
final_df[above_upper_fence].sort_values(by='dem_to_rep_ratio')

In [None]:
# Distribution of the raw ratio: histogram (left) and box plot (right).
f, axes = plt.subplots(1, 2)

# can't see the max value of 22...some more digging is needed
sns.distplot(final_df['dem_to_rep_ratio'], kde=False, norm_hist=True, ax=axes[0])

# data appears biased since democrats are in the numerator

# NOTE(review): the outcome is strongly right-skewed. The central limit
# theorem concerns the sampling distribution of estimates in large samples;
# it does not make the outcome or the residuals normal. This motivates the
# log transform explored in the next section.

# lots of outliers that are 1.5 IQR above the third quartile
# (trailing semicolon suppresses the Axes repr; a semicolon on its own line is
# not valid Python)
sns.boxplot(final_df['dem_to_rep_ratio'], orient='v', ax=axes[1]);

### Log Transforming

In [None]:
# Based on our summary statistics, our max is greater than our minimum by a magnitude of 1,000
# Let's see if log transforming does anything

f, axes = plt.subplots(1, 2)

log_ratio_values = np.log(final_df['dem_to_rep_ratio'])

sns.distplot(log_ratio_values, kde=False, norm_hist=True, ax=axes[0])

# outliers are on both sides of boxplot
# (trailing semicolon suppresses the Axes repr; a semicolon on its own line is
# not valid Python)
sns.boxplot(log_ratio_values, orient='v', ax=axes[1]);

In [None]:
final_df['log_ratio'] = np.log(final_df['dem_to_rep_ratio'])

final_df.head()

In [None]:
#the average log of the dem_to_rep_ratio
np.mean(np.log(final_df['dem_to_rep_ratio']))

In [None]:
# counties that are strongly Republican (1.5 IQR under the first quartile)
first_quartile_for_log = np.percentile(final_df['log_ratio'], 25)
third_quartile_for_log = np.percentile(final_df['log_ratio'], 75)
dem_rep_log_iqr = third_quartile_for_log - first_quartile_for_log

print("First Quartile: ", first_quartile_for_log)

log_very_rep = (final_df[final_df['log_ratio'] <= (
    first_quartile_for_log - 1.5*dem_rep_log_iqr)]).sort_values(
        by='log_ratio', ascending=True)

log_very_rep.head()

In [None]:
# counties that are strongly Democratic (1.5 IQR above the third quartile)
print("Third Quartile: ", third_quartile_for_log)

log_very_dem = (final_df[final_df['log_ratio'] >= (
    third_quartile_for_log + 1.5*dem_rep_log_iqr)]).sort_values(
        by='log_ratio', ascending=False)

log_very_dem.head()

In [None]:
## TO DO:
# Create another choropleth with the log-transformed ratio

## Feature Assessment

In [None]:
# Correlation matrix of the outcome (raw and log-transformed) and the six order offsets.

# Reading the plot (author's notes):
# - top right and bottom left have high correlation because the raw and log
#   ratios are transformations of each other
# - the order variables have virtually no correlation with the raw ratio
# - the order variables have low correlation [0.3, 0.5] with the log-transformed ratio
#   source: https://www.andrews.edu/~calkins/math/edrm611/edrm05.htm
# - we should probably stick with the log-transformed version

correlation_columns = [
    'dem_to_rep_ratio', 'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym', 'log_ratio'
]

# Trailing semicolon suppresses the Axes repr; a semicolon on its own line is
# not valid Python.
sns.heatmap(final_df[correlation_columns].corr(), annot=True);

In [None]:
# Candidate predictors: the six order offsets (days from first case to each order).
# indepdent_vars.columns

indepdent_vars = final_df[['stay at home', '>50 gatherings',
       '>500 gatherings', 'public schools', 'restaurant dine-in',
       'entertainment/gym']]

In [None]:
#a function to find each feature's variation inflation factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df, drop_cols = None):
    """Compute the variance inflation factor of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; every column is treated as a regressor.
    drop_cols : label or list of labels, optional
        Column(s) removed from ``df`` before the factors are computed.

    Returns
    -------
    pandas.DataFrame
        Two columns: "VIF Factor" and "features" (the column names), one row
        per remaining feature.
    """
    features = df if drop_cols is None else df.drop(columns = drop_cols)

    # NOTE(review): the design matrix is passed as-is, without an intercept
    # column. Confirm that this is the intended VIF definition.
    design = features.values
    factors = [
        variance_inflation_factor(design, position)
        for position in range(features.shape[1])
    ]

    return pd.DataFrame({"VIF Factor": factors, "features": features.columns})

In [None]:
indepdent_vars = final_df.iloc[:, 5:11]


calculate_vif(indepdent_vars)

In [None]:
calculate_vif(indepdent_vars, '>50 gatherings')

In [None]:
calculate_vif(indepdent_vars, ['>50 gatherings', 'restaurant dine-in'])

In [None]:
# Correlation matrix again, restricted to the outcome columns and the four
# features kept after the VIF check.
selected_correlation_columns = [
    'dem_to_rep_ratio', 'stay at home', '>500 gatherings', 'public schools',
    'entertainment/gym', 'log_ratio'
]

# Trailing semicolon suppresses the Axes repr; a semicolon on its own line is
# not valid Python.
sns.heatmap(final_df[selected_correlation_columns].corr(), annot=True);

## Making Our Model

In [None]:
# split the data set into a training set and test set

from sklearn.model_selection import train_test_split
np.random.seed(47)

X = final_df[[
    'stay at home', '>50 gatherings', '>500 gatherings', 'public schools',
    'restaurant dine-in', 'entertainment/gym'
]]
Y = pd.Series(final_df['dem_to_rep_ratio'])

# random_state pins the shuffle for this call, so the split no longer depends
# on the global NumPy RNG state or on which cells were run before this one.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=47)
#X_train, y_train

In [None]:
#first linear model
import sklearn.linear_model as lm

linear_model = lm.LinearRegression()

# Fit your linear model
linear_model.fit(X_train, Y_train)
Y_pred = linear_model.predict(X_train)

In [None]:
#scatterplot of training set of first linear model

import matplotlib.pyplot as plt
plt.scatter(Y_train, linear_model.predict(X_train), alpha=0.5)
plt.xlabel("Democratic to Republic Ratio of the Training Set")
plt.ylabel("Predicted Democratic to Republic Ratio of the Training Set")
plt.title(
    "Democratic to Republic Ratio vs Predicted Democratic to Republic Ratio of the Training Set"
)

In [None]:
#function to calculate RMSE
def rmse(actual, predicted):
    """Root mean squared error between two equally long sequences.

    Parameters
    ----------
    actual, predicted : array-like
        Observed and predicted values (lists, NumPy arrays or pandas Series).

    Returns
    -------
    float
        ``sqrt(mean((actual - predicted) ** 2))``.

    Notes
    -----
    Both inputs are converted to NumPy arrays first, so values are compared by
    position. Subtracting two pandas Series directly aligns them on their
    index labels, which silently yields NaN (or pairs up the wrong rows) when
    the indexes differ, e.g. after a shuffled train/test split.
    """
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((actual_values - predicted_values) ** 2))

In [None]:
# Training RMSE of the first model, in units of the raw Democrat-to-Republican ratio.
train_error = rmse(Y_train, Y_pred)

print("Training RMSE:", train_error) # eh this is OK...will making it log-linear help?

### With Feature Selection

In [None]:
# Second training and test sets, restricted to the four selected features.
X_select = final_df[[
    'stay at home', '>500 gatherings', 'public schools',
    'entertainment/gym'
]]
Y = pd.Series(final_df['dem_to_rep_ratio'])

# random_state pins the shuffle for this call, so the split is reproducible
# and does not depend on how much of the global RNG stream earlier cells used.
X_train_select, X_test_select, Y_train_select, Y_test_select = train_test_split(
    X_select, Y, test_size=0.1, random_state=47)


In [None]:
#second model
select_linear_model = lm.LinearRegression()

# Fit your linear model
select_linear_model.fit(X_train_select, Y_train_select)
Y_pred_select = select_linear_model.predict(X_train_select)

In [None]:
# error of our model
select_train_error = rmse(Y_train_select, Y_pred_select)

print("Selected Feature Training RMSE:", select_train_error)

## With the Log-Transformed Outcome

In [None]:
# Log of dem_to_rep_ratio as the new outcome, with the four selected features.
log_X = final_df[[
    'stay at home', '>500 gatherings', 'public schools',
    'entertainment/gym'
]]

log_Y = pd.Series(final_df['log_ratio'])

# 90:10 split since dataset is large.
# random_state pins the shuffle for this call, so the split is reproducible
# and does not depend on how much of the global RNG stream earlier cells used.
log_X_train, log_X_test, log_Y_train, log_Y_test = train_test_split(
    log_X, log_Y, test_size=0.1, random_state=47)
#X_train, y_train

log_X

In [None]:
#third model
log_linear_model = lm.LinearRegression()

# Fit your linear model
log_linear_model.fit(log_X_train, log_Y_train)
log_Y_pred = log_linear_model.predict(log_X_train)

In [None]:
#error of the third model
log_train_error = rmse(log_Y_train, log_Y_pred)

print("Log Training RMSE:", log_train_error)

In [None]:
# Side-by-side training errors of the three linear models.
# NOTE(review): the first two errors are in raw-ratio units while the third is
# on the log scale, so the numbers are not directly comparable. The author
# marked the log model as the best one.
rmse_report = [
    ("Training RMSE:", train_error),
    ("Selected Feature Training RMSE:", select_train_error),
    ("Log Training RMSE:", log_train_error),
]
for rmse_label, rmse_value in rmse_report:
    print(rmse_label, rmse_value)

In [None]:
# Observed vs. predicted values of the third (log-linear) model on the training set.
# Both axes show the log-transformed ratio (log_Y_train / log_Y_pred), so the
# labels say so; they previously described the raw ratio.
plt.scatter(log_Y_train, log_Y_pred, alpha=0.5)
plt.xlabel("log(Democratic to Republican Ratio) of the Training Set")
plt.ylabel("Predicted log(Democratic to Republican Ratio) of the Training Set")
# Trailing semicolon suppresses the Text repr; a semicolon on its own line is
# not valid Python.
plt.title(
    "log(Democratic to Republican Ratio) vs Predicted log(Democratic to Republican Ratio) of the Training Set"
);

## Predicting Political Party Majority

In [None]:
# Binary outcome: Republican majority = 1 (ratio < 1), Democratic majority = 0.
# A single boolean expression assigns a label to every row. The earlier
# two-step assignment left NaN for a ratio of exactly 1, which made the int32
# cast fail. Such exact ties are labelled 0 here.
final_df['majority'] = (final_df['dem_to_rep_ratio'] < 1).astype('int32')

In [None]:
# Reorder the columns: identifiers, the three outcome columns, then the
# first-case date and the order offsets.
ordered_columns = [
    'countyFIPS', 'CountyName', 'StateName',
    'dem_to_rep_ratio', 'log_ratio', 'majority',
    'Date of First Case',
    'stay at home', '>50 gatherings', '>500 gatherings',
    'public schools', 'restaurant dine-in', 'entertainment/gym',
]
final_df = final_df.loc[:, ordered_columns]

final_df.head()

In [None]:
# Training and test sets for the majority classifier (four selected features).
logm_X = final_df[[
    'stay at home', '>500 gatherings', 'public schools',
    'entertainment/gym'
]]

logm_Y = pd.Series(final_df['majority'])


# 90:10 split since dataset is large.
# random_state pins the shuffle for this call, so the split is reproducible
# and does not depend on how much of the global RNG stream earlier cells used.
logm_X_train, logm_X_test, logm_Y_train, logm_Y_test = train_test_split(
    logm_X, logm_Y, test_size=0.1, random_state=47)


In [None]:
#logistic regression
log_model = lm.LogisticRegression()

log_model = log_model.fit(logm_X_train, logm_Y_train)

logm_Y_pred = log_model.predict(logm_X_train)

In [None]:
#accuracy of logistic model
logisticm_training_accuracy = sum(logm_Y_pred == logm_Y_train)/len(logm_X_train)

print("Logistic Model Training Accuracy:", logisticm_training_accuracy)


## Validating Our Model with the Test Data
How well did we do?

### Log-Linear Model

In [None]:
test_error = rmse(log_Y_test, log_linear_model.predict(log_X_test))
print("Test RMSE:", test_error) # want lower error

In [None]:
print(log_linear_model.intercept_)  
print(log_linear_model.coef_)

In [None]:
# Write out the fitted log-linear model as an equation.
# The coefficients follow the column order of log_X.
m1, m2, m3, m4 = log_linear_model.coef_
b = log_linear_model.intercept_
# The left-hand side is log(dem_to_rep_ratio): the model was fitted on the
# log-transformed outcome, not on the raw ratio.
print('log(dem_to_rep_ratio) = {0} + {1} * stay at home + {2} * >500 gatherings + {3} * public schools + {4} * entertainment/gym'.format(round(b, 3), round(m1, 3), round(m2, 3), round(m3, 3), round(m4, 3)))

In [None]:
# Render the fitted log-linear equation as typeset math.
# Math() wraps the expression in math delimiters; Latex() expects the caller
# to supply delimiters, so the bare expression would not be typeset as math.
display(
    Math(
        r'\text{{log}}(\texttt{{dem\_to\_rep\_ratio}}) = {0} + {1} * \texttt{{stay at home}} + {2} * \texttt{{>500 gatherings}} + {3} * \texttt{{public schools}} + {4} * \texttt{{entertainment/gym}}'.format(round(b, 3), round(m1, 3), round(m2, 3), round(m3, 3), round(m4, 3))
    )
)

### Logistic Model

In [None]:
# Observed vs. predicted log ratio on the test set of the log-linear model.
# The observed values (log_Y_test) are paired with predictions from the same
# model on the same split (log_linear_model on log_X_test). The earlier
# version plotted them against the logistic classifier's labels for
# logm_X_test, i.e. a different model evaluated on a different split.
plt.scatter(log_Y_test, log_linear_model.predict(log_X_test), alpha=0.5)
plt.xlabel("log(Democratic to Republican Ratio)")
plt.ylabel("Predicted log(Democratic to Republican Ratio)")
plt.title(
    "log(Democratic to Republican Ratio) vs Predicted log(Democratic to Republican Ratio) of Test Set Using a Log-Linear Model"
)

In [None]:
# Score the logistic model on its held-out test split.
logm_accuracy = log_model.score(logm_X_test, logm_Y_test)
print("Test Accuracy:", logm_accuracy) # want higher accuracy


In [None]:
# Fitted intercept and coefficients of the logistic model.
print(log_model.intercept_)  
print(log_model.coef_)

In [None]:
# Rebind m1..m4 and b to the logistic model's parameters (they previously held
# the log-linear model's values); the coefficients follow the column order of logm_X.
m1, m2, m3, m4 = log_model.coef_[0]
b = log_model.intercept_[0]

In [None]:
# Render the fitted logistic equation as typeset math.
# Math() wraps the expression in math delimiters; Latex() expects the caller
# to supply delimiters, so the bare expression would not be typeset as math.
display(
    Math(
        r'\text{{logit}}(\texttt{{majority}}) = {0} + {1} * \texttt{{stay at home}} + {2} * \texttt{{>500 gatherings}} + {3} * \texttt{{public schools}} + {4} * \texttt{{entertainment/gym}}'.format(round(b, 3), round(m1, 3), round(m2, 3), round(m3, 3), round(m4, 3))
    )
)

## Data Visualizations
### COVID-19 Landscape

In [None]:
# Download the county-boundary GeoJSON used by the choropleth maps below.
# NOTE: this rebinds `counties`, which held the county DataFrame earlier in
# the notebook; from here on `counties` is the parsed GeoJSON object.
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

In [None]:
import plotly.express as px

# Colour counties by log population so that small and large counties share one scale.
df['log_pop'] = np.log(df['PopulationEstimate2018'])

# map centre used for the initial view
pop_map_center = {"lat": 37.0902, "lon": -95.7129}

fig_pop = px.choropleth_mapbox(
    df,
    geojson=counties,
    locations='countyFIPS',
    color='log_pop',
    color_continuous_scale="Viridis",
    range_color=(0, 13),
    mapbox_style="white-bg",
    hover_name=df['CountyName'],
    zoom=3,
    center=pop_map_center,
    opacity=0.5,
    labels={'log_pop': 'log(Population)'},
    title='Population of US Counties in 2018',
)

# remove the right, left and bottom margins around the map
fig_pop.update_layout(margin=dict(r=0, l=0, b=0))
fig_pop.show()

In [None]:
# Choropleth of the log-transformed Democrat-to-Republican ratio, with a
# reversed blue-red scale clipped to [-4, 4].
party_map_center = {"lat": 37.0902, "lon": -95.7129}

fig_log_party = px.choropleth_mapbox(
    final_df,
    geojson=counties,
    locations='countyFIPS',
    color='log_ratio',
    color_continuous_scale="bluered_r",
    range_color=(-4, 4),
    mapbox_style="white-bg",
    hover_name=final_df['CountyName'],
    zoom=3,
    center=party_map_center,
    opacity=0.5,
    labels={'log_ratio': 'log(Dem:Rep Ratio)'},
    title='log(Democrat to Republican Ratio) of US Counties in 2016 Election',
)

# remove the right, left and bottom margins around the map
fig_log_party.update_layout(margin=dict(r=0, l=0, b=0))
fig_log_party.show()

In [None]:



# Choropleth of the majority party. The 0/1 codes are replaced by party names
# so that the legend is categorical (blue = Democratic, red = Republican).
majority_labels = {'majority': {0: "Democratic", 1: "Republican"}}
majority_map_center = {"lat": 37.0902, "lon": -95.7129}

fig_majority = px.choropleth_mapbox(
    final_df.replace(majority_labels),
    geojson=counties,
    locations='countyFIPS',
    color='majority',
    # color_continuous_scale="bluered",
    color_discrete_map={"Democratic": "#0000ff", "Republican": "#ff0000"},
    # range_color=(0, 1),
    mapbox_style="white-bg",
    hover_name=final_df['CountyName'],
    zoom=3,
    center=majority_map_center,
    opacity=0.5,
    # labels={'log_ratio':'log(Dem:Rep Ratio)'},
    title='Political Party Majority of US Counties in 2016 Election',
)

# remove the right, left and bottom margins and keep the legend visible
fig_majority.update_layout(
    margin=dict(r=0, l=0, b=0),
    coloraxis_showscale=True,
    showlegend=True,
)
fig_majority.show()