In [1]:
# dependencies
# possibly more than needed
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import linregress
import chardet
# Stock reaction, printed in later cells after a load attempt fails.
mildProfanity = "Rats."

In [2]:
# 1. US Census 2010-2019
# Where the data file was originally downloaded from (handy for re-fetching it):
censusDataOrigURL = "https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv"

# Landing page with information about the file and its methodology:
censusDataReadMeURL = "https://www.census.gov/data/tables/time-series/demo/popest/2010s-counties-total.html"

# Local copy of the census CSV (a relative path, so it is resolved against the
# current working directory):
censusDataFilepath = "Resources/co-est2019-alldata.csv"

# Sanity check on the path: os.path.getsize raises FileNotFoundError when nothing
# lives there; otherwise the file size is reported in megabytes (MB).
print(
    f"The file at {censusDataFilepath} is "
    f"{round(os.path.getsize(censusDataFilepath)/1024/1024, 2)} megabytes (MB).\n"
    f"It came from:\n{censusDataOrigURL}\n"
    f"More info here:\n{censusDataReadMeURL}"
)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'Resources/co-est2019-alldata.csv'

In [3]:
# The census file is not UTF-8. With pandas' default encoding this read failed with
# "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 2: invalid continuation byte".
# Byte 0xf1 is the letter "ñ" in Latin-1 (ISO-8859-1), so decode the file as Latin-1.
# Latin-1 assigns a character to every possible byte value, so the decoding step
# itself cannot raise UnicodeDecodeError.
censusData = pd.read_csv(censusDataFilepath, encoding="latin-1")

FileNotFoundError: [Errno 2] File Resources/co-est2019-alldata.csv does not exist: 'Resources/co-est2019-alldata.csv'

In [4]:
# Preview the first rows of the census table.
# The earlier form, print(censusData).head(), can never work: print() returns None,
# so the chained .head() raises AttributeError even when censusData is loaded.
# As the last expression of the cell, the DataFrame is displayed by the notebook.
# (If the CSV could not be read into censusData, this line still raises
# "NameError: name 'censusData' is not defined".)
censusData.head()

NameError: name 'censusData' is not defined

In [None]:
# Character-encoding guess with chardet, adapted from
# https://krinkere.github.io/krinkersite/encoding_csv_file_python.html
# That recipe looks at only the first ten thousand bytes of the file. For this file
# such a sample is misleading: the byte that breaks decoding (0xf1) is reported at
# position 253967, far beyond the sample, so a guess based on the sample alone comes
# back as plain ASCII. Feed chardet the whole file instead, so that bytes past the
# start are taken into account. This takes longer, but it cannot be fooled by an
# all-ASCII beginning.

with open(censusDataFilepath, 'rb') as rawdata:
    result = chardet.detect(rawdata.read())
print(result)

# for more on encodings, see:
# https://docs.python.org/3/library/codecs.html#standard-encodings

In [None]:
# A chardet guess based on only the first 10,000 bytes of this file comes back as
# ASCII with 100% confidence, so ASCII is tempting to try. It fails, because the
# offending byte (0xf1) is reported at position 253967, well past that sample.
# Show the failure instead of letting it stop the notebook, then fall back to
# Latin-1 (ISO-8859-1), where byte 0xf1 is the letter "ñ".
try:
    censusData = pd.read_csv(censusDataFilepath, encoding="ascii")
except UnicodeDecodeError as asciiError:
    print(f"Reading as ASCII failed: {asciiError}")
    censusData = pd.read_csv(censusDataFilepath, encoding="latin-1")

# Preview the result (print(censusData).head() would fail: print() returns None).
censusData.head()

In [None]:
# But that errors out too:
# "UnicodeDecodeError: 'ascii' codec can't decode byte 0xf1 in position 253967: ordinal not in range(128)"

In [None]:
# React to the failed ASCII read of the census file.
print(mildProfanity)

In [None]:
# But let's keep trying:
# 2. COVID-19 cases
# Where to download the file yourself. This has to be the raw.githubusercontent.com
# address: the github.com/.../blob/... address serves GitHub's HTML viewer page, and
# saving that page as us-counties.csv leaves a file pandas cannot parse as a table.
caseDataOrigURL = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us-counties.csv"

# ...and this stores the URL of information about the file, as well as methodology:
caseDataReadMeURL = "https://github.com/nytimes/covid-19-data/blob/master/README.md"

# This sets the filepath where the COVID-19 case data .CSV lives locally...:
caseDataFilepath = "Resources/us-counties.csv"
# NOTE(review): if reading this file fails with "ParserError: Error tokenizing data",
# the local copy is probably a saved HTML page rather than the CSV. Open it in a text
# editor to confirm, then re-download it from caseDataOrigURL.

# ...and this tests that the filepath works,
# in that it finds a file of a certain size in megabytes (MB):
print(
    f"The file at {caseDataFilepath} is "
    f"{round(os.path.getsize(caseDataFilepath)/1024/1024, 2)} MB.\n"
    f"It came from:\n{caseDataOrigURL}\n"
    f"More info here:\n{caseDataReadMeURL}"
)

In [None]:
# Ask chardet (Krinkere's recipe) for its best guess at this file's encoding,
# judging from the first 10,000 bytes only.
with open(caseDataFilepath, mode='rb') as rawdata:
    sample = rawdata.read(10000)
result = chardet.detect(sample)
# Show the guessed encoding and chardet's confidence in it.
print(result)

In [None]:
# Krinkere's chardet is 75% confident this file is encoded as UTF-8.
# So we shouldn't have to specify encoding because UTF-8 is the default for pd.read_csv,
# but watch what happens when we try to read the file into a variable:
# caseData = pd.read_csv(caseDataFilepath)
# A spooky error:
# "ParserError: Error tokenizing data. C error: Expected 1 fields in line 70, saw 2"

In [None]:
print(mildProfanity)
# And, as before, printing the head of a variable whose contents couldn't be imported:
# print(caseData).head()
# returns "NameError: name 'caseData' is not defined"
# Note: even with caseData loaded, print(...) returns None, so a .head() chained onto
# it would raise AttributeError; the working form is caseData.head().

In [None]:
# There's a 25% chance Krinkere's chardet is wrong about UTF-8, so maybe try different encoding:
# caseData = pd.read_csv(caseDataFilepath, encoding = "ascii")
# This gives a different error:
# "UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 6723: ordinal not in range(128)"

In [None]:
# Maybe there's something to be gained from telling Python explicitly it's a file encoded as UTF-8:
# caseData = pd.read_csv(caseDataFilepath, encoding = "utf-8")
# But alas, that gives the same parser error as when we let Python assume it should use UTF-8 encoding:
# "ParserError: Error tokenizing data. C error: Expected 1 fields in line 70, saw 2"
# (A ParserError is about how many fields were found on a line, not about character encoding.)

In [None]:
# 3. mask-wearing survey

# Where to download the file yourself. This has to be the raw.githubusercontent.com
# address (pinned to the same commit): the github.com/.../blob/... address serves
# GitHub's HTML viewer page rather than the CSV itself.
maskWearingDataOrigURL = "https://raw.githubusercontent.com/nytimes/covid-19-data/bde13b021e99c6b4a63fb66a6144e889cc635e31/mask-use/mask-use-by-county.csv"

# ...and this stores the URL of information about the file, as well as methodology:
maskWearingDataReadMeURL = "https://github.com/nytimes/covid-19-data/blob/master/README.md"

# This sets the filepath where the mask-wearing survey .CSV lives locally...:
maskWearingDataFilepath = "Resources/mask-use-by-county.csv"

# ...and this tests that the filepath works,
# in that it finds a file of a certain size in megabytes (MB):
print(
    f"The file at {maskWearingDataFilepath} is "
    f"{round(os.path.getsize(maskWearingDataFilepath)/1024/1024, 2)} MB.\n"
    f"It came from:\n{maskWearingDataOrigURL}\n"
    f"More info here:\n{maskWearingDataReadMeURL}"
)

In [None]:
# Krinkere's chardet: guess the encoding from the first 10,000 bytes of the mask-use file.
# The path for this dataset is stored in maskWearingDataFilepath. The earlier spelling,
# maskWearingFilepath, is not assigned in any cell shown here and raised NameError.
with open(maskWearingDataFilepath, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

In [None]:
# Load the mask-wearing survey, decoding it as ISO-8859-1 (Latin-1).
# The path variable is maskWearingDataFilepath; maskWearingFilepath is not assigned
# in any cell shown here and raised NameError.
maskWearingData = pd.read_csv(maskWearingDataFilepath, encoding="iso-8859-1")

# Preview the first rows. print(maskWearingData).head() would fail because print()
# returns None; as the cell's last expression the DataFrame is displayed directly.
maskWearingData.head()