In [115]:
import pandas as pd
import plotly as ply
import glob
from collections import defaultdict
import re
from datetime import datetime

In [10]:
## apparently, pandas is not enough to read excel files
## had to install openpyxl also
## run only once
# !pip3 install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 2.0 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [None]:
## set global variables
yelp_cities = ['Montreal', 'Calgary', 'Toronto', 'Pittsburgh', 'Charlotte', 'Urbana-Champaign', 'Phoenix',
              'Las Vegas', 'Madison', 'Cleveland']
yelp_states = ['PA', 'NC', 'IL', 'AZ', 'NV', 'WI', 'OH']

In [65]:
# read in files using glob
dpath = "/Users/christinabrady/Documents/data/census_pulse_biz/*.xlsx"
fpaths = glob.glob(dpath)

# separate files in to Metropolitan Areas and Sectors because the include different data
msas = [fl for fl in fpaths if "msa" in fl]
sector = [fl for fl in fpaths if "empclass" in fl]

census_dates = {"start": [], 
               "end": []}

for fl in msas:
    import re
    dts = re.search(r"([0-9]{2}[A-Z][a-z].[0-9]{2})_([0-9]{2}[A-Z][a-z].[0-9]{2})", fl)
    census_dates["start"].append(dts.group(1))
    census_dates["end"].append(dts.group(2))
print(census_dates)

## convert end dates to datetime for plotting
enddates = [datetime.strptime(dt, "%d%b%y").date() for dt in census_dates["end"]]


{'start': ['17May20', '24May20', '26Apr20', '21Jun20', '10May20', '14Jun20', '07Jun20', '31May20', '03May20'], 'end': ['23May20', '30May20', '02May20', '27Jun20', '16May20', '20Jun20', '13Jun20', '06Jun20', '09May20']}


In [117]:
df = pd.read_excel(msas[0])
df = df.set_index(["MSA"])

In [81]:
df.head()

Unnamed: 0_level_0,CBSA_CODE,INSTRUMENT_ID,QUESTION,ANSWER_ID,ANSWER_TEXT,ESTIMATE_PERCENTAGE,SE
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Atlanta-Sandy Springs-Alpharetta, GA MSA",12060,1,"Overall, how has this business been affected b...",1,Large negative effect,45.3%,1.78%
"Atlanta-Sandy Springs-Alpharetta, GA MSA",12060,1,"Overall, how has this business been affected b...",2,Moderate negative effect,43.1%,1.67%
"Atlanta-Sandy Springs-Alpharetta, GA MSA",12060,1,"Overall, how has this business been affected b...",3,Little or no effect,8.4%,1.91%
"Atlanta-Sandy Springs-Alpharetta, GA MSA",12060,2,"In the last week, did this business experience...",1,"Yes, increased",14.7%,1.57%
"Atlanta-Sandy Springs-Alpharetta, GA MSA",12060,2,"In the last week, did this business experience...",2,"Yes, decreased",59.8%,3.09%


**There is no date in the data, so I'll have to parse it from the document title.**

Let's see what cities the Census MSA data set covers.

In [84]:
census_cities = df.index.unique()
print(census_cities)

Index(['Atlanta-Sandy Springs-Alpharetta, GA MSA',
       'Austin-Round Rock-Georgetown, TX MSA',
       'Baltimore-Columbia-Towson, MD MSA', 'Birmingham-Hoover, AL MSA',
       'Boston-Cambridge-Newton, MA-NH MSA', 'Buffalo-Cheektowaga, NY MSA',
       'Charlotte-Concord-Gastonia, NC-SC MSA',
       'Chicago-Naperville-Elgin, IL-IN-WI MSA', 'Cincinnati, OH-KY-IN MSA',
       'Cleveland-Elyria, OH MSA', 'Columbus, OH MSA',
       'Dallas-Fort Worth-Arlington, TX MSA', 'Denver-Aurora-Lakewood, CO MSA',
       'Detroit-Warren-Dearborn, MI MSA',
       'Hartford-East Hartford-Middletown, CT MSA',
       'Houston-The Woodlands-Sugar Land, TX MSA',
       'Indianapolis-Carmel-Anderson, IN MSA', 'Jacksonville, FL MSA',
       'Kansas City, MO-KS MSA', 'Las Vegas-Henderson-Paradise, NV MSA',
       'Los Angeles-Long Beach-Anaheim, CA MSA',
       'Louisville/Jefferson County, KY-IN MSA', 'Memphis, TN-MS-AR MSA',
       'Miami-Fort Lauderdale-Pompano Beach, FL MSA',
       'Milwaukee-Waukesha,

In [30]:
matches = defaultdict(list)
for ycity in yelp_cities:
    for ccity in census_cities:
        if ycity in ccity:
            matches[ycity].append(ccity)

In [89]:
print("There are", len(matches), "matching cities.")
print(matches.keys())
print(matches.values())

There are 5 matching cities.
dict_keys(['Pittsburgh', 'Charlotte', 'Phoenix', 'Las Vegas', 'Cleveland'])
dict_values([['Pittsburgh, PA MSA'], ['Charlotte-Concord-Gastonia, NC-SC MSA'], ['Phoenix-Mesa-Chandler, AZ MSA'], ['Las Vegas-Henderson-Paradise, NV MSA'], ['Cleveland-Elyria, OH MSA']])


What questions do we want to look at?

In [67]:
df.QUESTION.unique()

array(['Overall, how has this business been affected by the COVID-19 pandemic?',
       'In the last week, did this business experience a change in operating revenues/sales/receipts, not including any financial assistance or loans?',
       'In the last month, what were the total operating revenues/sales/receipts for this business, not including any financial assistance or loans?',
       'In the last week, did this business temporarily close any of its locations for at least one day?',
       'In the last week, did this business have a change in the number of paid employees?',
       'In the last week, did this business have a change in the total number of hours worked by paid employees?',
       'In the last week, did this business have disruptions in its supply chain?',
       'In the last week, did this business shift to the production of other goods or services?',
       "In the last week, did any of this business's locations adopt pickup/carry-out/delivery as their only means of 

In [111]:
flattened_ccities = [city for cities in matches.values() for city in cities]
city_interest_df = df.loc[flattened_ccities]
city_interest_df[city_interest_df.INSTRUMENT_ID == 1 & city_interest_df.ANSWER_ID.isin([1,2])]


Unnamed: 0_level_0,CBSA_CODE,INSTRUMENT_ID,QUESTION,ANSWER_ID,ANSWER_TEXT,ESTIMATE_PERCENTAGE,SE
MSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Pittsburgh, PA MSA",38300,1,"Overall, how has this business been affected b...",1,Large negative effect,47.4%,5.01%
"Pittsburgh, PA MSA",38300,1,"Overall, how has this business been affected b...",2,Moderate negative effect,47.1%,5.06%
"Charlotte-Concord-Gastonia, NC-SC MSA",16740,1,"Overall, how has this business been affected b...",1,Large negative effect,44.0%,5.20%
"Charlotte-Concord-Gastonia, NC-SC MSA",16740,1,"Overall, how has this business been affected b...",2,Moderate negative effect,41.4%,6.47%
"Phoenix-Mesa-Chandler, AZ MSA",38060,1,"Overall, how has this business been affected b...",1,Large negative effect,35.8%,3.07%
"Phoenix-Mesa-Chandler, AZ MSA",38060,1,"Overall, how has this business been affected b...",2,Moderate negative effect,47.5%,2.75%
"Las Vegas-Henderson-Paradise, NV MSA",29820,1,"Overall, how has this business been affected b...",1,Large negative effect,47.0%,5.08%
"Las Vegas-Henderson-Paradise, NV MSA",29820,1,"Overall, how has this business been affected b...",2,Moderate negative effect,40.7%,3.89%
"Cleveland-Elyria, OH MSA",17460,1,"Overall, how has this business been affected b...",1,Large negative effect,47.1%,4.49%
"Cleveland-Elyria, OH MSA",17460,1,"Overall, how has this business been affected b...",2,Moderate negative effect,42.8%,5.14%


In [None]:
### put it together and plot it
for fl in msas:
    tmpdf = read_excel(fl).columns.str.lower()
    