In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from typing import List

MYDIR = "./../../ResearchProposal/bea_gov/gdp/"

print(os.path)
myFiles = os.listdir(MYDIR)
print(myFiles)

print("Done")

myFile = MYDIR + "gdp_by_county_2015_2018.xlsx"

<module 'ntpath' from 'C:\\Users\\alexg\\Anaconda3\\lib\\ntpath.py'>
['gdb_by_county_2015_2018.pdf', 'gdpind419.pdf', 'gdpind419.xlsx', 'gdp_by_county_2015_2018.pdf', 'gdp_by_county_2015_2018.xlsx', 'gdp_ready_to_analyze.csv', 'gdp_ready_to_analyze.hyper', '~$gdp_by_county_2015_2018.xlsx']
Done


In [2]:
"""
Create column for states.
The way to identify state is: it is the first row after NaN in "aggregation_level"
It is followed by all its counties.
Between the last county of the state and the next state, there is another NaN
"""
def setState(myDF: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    get state into a separate column
    """
    myDF["state"] = ""

    states = myDF[myDF.aggregation_level == "nan"]
    if verbose:
        print(states.index)
    for rr in list(states.index): #range((len(myDF)-1)):
        if verbose:
            print(f"""{rr}: {myDF.loc[rr + 1]["aggregation_level"]}""")
            
        myDF.at[rr + 1, "state"] = myDF.loc[rr + 1]["aggregation_level"]

    for rr in range(1, (len(myDF)-1)):
        if rr - 1 in list(states.index):
            continue
        else:
            myDF.at[rr, "state"] = myDF.loc[rr-1, "state"]
    
    myDF.drop(states.index, inplace=True)
    
    myDF.at[0, "state"] = myDF.loc[0, "aggregation_level"]
    
    return myDF

"""
Split the GDP data from the gdpDF
"""
def splitOutGDPdata(gdpDF: pd.DataFrame,
                    verbose: bool = False) -> pd.DataFrame:

    
    cols1 = list(gdpDF.loc[0])
    cols2 = list(gdpDF.loc[1])
    cols3 = [str(cc).replace(".0", "") for cc in list(gdpDF.loc[2])]

    """
    We are going to handle this as two dataframes: gdp and percent change
    We are not interested in percent change; only get the gdp columns
    """
    cols = list(gdpDF.columns)
    rank_cols = [cc for cc in range(len(cols2)) if "Rank" in str(cols2[cc]) ]
    rank_cols

    gdp_cols = cols[:rank_cols[0]]

    if verbose:
        print(cols1)
        print(cols2)
        print(cols3)
        print(gdp_cols)

    """
    Separate gdp from percent change
    """
    gdp = gdpDF[gdp_cols].copy()

    
    """
    Handle column names
    """
    gdp_col_names = [str(cols1[cc]) + "_" + str(cols2[cc]) + "_" + str(cols3[cc]) for cc in range(len(gdp_cols))]
    gdp.columns = gdp_col_names
    gdp.drop(range(3), inplace=True)
    gdp.rename(columns={"nan_nan_nan": "aggregation_level", 
                        "Real Gross Domestic Product_Thousands of chained (2012) dollars_2015": "nan_nan_2015"},
               inplace=True)
    gdp.columns = [cc.replace("nan_nan_", "real_gdp_2012usd_") for cc in gdp.columns]
    
    """
    Set type for "aggregation_level" to "str"
    """
    gdp["aggregation_level"] = gdp["aggregation_level"].astype("str")
    gdp.reset_index(drop=True, inplace=True)
        
    """
    get state into a separate column
    """
    gdpfinal = setState(gdp.copy(), verbose=verbose)
    gdpfinal.dropna(subset=["real_gdp_2012usd_2015"], inplace=True)

    """
    get log of the gdp numbers
    """
    for cc in gdpfinal.columns:
        if "real_gdp_" in cc:
            print(cc)
            gdpfinal[cc] = gdpfinal[cc].astype("float64")
            gdpfinal[cc + "_log"] = np.log10(gdpfinal[cc])


    return gdpfinal

In [3]:
gdpDF = pd.read_excel(myFile, sheet_name="Table 1", na_values="", keep_default_na=False)

num_rows = len(gdpDF)
print(num_rows)
gdpDF.head()

cols = list(gdpDF.columns)
print(cols[0])
gdpDF.rename(columns={cols[0]: "aggregation_level"}, inplace=True)
gdpDF.head()

gdpDF.tail()

3222
Table 1. Real Gross Domestic Product by County, 2015 - 2018


Unnamed: 0,aggregation_level,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
3217,Uinta,981530.0,938876.0,904458.0,906587.0,13.0,-4.3,-3.7,0.2,16.0
3218,Washakie,405382.0,366053.0,353860.0,358104.0,19.0,-9.7,-3.3,1.2,13.0
3219,Weston,387259.0,321664.0,306033.0,315885.0,20.0,-16.9,-4.9,3.2,5.0
3220,1. Virginia combination areas consist of one o...,,,,,,,,,
3221,Source: U.S. Bureau of Economic Analysis,,,,,,,,,


In [4]:
gdpDF.head()

Unnamed: 0,aggregation_level,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,Real Gross Domestic Product,,,,,Percent change from preceding period,,,
1,,Thousands of chained (2012) dollars,,,,Rank in State,Percent change,,,Rank in State
2,,2015,2016.0,2017.0,2018.0,2018,2016,2017.0,2018.0,2018
3,United States,17403843000,17688890000.0,18108080000.0,18638160000.0,--,1.6,2.4,2.9,--
4,,,,,,,,,,


In [5]:
gdpFinal = splitOutGDPdata(gdpDF, verbose=False)

real_gdp_2012usd_2015
real_gdp_2012usd_2016
real_gdp_2012usd_2017
real_gdp_2012usd_2018


In [6]:
gdpFinal.tail()

Unnamed: 0,aggregation_level,real_gdp_2012usd_2015,real_gdp_2012usd_2016,real_gdp_2012usd_2017,real_gdp_2012usd_2018,state,real_gdp_2012usd_2015_log,real_gdp_2012usd_2016_log,real_gdp_2012usd_2017_log,real_gdp_2012usd_2018_log
3212,Sweetwater,4037540.0,3961894.0,3959441.0,3836603.0,Wyoming,6.606117,6.597903,6.597634,6.583947
3213,Teton,1980818.0,2019964.0,2078477.0,2166420.0,Wyoming,6.296845,6.305344,6.317745,6.335743
3214,Uinta,981530.0,938876.0,904458.0,906587.0,Wyoming,5.991904,5.972608,5.956388,5.957409
3215,Washakie,405382.0,366053.0,353860.0,358104.0,Wyoming,5.607864,5.563544,5.548831,5.554009
3216,Weston,387259.0,321664.0,306033.0,315885.0,Wyoming,5.588002,5.507402,5.485768,5.499529


In [7]:
gdpFinal.head()

Unnamed: 0,aggregation_level,real_gdp_2012usd_2015,real_gdp_2012usd_2016,real_gdp_2012usd_2017,real_gdp_2012usd_2018,state,real_gdp_2012usd_2015_log,real_gdp_2012usd_2016_log,real_gdp_2012usd_2017_log,real_gdp_2012usd_2018_log
0,United States,17403840000.0,17688890000.0,18108080000.0,18638160000.0,United States,10.240645,10.247701,10.257872,10.270403
2,Alabama,189339100.0,190703000.0,193023900.0,198435900.0,Alabama,8.27724,8.280358,8.285611,8.29762
3,Autauga,1518409.0,1526310.0,1422078.0,1483414.0,Alabama,6.181389,6.183643,6.152923,6.171262
4,Baldwin,5345363.0,5496105.0,5525882.0,5774289.0,Alabama,6.727977,6.740055,6.742402,6.761499
5,Barbour,756590.0,742810.0,745625.0,787425.0,Alabama,5.878861,5.870878,5.87252,5.896209


In [8]:
gdpCSV = MYDIR + "gdp_ready_to_analyze.csv"
gdpFinal.to_csv(gdpCSV, index=False)
print(f"""Saved gdpFinal ({len(gdpFinal)} rows) to {gdpCSV}""")

Saved gdpFinal (3164 rows) to ./../../ResearchProposal/bea_gov/gdp/gdp_ready_to_analyze.csv


In [9]:
# def plotData(myEd: pd.DataFrame, logY: bool = False):
    
#     for gg in myEd.aggr_cat.unique():
#         print(gg)
#         ggEd = myEd.loc[dfEd.aggr_cat == gg].copy()
#         ggEd.reset_index(drop=True, inplace=True)
#         ax = ggEd.plot(x="category", lw=2.0, figsize=(15, 8))
#         ax.set_title(gg, fontsize=14)
#         ax.set_xticks(ggEd.index)
#         ax.set_xticklabels(list(ggEd.category), rotation=45, fontsize=14)
#         ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, 
#                                                              loc: 
#                                                              "{:,}".format(int(x))))
#         if logY:
#             ax.set_yscale("log")
#             ax.set_ylabel("(Log scale)")
    
#         plt.yticks(fontsize=14)
#         plt.legend(fontsize=14)
#         plt.show()