Bringing together GDP and Personal Income by US County

In [6]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid
from sklearn import decomposition
import statsmodels.regression.linear_model as lm
from sklearn.ensemble import RandomForestRegressor 


from typing import List, Tuple, Dict

# Root folder under which every input CSV of this notebook lives.
MYDIR = "./../../ResearchProposal/"

# Directory listing of the root folder (handy for interactive inspection).
myFiles = os.listdir(MYDIR)

# Individual input files, each expressed relative to MYDIR.
gdpFile = f"{MYDIR}bea_gov/gdp/gdp_ready_to_analyze.csv"
piFile = f"{MYDIR}bea_gov/personal_income/personal_income_ready_to_analyze.csv"
hhiFile = f"{MYDIR}income_inequality/census_income_by_county/hh_income__census_data.csv"
populationFile = f"{MYDIR}population_dynamics/census_population_data_2010_2019.csv"
suicideFile = f"{MYDIR}suicide/multiple_causes_of_death__suicide.csv"
employmentFile = f"{MYDIR}unemployment/employment_by_county_state_year.csv"
stateAbbrevFile = f"{MYDIR}state_abbreviations.csv"

# Short key -> file path; used as the default file mapping of mobilityHelpers.
myfiles = dict(
    gdp=gdpFile,
    pi=piFile,
    hhi=hhiFile,
    pop=populationFile,
    sc=suicideFile,
    emp=employmentFile,
)

In [4]:
class mobilityHelpers:
    def __init__(self, 
                 myFiles: Dict[str, str] = myfiles,
                 ctv_cutoff: float = 0.055):
        """Store the configuration and load the state-abbreviation table.

        Args:
            myFiles: Mapping from a short data-set key (the default mapping
                uses "gdp", "pi", "hhi", "pop", "sc" and "emp") to the path
                of the CSV file for that data set.
            ctv_cutoff: Value stored on the instance as ``mCTVcutoff``.
        """
        self.mCTVcutoff = ctv_cutoff
        # Store the mapping that was actually passed in.  The module-level
        # ``myfiles`` is only the parameter's default, so a caller-supplied
        # mapping must not be silently replaced by it.
        self.mFiles = myFiles

        # Read from the module-level ``stateAbbrevFile`` path; getEmpData
        # merges against this table.
        self.mStateAbbreviationsDF = pd.read_csv(stateAbbrevFile)
        
    
    def getPopulationDynamicsData(self, verbose: bool = False) -> Tuple[pd.DataFrame]:
        
        popDFraw = pd.read_csv(self.mFiles["pop"], encoding = "ISO-8859-1")
        cols = list(popDFraw.columns)
        cols2keep = [cc for cc in cols if \
                          (cc not in ("SUMLEV", "REGION", "DIVISION", "STATE", "COUNTY")) and \
                          ('GQESTIMATE' not in cc) and \
                          ('RESIDUAL' not in cc)]
        cols2keep
        popDFraw = popDFraw[cols2keep].copy()
        popDF = popDFraw[popDFraw["CTYNAME"] != popDFraw["STNAME"]].copy()
        popDF["county"] = popDF["CTYNAME"].str.replace(" County", "")
        popDF["county_state"] = popDF["county"] + ", " + popDF["STNAME"]
        popDF.rename(columns={"STNAME": "state"}, inplace=True)
        del popDF["CTYNAME"]

        popDFlong = self.meltAndReplaceYear(popDF, 
                                            ["state", "county", "county_state"],
                                            verbose=verbose
                                           )
                                           
        return popDF, popDFlong
    
    def getEmpData(self, verbose: bool = False) -> Tuple[pd.DataFrame]:
        if verbose:
            print(f"""Getting employment data from {employmentFile}""")
        empDFRaw = pd.read_csv(employmentFile)
        empDFRaw[["county", "st"]] = empDFRaw["county_state"].str.split(", ", 
                                                                        expand=True)
        
        empDFRaw.rename(columns={"county_state": "county_st"}, inplace=True)
        empDF = empDFRaw.merge(self.mStateAbbreviationsDF, on="st")

        empDF["county_state"] = empDF["county"] + ", " + empDF["state"]
        del empDF["county_st"]
        del empDF["st"]
        
        empDFlong = empDF.melt(id_vars=["county", "state", "county_state", "year"])
        empDFlong.rename(columns={"variable": "metric"}, inplace=True)
        
        empDFlong["metric"] = empDFlong.metric.str.upper()

        empDFlong["year"] = empDFlong["year"].astype("int64")
        
        return empDF, empDFlong
    
    def getGDPdata(self, verbose: bool = False) -> Tuple[pd.DataFrame]:
        gdpDFraw = pd.read_csv(gdpFile)
        cols = list(gdpDFraw.columns)
        for ii in range(len(gdpDFraw.columns)):
            cc = cols[ii]
            cols[ii] = cc.replace("real_gdp_2012usd_", "gdp")
        gdpDFraw.columns = cols
        gdpDF = gdpDFraw[gdpDFraw["aggregation_level"] != gdpDFraw["state"]]
        
        gdpDFlong = self.meltAndReplaceYear(gdpDF,
                                            idVars=["aggregation_level", "state"],
                                            verbose=verbose
                                           )
        gdpDFlong.rename(columns={"aggregation_level": "county"}, inplace=True)
        gdpDFlong["county_state"] = gdpDFlong["county"] + ", " + gdpDFlong["state"]
        gdpDFlong["metric"] = gdpDFlong["metric"].str.upper()
        
        return gdpDF, gdpDFlong
    
    def getHouseholdIncomeData(self) -> Tuple[pd.DataFrame]:
        hhiDF = pd.read_csv(hhiFile)[["county", "state", "county_state", "year",
                              "mean_to_median_household_income_ratio",
                              "Median_income__dollars", "Mean_income__dollars"]]
        hhiDF.columns = [cc.lower() for cc in hhiDF.columns]
        
        hhiDFlong = self.meltAndReplaceYear(hhiDF, ["county", "state", "county_state", "year"])
        
        hhiDFlong["year"] = hhiDFlong["year"].astype("int64")
        return hhiDF, hhiDFlong
    
    def getSuicideRateData(self) -> Tuple[pd.DataFrame]:
        
        scDF = pd.read_csv(suicideFile)[["state", "county", "county_state", "year",
                                         "age_adjusted_rate", "population", "deaths", "death_rate"]]
        scDF.columns = [cc.lower() for cc in scDF.columns]
        scDF.rename(columns={"age_adjusted_rate": "SCRATE"}, inplace=True)
        scDF.rename(columns={"population": "POP_CDC"}, inplace=True)
        scDF.rename(columns={"deaths": "SCDEATHS"}, inplace=True)
        scDF.rename(columns={"death_rate": "SC_R_DEATH"}, inplace=True)
        scDFlong = scDF.melt(id_vars=["county", "state", "county_state", "year"])
        
        scDFlong.rename(columns={"variable": "metric"}, inplace=True)

        scDFlong["year"] = scDFlong["year"].astype("int64")
        
        return scDF, scDFlong
    
    
    def meltAndReplaceYear(self, 
                           myDF: pd.DataFrame,
                           idVars: List[str],
                           verbose: bool = False,
                          ) -> pd.DataFrame:
        """Melt a wide frame and split each column name into metric and year.

        Every non-id column name is split into its digits (the year) and
        its non-digit remainder (the metric); a name such as ``POP2010``
        becomes metric ``POP`` and year ``2010``.

        Args:
            myDF: Wide frame to melt.
            idVars: Columns kept as identifiers.  If ``"year"`` is among
                them, no ``year`` column is derived from the column names.
            verbose: When true, print the head of the melted frame.

        Returns:
            The melted frame with the id columns, ``value``, ``metric`` and,
            unless ``"year"`` was an id column, an int64 ``year`` column.

        Raises:
            ValueError: If a year has to be derived from a column name that
                contains no digits.
        """
        myDFlong = myDF.melt(id_vars=idVars)
        if verbose:
            print(myDFlong.head())

        # The patterns are regular expressions.  regex=True is required
        # explicitly: pandas >= 2.0 treats the pattern as a literal string
        # by default, which would leave the column names untouched.
        if "year" not in idVars:
            myDFlong["year"] = (myDFlong["variable"]
                                .str.replace(r"\D+", "", regex=True)
                                .astype("int64"))

        myDFlong["metric"] = myDFlong["variable"].str.replace(r"\d+", "", regex=True)

        del myDFlong["variable"]

        return myDFlong

    """Copied from https://stackoverflow.com/questions/7404116/defining-the-midpoint-of-a-colormap-in-matplotlib"""
    def shiftedColorMap(self, cmap, 
                        start=0, midpoint=0.5, stop=1.0, name='shiftedcmap'):
        '''
        Function to offset the "center" of a colormap. Useful for
        data with a negative min and positive max and you want the
        middle of the colormap's dynamic range to be at zero.

        Adapted from
        https://stackoverflow.com/questions/7404116/defining-the-midpoint-of-a-colormap-in-matplotlib

        Input
        -----
          cmap : The matplotlib colormap to be altered
          start : Offset from lowest point in the colormap's range.
              Defaults to 0.0 (no lower offset). Should be between
              0.0 and `midpoint`.
          midpoint : The new center of the colormap. Defaults to 
              0.5 (no shift). Should be between 0.0 and 1.0. In
              general, this should be  1 - vmax / (vmax + abs(vmin))
              For example if your data range from -15.0 to +5.0 and
              you want the center of the colormap at 0.0, `midpoint`
              should be set to  1 - 5/(5 + 15)) or 0.75
          stop : Offset from highest point in the colormap's range.
              Defaults to 1.0 (no upper offset). Should be between
              `midpoint` and 1.0.
          name : Name given to the new colormap and used to register it.

        Output
        ------
          The new LinearSegmentedColormap.  As a side effect it is
          registered with matplotlib under `name`, replacing any colormap
          previously registered under that name by this function.
        '''
        # regular index to compute the colors
        reg_index = np.linspace(start, stop, 257)

        # shifted index to match the data
        shift_index = np.hstack([
            np.linspace(0.0, midpoint, 128, endpoint=False), 
            np.linspace(midpoint, 1.0, 129, endpoint=True)
        ])

        cdict = {'red': [], 'green': [], 'blue': [], 'alpha': []}
        for ri, si in zip(reg_index, shift_index):
            r, g, b, a = cmap(ri)

            cdict['red'].append((si, r, r))
            cdict['green'].append((si, g, g))
            cdict['blue'].append((si, b, b))
            cdict['alpha'].append((si, a, a))

        newcmap = matplotlib.colors.LinearSegmentedColormap(name, cdict)

        # plt.register_cmap is deprecated and removed in recent matplotlib
        # releases; the colormap registry is its replacement.  force=True
        # lets repeated calls with the same name (as the plotting helpers
        # do) overwrite the earlier registration instead of raising.
        registry = getattr(matplotlib, "colormaps", None)
        if registry is not None:
            registry.register(newcmap, force=True)
        else:
            plt.register_cmap(cmap=newcmap)

        return newcmap
    
    def plotCorrMatrix(self, myDF: pd.DataFrame, title: str, method: str = "pearson"):
        """Compute the correlation matrix of ``myDF`` and display it as an image.

        Args:
            myDF: Frame whose column-by-column correlations are plotted.
            title: Title shown above the plot.
            method: Correlation method passed to ``DataFrame.corr``.
        """
        fig = plt.figure(figsize=(15, 12))
        colormap = self.shiftedColorMap(matplotlib.cm.PuOr,
                                        start=-1.0,
                                        midpoint=0.0,
                                        stop=1.0,
                                        name='shifted')
        correlations = myDF.corr(method=method)
        plt.matshow(correlations, cmap=colormap, fignum=fig.number)
        plt.tick_params(labelsize=14)

        # Colour bar with the same tick size as the axes.
        colorbar = plt.colorbar()
        colorbar.ax.tick_params(labelsize=14)

        plt.title(title, fontsize=16)
        plt.show()
    
    def plotGivenCorrMatrix(self, corrMatrix: pd.DataFrame, title: str, method: str = "pearson"):
        """Display an already computed correlation matrix as an image.

        Args:
            corrMatrix: Matrix to draw.
            title: Title shown above the plot.
            method: Accepted for signature parity with ``plotCorrMatrix``;
                it is not used here.
        """
        fig = plt.figure(figsize=(15, 12))
        colormap = self.shiftedColorMap(matplotlib.cm.PuOr,
                                        start=-1.0,
                                        midpoint=0.0,
                                        stop=1.0,
                                        name='shifted')
        plt.matshow(corrMatrix, cmap=colormap, fignum=fig.number)
        plt.tick_params(labelsize=14)

        # Colour bar with the same tick size as the axes.
        colorbar = plt.colorbar()
        colorbar.ax.tick_params(labelsize=14)

        plt.title(title, fontsize=16)
        plt.show()
    
    def plot2D(self, 
               gdpidData: pd.DataFrame, year: int,
               xVar_base: str,
               yVar_base: str,
               ):
        """Scatter-plot two columns of one year's rows, x on a log scale.

        The x column is ``xVar_base + str(year)``.  The y column is
        ``yVar_base + str(year)``, except when ``yVar_base`` is
        ``"mean_to_median_household_income_ratio"``, in which case that
        name is used as is and the y axis stays linear.

        Args:
            gdpidData: Frame with a ``year`` column and the two columns to
                plot.
            year: Year whose rows are plotted.
            xVar_base: Prefix of the x column name.
            yVar_base: Prefix (or full name, see above) of the y column.
        """
        yrData = gdpidData.loc[gdpidData["year"] == year].copy()

        isRatio = yVar_base == "mean_to_median_household_income_ratio"
        xVar = xVar_base + str(year)
        yVar = yVar_base if isRatio else yVar_base + str(year)

        ax = yrData.plot.scatter(x=xVar, y=yVar, figsize=(10, 8))
        ax.set_xscale("log")

        if isRatio:
            ax.set_title("Income Disparity and " + xVar_base + " for " + str(year), fontsize=14)
            ax.set_ylabel("Mean to Median Income Ratio", fontsize=14)
        else:
            ax.set_yscale("log")
            ax.set_title("Personal Income and " + xVar_base + " for " + str(year), fontsize=14)
            # Label the axis with the plotted column; the ratio label only
            # applies to the other branch.
            ax.set_ylabel(yVar, fontsize=14)

        ax.set_xlabel(xVar, fontsize=14)

        plt.show()
        
        
    def analyzeCorr4Migrations(self, 
                               tgt_correlated_features: pd.DataFrame, 
                               what: str = "NETMIG"):
        """Report which features correlate negatively/positively with ``what``.

        Rows whose ``feature_2`` equals ``what`` are selected and ordered by
        ascending ``correlation``.  The ``feature_1`` names with negative
        correlation are printed first, then those with positive correlation.

        Args:
            tgt_correlated_features: Frame with ``feature_1``, ``feature_2``
                and ``correlation`` columns.
            what: Value of ``feature_2`` to analyze.

        Returns:
            The selected rows sorted by ``correlation`` with a fresh
            0..n-1 index.
        """
        selected = tgt_correlated_features.loc[
            tgt_correlated_features.feature_2 == what
        ].copy().reset_index(drop=True)
        ranked = selected.sort_values("correlation")

        sections = (
            (" into US counties is negatively correlated with:\n",
             ranked["correlation"] < 0),
            (" into US counties is positively correlated with:\n",
             ranked["correlation"] > 0),
        )
        for heading, mask in sections:
            print("\n-------------------------------------------")
            print(what + heading)
            for featureName in list(ranked.loc[mask, "feature_1"]):
                print(featureName)

        return ranked.reset_index(drop=True)
    
    """
    Details in 
    https://stackoverflow.com/questions/31909945/obtain-eigen-values-and-vectors-from-sklearn-pca/31941631#31941631
    
    Apply PCA; select the primary principal components:
    (a) outliers in eigenvalues (default);
    (b) PCs with eigenvalues greater than a given tolerance % of max eigenvalue; 
    (c) or  with the lowest frequency on the histogram.
    
    Note: the outlier-based selection will only work if we have more than 10-15 PCs.  
    This method is sensitive to number of PCs.
    """
    def applyPCA(self, 
                 gdpiData: pd.DataFrame,
                 cols: List[str],
                 n_comps: int,
                 identify_outliers: bool = True,
                 beta_outliers: float = 1.5,
                 tolerance: float = -1.0,
                 remove_most_frequent: bool = False,
                 
                 verbose: bool = False,
                 showplots: bool = False) -> Tuple[pd.DataFrame, List[float]]:
        """Run PCA on ``cols`` and select the primary principal components.

        The selected columns are divided by their standard deviation, a
        PCA with ``n_comps`` components is fitted, and the transformed
        coordinates are written to ``gdpiData`` as ``pc_0`` ...
        ``pc_<n_comps-1>``.  Primary components are then chosen from the
        eigenvalues (explained variances):

        * ``identify_outliers`` true: eigenvalues at or above
          ``Q3 + beta_outliers * IQR``; a box plot of the eigenvalues is
          shown.
        * ``identify_outliers`` false: eigenvalues at or above ``Q3``.
        * ``tolerance > 0``: replaces the selection with the eigenvalues at
          or above ``tolerance * max(eigenvalues)``.
        * ``remove_most_frequent``: replaces the selection with the
          eigenvalues above the left edge of the fullest histogram bin.

        Args:
            gdpiData: Input frame; it is modified in place (``pc_*``
                columns are added).
            cols: Columns used as PCA input.
            n_comps: Number of principal components to compute.
            identify_outliers: Select eigenvalue outliers (see above).
            beta_outliers: IQR multiplier for the outlier bound.
            tolerance: Fraction of the largest eigenvalue; ignored unless
                positive.
            remove_most_frequent: Apply the histogram-based selection.
            verbose: Print progress text, eigenvectors and eigenvalues.
            showplots: Currently unused.

        Returns:
            ``(gdpiData, primary_components)`` where ``primary_components``
            is the list of selected eigenvalues.
        """
        if verbose:
            print(f"""We have {len(cols)} population-dynamics variables """
              f"""to use for modeling net migration into each county""")

            print(f"""We are going to reduce it by identifying the"""
                  f"""primary components.  We will start with {n_comps}""")

        # Normalize by StDev.  Dividing into a new frame leaves the input
        # columns untouched.
        data = gdpiData[cols] / np.std(gdpiData[cols], axis=0)

        pcaMdl = decomposition.PCA(n_components=n_comps)
        pcaMdl.fit(data)

        if verbose:
            print("Eigenvectors:")
            print(pcaMdl.components_)
            print("Eigenvalues:")
            print(pcaMdl.explained_variance_)

        eigenvalues = pcaMdl.explained_variance_
        transformed = pcaMdl.transform(data)

        # One output column per principal component.
        for cc in range(n_comps):
            gdpiData["pc_" + str(cc)] = [row[cc] for row in transformed]

        # The quartiles are needed by both selection branches below.
        qrtls = np.percentile(eigenvalues, q=(25.0, 75.0))

        if identify_outliers:
            print(f"""Of these {n_comps}, we will select """
                  f"""the EigenValue outliers, to avoid overfitting the model""")
            ax = plt.subplot()
            ax.boxplot(eigenvalues, vert=False)
            ax.set_title("Eigenvalues of the principal components", fontsize=14)
            plt.show()

            # Select primary components as outliers in Eigenvalues
            iqr = max(qrtls) - min(qrtls)
            high_bound = max(qrtls) + beta_outliers * iqr

            primary_components = [ee for ee in eigenvalues if ee >= high_bound]
            print(f"""In this case, primary components are ones """
                  f"""with eigenvalues >= {high_bound}""")
        else:
            primary_components = [ee for ee in eigenvalues if ee >= max(qrtls)]
            print(f"""In this case, primary components are ones """
                  f"""with eigenvalues >= {max(qrtls)}""")

        print(f"""We have {len(primary_components)} primary principal components:""")
        for ii, eigenvalue in enumerate(primary_components):
            print(f"""{ii}: {eigenvalue:.3f}""")

        if tolerance > 0:
            maxPC = max(eigenvalues)
            print(f"""Removing PCs with eigenvalues < {(100.0*tolerance): .3f}% of {maxPC}""")
            primary_components = [pc for pc in eigenvalues if pc >= tolerance * maxPC]

        if remove_most_frequent:
            ax = plt.subplot()
            myHist = ax.hist(eigenvalues)
            if verbose:
                print(myHist[0])
                print(myHist[1])
            counts = list(myHist[0])
            maxCountIndex = counts.index(max(counts))
            # Left edge of the bin holding the most eigenvalues.
            maxCountEigenvalue = myHist[1][maxCountIndex]

            primary_components = [pp for pp in eigenvalues if pp > maxCountEigenvalue]

        return gdpiData, list(primary_components)
    
    """
    Apply linear regression
    """
    def applyLinRegr(self, 
                     pcPopDF: pd.DataFrame, 
                     year: int,
                     primary_components: List[float],
                     verbose: bool = True,
                     showplots: bool = True,
                    ) -> pd.DataFrame:
        """Regress ``NETMIG<year>`` on the leading principal-component columns.

        The OLS formula is ``NETMIG<year> ~ pc_0 + ... + pc_<k-1>`` with
        ``k = len(primary_components)``.  The fitted values are stored in
        the column ``NETMIG<year>_LM`` of ``pcPopDF``.

        Args:
            pcPopDF: Frame holding the target and ``pc_*`` columns; it is
                modified in place.
            year: Year appended to ``NETMIG`` to form the target column.
            primary_components: Only its length is used (number of
                ``pc_*`` regressors).
            verbose: Print the regression summary.
            showplots: Show a scatter plot of target against prediction.

        Returns:
            ``pcPopDF`` with the prediction column added.
        """
        print(year)

        targetCol = "NETMIG" + str(year)
        predictionCol = targetCol + "_LM"
        regressors = ["pc_" + str(ii) for ii in range (len(primary_components))]
        frmla = targetCol + " ~ " + " + ".join(regressors)

        res = lm.OLS.from_formula(formula=frmla, data = pcPopDF).fit()
        if verbose:
            print(res.summary())

        pcPopDF[predictionCol] = res.predict()

        if not showplots:
            return pcPopDF

        ax = pcPopDF.plot.scatter(x=targetCol,
                                  y=predictionCol,
                                  figsize=(10, 6))
        ax.set_title(targetCol + " and linear regression prediction",
                     fontsize=16
                    )
        ax.set_xlabel(targetCol, fontsize=14)
        ax.set_ylabel(targetCol + " model prediction", fontsize=14)
        plt.show()

        return pcPopDF
    
    """
    Fit random forest regression to get the importance of each of the primary components
    """
    def fitRFR(self, 
               pcPopDF: pd.DataFrame, 
               xVars: List[str] = [],
               yVarBase: str = "NETMIG",
               year: int = 2010,
               ctv_cutoff: float = 0.055,
               n_rfr_trees: int = 100,
               verbose: bool = False,
              ) -> Tuple[List[str], pd.DataFrame]:
        """Fit a random forest and keep the features above an importance cutoff.

        The target column is ``yVarBase + str(year)``; the forest's
        predictions are stored in ``<target>_rfr_predict`` of ``pcPopDF``.

        Args:
            pcPopDF: Frame with the feature and target columns; it is
                modified in place.
            xVars: Feature column names.
            yVarBase: Target prefix; ``None`` or an empty value falls back
                to ``"NETMIG"``.
            year: Year appended to the target prefix.
            ctv_cutoff: A feature is kept when its importance is strictly
                greater than this value.
            n_rfr_trees: Number of trees in the forest.
            verbose: Print all importances (largest first), the kept
                features, and the sum of the kept importances.

        Returns:
            ``(kept feature names in xVars order, pcPopDF)``.
        """
        forest = RandomForestRegressor(n_estimators=n_rfr_trees, random_state = 0)

        # Fall back to the default target prefix when none was supplied.
        if yVarBase is None or len(yVarBase) == 0:
            yVarBase = "NETMIG"
        targetCol = yVarBase + str(year)

        features = pcPopDF[xVars]
        target = pcPopDF[targetCol]
        fitted = forest.fit(features, target)

        pcPopDF[targetCol + "_rfr_predict"] = forest.predict(pcPopDF[xVars])

        importances = list(fitted.feature_importances_)
        keptPositions = [pos for pos, weight in enumerate(importances)
                         if weight > ctv_cutoff]
        keptNames = [xVars[pos] for pos in keptPositions]

        if verbose:
            print(f"""Feature Importances (contributions to variance):""")

            # Feature name -> importance, listed from most to least important.
            byName = {}
            for label, weight in zip(xVars, importances):
                byName[label] = weight
            ranked = sorted(byName.items(), key=lambda pair: pair[1], reverse=True)

            for label, weight in ranked:
                print(f"""{label}: {weight: .3f}""")

            print(f"""Identified important features (CTV cutoff = {ctv_cutoff})\n""")

            for label, weight in ranked:
                if label in keptNames:
                    print(f"""{label}: {weight: .3f}""")

            print(f"""\nRFR R^2 with identified features: """
                  f"""{sum([importances[pos] for pos in keptPositions]): .3f}""")

        return keptNames, pcPopDF
    
    def applyLinearRegression(self,
                              pcPopDF: pd.DataFrame,
                              importantXs: List[str],
                              yVarBase: str = "NETMIG",
                              year: int = 2010,
                              verbose: bool = True,
                              showplots: bool = True,
                             ) -> pd.DataFrame:
        """Regress ``yVarBase + str(year)`` on the given feature columns.

        The OLS formula is ``<target> ~ x1 + x2 + ...`` built from
        ``importantXs``.  The fitted values are stored in the column
        ``<target>_LM`` of ``pcPopDF`` (modified in place).

        Args:
            pcPopDF: Frame holding the target and feature columns.
            importantXs: Names of the regressor columns.
            yVarBase: Prefix of the target column name.
            year: Year appended to ``yVarBase``.
            verbose: Print the regression summary.
            showplots: Show a scatter plot of target against prediction.
        """
        yVar = yVarBase + str(year)
        frmla = yVar + " ~ "
        print(frmla)

        rhs = " + ".join(importantXs)
        print(rhs)

        frmla += rhs
        print(frmla)

        # ``verbose`` and ``showplots`` are honoured as passed; they are no
        # longer forced to True inside the method.
        linMdl = lm.OLS.from_formula(formula=frmla, data = pcPopDF)
        res = linMdl.fit()

        if verbose:
            print(res.summary())

        pcPopDF[yVar + "_LM"] = res.predict()

        if showplots:
            ax = pcPopDF.plot.scatter(x=yVar, 
                                 y=yVar + "_LM", 
                                 figsize=(10, 6))
            ax.set_title(yVar + " and linear regression prediction",
                         fontsize=16
                        )
            ax.set_xlabel(yVar, fontsize=14)
            ax.set_ylabel(yVar + " model prediction", fontsize=14)
            plt.show()

        # NOTE(review): the signature promises a DataFrame but no return
        # statement is visible in this part of the file -- confirm whether
        # ``return pcPopDF`` follows or needs to be added.
        