# 01) IMPORT RAW DATA

In [1]:
import math
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as sm

In [2]:
# Path to the lookup table describing the NAWQA Wall-to-Wall Anthropogenic
# Landuse Trends (NWALT) fields (from http://bit.ly/F7XW4J1JDataset8).
# The path is relative, so it is resolved against the current working directory.
strNWALT = 'F7XW4J1J (Anthropogenic Influences)/NWALT.csv'

In [3]:
# Load the NWALT lookup table into a DataFrame.
# NOTE(review): dfNWALT is not referenced by any later cell visible here —
# confirm it is still needed.
dfNWALT = pd.read_csv(strNWALT)

In [4]:
# Paths to the two source files (from http://bit.ly/F7XW4J1JDataset8):
#   strLand - land-use change table, 1982-2012
#   strPH   - change in mean pH levels, 1982-2012
# NOTE(review): strPH has no 'F7XW4J1J (Anthropogenic Influences)/' prefix,
# unlike strLand and strNWALT — confirm the file really lives next to the notebook.
strLand = 'F7XW4J1J (Anthropogenic Influences)/Land Use Change, 1982-2012.csv'
strPH = 'Change in Mean pH Levels 82-12.csv'

In [5]:
# Read both source CSVs (land use first, then pH) with pandas' default parser settings.
dfLand, dfPH = (
    pd.read_csv(strLand),
    pd.read_csv(strPH),
)

In [6]:
# Preview the first rows of the land-use frame to check its columns
# (a GEOID5 key followed by one "Chg-..." column per land-use class).
dfLand.head()

Unnamed: 0,GEOID5,Chg-Water,Chg-Wetlands,Chg-Dev-Trans,Chg-Dev-CommSvcs,Chg-Dev-IndMil,Chg-Dev-Recr,Chg-Dev-ResHi,Chg-Dev-ResLoMed,Chg-Dev-Other,...,Chg-SemiDev-UrbIntLoMed,Chg-SemiDev-Other,Chg-Mining,Chg-Crops,Chg-Pasture,Chg-Grazing1,Chg-Grazing2,Chg-LowUse,Chg-VeryLowUse,Chg-Unknown
0,4019,0.0,0.0,0.02,0.11,0.03,0.1,0.31,0.73,-0.08,...,2.08,0.0,0.03,-0.01,0.0,0.0,-0.18,-9.62,6.87,0.0
1,4017,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.06,-0.01,...,0.37,0.0,0.0,-0.01,0.01,0.0,-0.03,-0.42,0.0,0.0
2,4012,0.0,0.0,0.02,0.0,0.0,0.0,0.03,0.02,0.0,...,0.22,0.0,0.0,-0.06,0.07,0.0,-0.3,-10.82,10.8,0.0
3,4007,0.03,0.0,0.02,0.01,0.0,0.02,0.02,0.05,-0.01,...,0.15,0.0,-0.02,0.01,-0.02,0.0,0.0,-3.74,3.47,0.0
4,4027,0.0,0.0,0.03,0.07,0.03,0.03,0.23,0.11,-0.03,...,0.28,0.0,0.0,0.62,-0.37,-0.02,-0.17,-27.16,26.34,0.0


In [7]:
# Preview the first rows of the pH frame to check its columns
# (a "FIPS Code" key plus the 1982 and 2012 mean pH values and derived percentages).
dfPH.head()

Unnamed: 0,FIPS Code,1982 Mean pH Value,% Outside of Ideal (7.25),2012 Mean pH Value,% Outside of Ideal (7.25).1,% Difference,Change in % Points in Difference from Ideal
0,13065,4.082609,43.7%,3.927763,45.8%,-3.9%,2.1%
1,12003,5.422222,25.2%,4.393682,39.4%,-23.4%,14.2%
2,28073,5.71,21.2%,4.92,32.1%,-16.1%,10.9%
3,28035,6.5,10.3%,5.2,28.3%,-25.0%,17.9%
4,13179,7.49,-3.3%,5.527315,23.8%,-35.5%,27.1%


# 02) REFORMAT DATA

In [8]:
# Build a single rename map for dfLand:
#   - "GEOID5" becomes "FIPS Code", the key column name used by dfPH
#   - every "Chg..." column loses its hyphens ("Chg-Dev-Trans" -> "ChgDevTrans"),
#     presumably so the names can be written directly in a regression formula later
dictCol = {'GEOID5': 'FIPS Code'}
dictCol.update(
    {strOld: strOld.replace('-', '') for strOld in dfLand.filter(regex='Chg').columns}
)
dfLand = dfLand.rename(columns=dictCol)
dfLand.head()

Unnamed: 0,FIPS Code,ChgWater,ChgWetlands,ChgDevTrans,ChgDevCommSvcs,ChgDevIndMil,ChgDevRecr,ChgDevResHi,ChgDevResLoMed,ChgDevOther,...,ChgSemiDevUrbIntLoMed,ChgSemiDevOther,ChgMining,ChgCrops,ChgPasture,ChgGrazing1,ChgGrazing2,ChgLowUse,ChgVeryLowUse,ChgUnknown
0,4019,0.0,0.0,0.02,0.11,0.03,0.1,0.31,0.73,-0.08,...,2.08,0.0,0.03,-0.01,0.0,0.0,-0.18,-9.62,6.87,0.0
1,4017,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.06,-0.01,...,0.37,0.0,0.0,-0.01,0.01,0.0,-0.03,-0.42,0.0,0.0
2,4012,0.0,0.0,0.02,0.0,0.0,0.0,0.03,0.02,0.0,...,0.22,0.0,0.0,-0.06,0.07,0.0,-0.3,-10.82,10.8,0.0
3,4007,0.03,0.0,0.02,0.01,0.0,0.02,0.02,0.05,-0.01,...,0.15,0.0,-0.02,0.01,-0.02,0.0,0.0,-3.74,3.47,0.0
4,4027,0.0,0.0,0.03,0.07,0.03,0.03,0.23,0.11,-0.03,...,0.28,0.0,0.0,0.62,-0.37,-0.02,-0.17,-27.16,26.34,0.0


In [9]:
# Numeric "pH improvement" score: how much closer the 2012 mean pH sits to the
# ideal value than the 1982 mean did. Positive = moved toward the ideal,
# negative = moved away from it.
fltIdealPH = 7.25
srsGap1982 = abs(fltIdealPH - dfPH['1982 Mean pH Value'])
srsGap2012 = abs(fltIdealPH - dfPH['2012 Mean pH Value'])
dfPH['ChgPHQual'] = srsGap1982 - srsGap2012
dfPH.head()

Unnamed: 0,FIPS Code,1982 Mean pH Value,% Outside of Ideal (7.25),2012 Mean pH Value,% Outside of Ideal (7.25).1,% Difference,Change in % Points in Difference from Ideal,ChgPHQual
0,13065,4.082609,43.7%,3.927763,45.8%,-3.9%,2.1%,-0.154846
1,12003,5.422222,25.2%,4.393682,39.4%,-23.4%,14.2%,-1.02854
2,28073,5.71,21.2%,4.92,32.1%,-16.1%,10.9%,-0.79
3,28035,6.5,10.3%,5.2,28.3%,-25.0%,17.9%,-1.3
4,13179,7.49,-3.3%,5.527315,23.8%,-35.5%,27.1%,-1.482685


In [10]:
# Join land-use changes to pH changes by county. The join is an inner join
# (pd.merge's default, spelled out here), so counties present in only one of
# the two frames are dropped from dfMrg.
dfMrg = pd.merge(
    left=dfLand,
    right=dfPH,
    how='inner',
    on='FIPS Code',
)
dfMrg.head()

Unnamed: 0,FIPS Code,ChgWater,ChgWetlands,ChgDevTrans,ChgDevCommSvcs,ChgDevIndMil,ChgDevRecr,ChgDevResHi,ChgDevResLoMed,ChgDevOther,...,ChgLowUse,ChgVeryLowUse,ChgUnknown,1982 Mean pH Value,% Outside of Ideal (7.25),2012 Mean pH Value,% Outside of Ideal (7.25).1,% Difference,Change in % Points in Difference from Ideal,ChgPHQual
0,4019,0.0,0.0,0.02,0.11,0.03,0.1,0.31,0.73,-0.08,...,-9.62,6.87,0.0,7.702683,-6.2%,6.908015,4.7%,-11.5%,11.0%,0.110698
1,4017,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.06,-0.01,...,-0.42,0.0,0.0,8.691667,-19.9%,8.3,-14.5%,-4.7%,5.4%,0.391667
2,4012,0.0,0.0,0.02,0.0,0.0,0.0,0.03,0.02,0.0,...,-10.82,10.8,0.0,8.08,-11.4%,7.2748,-0.3%,-11.1%,11.1%,0.8052
3,4007,0.03,0.0,0.02,0.01,0.0,0.02,0.02,0.05,-0.01,...,-3.74,3.47,0.0,8.141752,-12.3%,8.173014,-12.7%,0.4%,-0.4%,-0.031262
4,4027,0.0,0.0,0.03,0.07,0.03,0.03,0.23,0.11,-0.03,...,-27.16,26.34,0.0,7.894585,-8.9%,7.697018,-6.2%,-2.6%,2.7%,0.197567


In [11]:
# Summary statistics for every numeric column of the merged frame; useful for
# spotting constant columns (std of 0) and implausible extremes before modelling.
dfMrg.describe()

Unnamed: 0,FIPS Code,ChgWater,ChgWetlands,ChgDevTrans,ChgDevCommSvcs,ChgDevIndMil,ChgDevRecr,ChgDevResHi,ChgDevResLoMed,ChgDevOther,...,ChgCrops,ChgPasture,ChgGrazing1,ChgGrazing2,ChgLowUse,ChgVeryLowUse,ChgUnknown,1982 Mean pH Value,2012 Mean pH Value,ChgPHQual
count,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,...,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0,1390.0
mean,28470.271223,0.042727,0.0,0.119165,0.217683,0.104245,0.111698,0.40036,0.806885,-0.194698,...,-0.413043,-0.847014,0.051489,-0.010007,-2.995583,0.507791,0.0,7.464941,7.647349,-0.018752
std,15694.842378,0.222042,0.0,0.234975,0.515088,0.206504,0.236787,1.153719,1.795725,0.411671,...,1.79174,1.74478,0.271467,0.627258,6.032287,2.76659,0.0,0.804258,0.596387,0.683955
min,1001.0,-0.07,0.0,0.0,0.0,0.0,0.0,-0.2,-4.11,-4.18,...,-10.52,-12.28,-3.57,-9.87,-56.42,-0.01,0.0,2.2,3.927763,-1.697755
25%,17004.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.01,-0.23,...,-0.99,-1.48,-0.01,-0.04,-3.8075,0.0,0.0,7.139174,7.338573,-0.36089
50%,29022.0,0.0,0.0,0.07,0.04,0.02,0.02,0.01,0.11,-0.01,...,-0.12,-0.405,0.01,0.0,-1.08,0.0,0.0,7.6,7.756498,-0.054157
75%,40116.5,0.02,0.0,0.14,0.17,0.1075,0.1,0.16,0.7675,0.01,...,0.32,0.01,0.09,0.08,-0.0925,0.0,0.0,7.915506,8.061421,0.237573
max,56041.0,5.5,0.0,7.12,6.94,1.88,2.61,10.21,25.78,0.53,...,7.32,6.99,2.74,2.67,7.66,56.35,0.0,21.162361,9.029167,13.731488


# 03) COMPUTE LAND USE-TO-pH CORRELATIONS

In [12]:
# Format strings used to line up the printed equations.
strFmt = '{:,.3f}'
strFmtSignSpc = '{:+1,.3f}'
strFmtSpc = '{: ,.3f}'

# Dependent variable for every single-predictor regression below.
strTarget = 'ChgPHQual'

# Fit ChgPHQual against each "Chg..." land-use column, one predictor at a time,
# and print the fitted line together with its p-value and R².
for col in dfMrg.filter(regex='Chg').columns:
    # The 'Chg' filter also matches the target itself; regressing it on itself
    # would only print a meaningless perfect fit.
    if col == strTarget:
        continue
    # A column with a single distinct value has zero variance, so no slope can
    # be estimated: older SciPy emits RuntimeWarnings and returns NaN, newer
    # SciPy raises ValueError. Skip such columns before fitting.
    if dfMrg[col].nunique() <= 1:
        continue
    slope, intercept, r_value, p_value, std_err = stats.linregress(dfMrg[col], dfMrg[strTarget])
    # Still guard against a NaN fit (e.g. missing values in the column).
    if math.isnan(slope):
        continue
    strMX = f'{strFmtSpc.format(slope)} × {col.ljust(21)}'
    strB = f'{strFmtSignSpc.format(intercept)}'
    strPAndR2 = f'P = {strFmt.format(p_value)}, R² = {strFmtSignSpc.format(r_value**2)}'
    print(f'{strTarget} = {strB} + {strMX} | {strPAndR2}')

ChgPHQual = -0.020 +  0.033 × ChgWater              | P = 0.689, R² = +0.000
ChgPHQual = -0.032 +  0.114 × ChgDevTrans           | P = 0.143, R² = +0.002
ChgPHQual = -0.037 +  0.085 × ChgDevCommSvcs        | P = 0.018, R² = +0.004
ChgPHQual = -0.039 +  0.191 × ChgDevIndMil          | P = 0.031, R² = +0.003
ChgPHQual = -0.042 +  0.205 × ChgDevRecr            | P = 0.008, R² = +0.005
ChgPHQual = -0.047 +  0.071 × ChgDevResHi           | P = 0.000, R² = +0.014
ChgPHQual = -0.033 +  0.018 × ChgDevResLoMed        | P = 0.078, R² = +0.002
ChgPHQual = -0.049 + -0.158 × ChgDevOther           | P = 0.000, R² = +0.009
ChgPHQual = -0.034 + -0.069 × ChgSemiDevUrbIntHi    | P = 0.005, R² = +0.006
ChgPHQual = -0.045 +  0.012 × ChgSemiDevUrbIntLoMed | P = 0.002, R² = +0.007
ChgPHQual = -0.026 + -1.488 × ChgSemiDevOther       | P = 0.220, R² = +0.001
ChgPHQual = -0.022 +  0.103 × ChgMining             | P = 0.514, R² = +0.000
ChgPHQual = -0.031 + -0.029 × ChgCrops              | P = 0.004, R² = +0.006

  slope = r_num / ssxm
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


In [19]:
# Multiple regression: ChgPHQual against all land-use change columns at once.
# ChgWetlands and ChgUnknown are left out of the term list (both show a
# standard deviation of 0 in dfMrg.describe()).
lstPredictor = [
    'ChgWater',
    'ChgDevTrans',
    'ChgDevCommSvcs',
    'ChgDevIndMil',
    'ChgDevRecr',
    'ChgDevResHi',
    'ChgDevResLoMed',
    'ChgDevOther',
    'ChgSemiDevUrbIntHi',
    'ChgSemiDevUrbIntLoMed',
    'ChgSemiDevOther',
    'ChgMining',
    'ChgCrops',
    'ChgPasture',
    'ChgGrazing1',
    'ChgGrazing2',
    'ChgLowUse',
    'ChgVeryLowUse',
]
strFormula = 'ChgPHQual ~ ' + ' + '.join(lstPredictor)
result = sm.ols(formula=strFormula, data=dfMrg).fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:              ChgPHQual   R-squared:                       0.046
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     3.705
Date:                Mon, 01 Jul 2019   Prob (F-statistic):           2.57e-07
Time:                        20:05:41   Log-Likelihood:                -1410.8
No. Observations:                1390   AIC:                             2860.
Df Residuals:                    1371   BIC:                             2959.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -0.05

# 04) MAP WATER QUALITY BY COUNTY

In [15]:
# Import plotly and register the account credentials.
# The username and API key come from the local PlotlyConfig module, so they
# are not written into the notebook itself.
# NOTE(review): these imports sit mid-notebook; moving them to the first import
# cell would make the notebook's dependencies visible up front.
# NOTE(review): plotly 4+ moved `plotly.plotly` and `set_credentials_file` to
# the separate chart_studio package — confirm the plotly version this targets.
from PlotlyConfig import un, pkey 
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.io as pio
import numpy as np
# writes the credentials to plotly's local credentials file for later calls
plotly.tools.set_credentials_file(username=un, api_key=pkey)

In [16]:
def MakeCtyFig(strVal, strTitle='Water Quality', strLegend='', fRound=False, fBin=True, strFIPS='FIPS Code', df=dfPH):
    """Build a county-level choropleth figure from one column of a dataframe.

    Parameters
    ----------
    strVal : str
        Name of the column in `df` whose values are mapped.
    strTitle : str
        Figure title, passed to `ff.create_choropleth`.
    strLegend : str
        Legend title, passed to `ff.create_choropleth`.
    fRound : bool
        When True, every value is rounded to 2 decimal places before plotting.
    fBin : bool
        When True, values are binned with explicit endpoints and drawn with the
        17-color ramp defined below. When False, only FIPS codes, values and the
        two titles are passed and `ff.create_choropleth` uses its own defaults.
    strFIPS : str
        Name of the column in `df` holding the county FIPS codes.
    df : pandas.DataFrame
        Source data. The default is the object `dfPH` referred to when this
        `def` was executed (defaults are evaluated once, at definition time),
        so re-run this cell after rebinding `dfPH`.

    Returns
    -------
    The figure object returned by `ff.create_choropleth`.
    """
    lstFIPS = df[strFIPS].tolist()
    lstVal = df[strVal].tolist()
    if fRound:
        lstVal = [round(val, 2) for val in lstVal]

    if not fBin:
        # Unbinned map: no endpoints are needed, so none are computed and
        # np.percentile is never applied to the values on this path.
        return ff.create_choropleth(
            fips=lstFIPS, values=lstVal,
            title=strTitle,
            legend_title=strLegend
        )

    # 17-color red-to-blue ramp, one color per bin
    lstColor = ['#FF0040', '#FF0000', '#FF2800', '#FF5000', '#FF7800',
                '#FFa000', '#FFc800', '#FFf000', '#b0ff00', '#17ff00',
                '#00ff83', '#00e4ff', '#00a4ff', '#0064ff', '#0022ff',
                '#0100ff', '#0500ff']
    intColor = len(lstColor)
    # Percentile cut-off for each tail: the first and last endpoints sit at this
    # percentile and its mirror image, so extreme values do not stretch the scale.
    fltTailPct = 1 / (intColor + 1) * 100
    # intColor - 1 evenly spaced endpoints split the values into intColor bins.
    lstBin = list(np.linspace(np.percentile(lstVal, fltTailPct),
                              np.percentile(lstVal, 100 - fltTailPct),
                              intColor - 1))
    return ff.create_choropleth(
        fips=lstFIPS, values=lstVal,
        binning_endpoints=lstBin,
        colorscale=lstColor,
        show_state_data=False,
        show_hover=True, centroid_marker={'opacity': 0},
        asp=2.9, title=strTitle,
        legend_title=strLegend
    )

In [17]:
# Map the per-county pH improvement score (rounded, binned) and render it inline.
fig = MakeCtyFig(
    strVal='ChgPHQual',
    strTitle='Progress toward Ideal pH (7.25), 1982-2012',
    strLegend='Net pH Improvement',
    fRound=True,
)
py.iplot(fig, filename='choropleth_full_usa')


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Consider using IPython.display.IFrame instead



In [18]:
# Export the choropleth built above as a static PNG.
# NOTE(review): the relative 'Station_List/' directory is not created anywhere
# in the cells shown — confirm it exists before running.
pio.write_image(fig, 'Station_List/pH by County 1982-2012.png')