We use a linear regression on actual house price appreciation (HPA) values over a set of core geographical indices to model and predict HPA across the US, the 50 states & DC, and 384 core-based statistical areas (CBSAs, as per the US Census Bureau). 

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
pd.options.mode.chained_assignment = None

Importing the actual data

In [2]:
# State-level house price index; the first CSV column (labels such as 'Jan-75') becomes the row index.
hpi_statelevel = pd.read_csv('confhpi_ofheo_statelevel.csv', index_col=0)
hpi_statelevel.head()

Unnamed: 0,US,AK,AL,AR,AZ,CA,CO,CT,DC,DE,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
Jan-75,35.5,54.19,49.23,52.97,42.53,18.02,41.12,25.95,23.16,32.47,...,53.11,45.47,45.86,44.35,38.09,35.79,25.2,43.85,39.67,55.0
Feb-75,35.8,53.63,49.09,51.51,43.2,18.19,41.17,25.91,23.7,34.33,...,53.71,45.51,46.68,44.79,37.45,36.03,25.41,44.09,45.19,54.68
Mar-75,36.2,52.79,48.79,49.38,44.13,18.35,41.24,25.88,24.41,35.47,...,54.86,45.66,47.81,45.44,36.65,36.82,25.55,44.32,51.77,54.19
Apr-75,36.5,51.96,48.5,47.25,45.07,18.5,41.32,25.84,25.12,36.62,...,56.02,45.82,48.94,46.09,35.85,37.62,25.7,44.55,58.35,53.7
May-75,36.9,51.13,48.21,45.12,46.0,18.66,41.39,25.8,25.82,37.76,...,57.18,45.97,50.06,46.74,35.05,38.41,25.85,44.78,64.93,53.21


In [3]:
# CBSA-level index, read as-is: columns are CBSA codes and the first data row holds the
# locality names (see the output), so that row is split off in the next cell.
hpi_cbsalevel= pd.read_csv('confhpi_ofheo_cbsalevel.csv', index_col=0)
hpi_cbsalevel.head()

Unnamed: 0,10180,10420,10500,10580,10740,10780,10900,11020,11100,11180,...,48864,48900,49020,49180,49340,49420,49620,49660,49700,49740
,"Abilene, TX","Akron, OH","Albany, GA","Albany-Schenectady-Troy, NY","Albuquerque, NM","Alexandria, LA","Allentown-Bethlehem-Easton, PA-NJ","Altoona, PA","Amarillo, TX","Ames, IA",...,"Wilmington, DE-MD-NJ (MSAD)","Wilmington, NC","Winchester, VA-WV","Winston-Salem, NC","Worcester, MA","Yakima, WA","York-Hanover, PA","Youngstown-Warren-Boardman, OH-PA","Yuba City, CA","Yuma, AZ"
Jan-75,53.99,43.24,49.39,25.41,36.73,46.33,34.43,34.78,43.21,44.89,...,31.08,38.67,35.64,42.28,25.09,32.22,39.53,44.86,19.64,44.05
Feb-75,54.95,43.17,48.99,25.15,37.39,47.48,34.14,34.58,43.98,46.69,...,32.51,38.86,35.12,42.49,24.89,32.49,39.3,44.75,19.83,44.75
Mar-75,56.28,43.12,48.36,24.79,38.15,49.05,33.64,34.16,45.04,48.89,...,33.38,39.05,34.46,42.7,24.63,32.67,38.83,44.62,20,45.72
Apr-75,57.61,43.07,47.73,24.44,38.91,50.63,33.15,33.75,46.1,51.08,...,34.24,39.24,33.8,42.91,24.37,32.86,38.36,44.49,20.17,46.68


In [4]:
# Re-read the CBSA file so this cell starts from the raw data every time it is run.
hpi_cbsalevel = pd.read_csv('confhpi_ofheo_cbsalevel.csv', index_col=0)
# The first row (index label NaN) maps each CBSA code to its locality name; keep it as a lookup Series.
cbsa_key = hpi_cbsalevel.iloc[0]
# Drop that name row and convert the remaining index values to floats.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0; chaining instead of
# `inplace=True` builds a new frame rather than mutating the one just read.
hpi_cbsalevel = hpi_cbsalevel.drop([np.nan], axis=0).astype(float)
hpi_cbsalevel.head()

Unnamed: 0,10180,10420,10500,10580,10740,10780,10900,11020,11100,11180,...,48864,48900,49020,49180,49340,49420,49620,49660,49700,49740
Jan-75,53.99,43.24,49.39,25.41,36.73,46.33,34.43,34.78,43.21,44.89,...,31.08,38.67,35.64,42.28,25.09,32.22,39.53,44.86,19.64,44.05
Feb-75,54.95,43.17,48.99,25.15,37.39,47.48,34.14,34.58,43.98,46.69,...,32.51,38.86,35.12,42.49,24.89,32.49,39.3,44.75,19.83,44.75
Mar-75,56.28,43.12,48.36,24.79,38.15,49.05,33.64,34.16,45.04,48.89,...,33.38,39.05,34.46,42.7,24.63,32.67,38.83,44.62,20.0,45.72
Apr-75,57.61,43.07,47.73,24.44,38.91,50.63,33.15,33.75,46.1,51.08,...,34.24,39.24,33.8,42.91,24.37,32.86,38.36,44.49,20.17,46.68
May-75,58.93,43.03,47.11,24.08,39.68,52.2,32.66,33.33,47.17,53.28,...,35.1,39.44,33.14,43.12,24.11,33.05,37.89,44.36,20.34,47.65


In [5]:
cbsa_key.head()

10180                    Abilene, TX
10420                      Akron, OH
10500                     Albany, GA
10580    Albany-Schenectady-Troy, NY
10740                Albuquerque, NM
Name: nan, dtype: object

In [6]:
cbsa_key.to_csv('cbsa_key.csv')

In [6]:
def cbsa_decoder(code):
    """Return the entry that the module-level `cbsa_key` Series stores under `code`."""
    locality_name = cbsa_key[code]
    return locality_name
#for readability's sake, I decided to keep the numerical code as the sole index of the CBSA-level dataframe.
#this function lets the user switch from the code to the locality's name. A simple illustration: 
N = 42
cbsa_decoder(hpi_cbsalevel.columns[N])

'Boston-Quincy, MA  (MSAD)'

We'll convert the HPIs to yearly HPAs, assuming continuous appreciation. For a given HPI series at time (month) $t$, our HPA is the number $r$ such that $HPI(t - 12) \cdot e^r = HPI(t)$. Solving for $r$, we get
$$ r = \ln \left( \frac{HPI(t)}{HPI(t - 12)} \right) $$

In [7]:
# Year-over-year log appreciation, ln(HPI(t) / HPI(t - 12)), for every column at once.
# The first 12 rows have no observation 12 months earlier, so they are dropped.
hpa_statelevel = np.log(hpi_statelevel / hpi_statelevel.shift(12)).iloc[12:]
hpa_statelevel.head()

Unnamed: 0,US,AK,AL,AR,AZ,CA,CO,CT,DC,DE,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
Jan-76,0.054808,0.019734,0.020308,-0.063318,-0.019709,0.132351,0.009199,0.025867,0.151928,0.031229,...,-0.074847,0.041569,0.079808,0.114295,-0.035269,-0.100745,0.071951,0.082478,0.406221,0.091003
Feb-76,0.046393,0.051247,0.063532,0.005228,-0.020817,0.136002,-0.001215,0.047484,0.080627,0.036045,...,-0.111149,0.032641,0.047688,0.10402,-0.008581,-0.155864,0.083397,0.051282,0.239865,0.107899
Mar-76,0.053776,0.07708,0.061408,0.057833,-0.001587,0.144372,0.035727,0.037912,0.082152,0.034089,...,-0.089766,0.018015,0.064002,0.088404,0.005714,-0.253334,0.099394,0.049306,0.140845,0.120833
Apr-76,0.061108,0.102875,0.058852,0.112383,0.016067,0.152605,0.070766,0.028612,0.083222,0.031979,...,-0.06986,0.002833,0.079131,0.07279,0.020158,-0.356827,0.114579,0.04756,0.056797,0.133834
May-76,0.068089,0.128826,0.056455,0.168672,0.03293,0.160104,0.10495,0.019194,0.084976,0.030515,...,-0.051312,-0.012036,0.093747,0.057574,0.035042,-0.466936,0.129364,0.04583,-0.015677,0.146902


In [8]:
# Year-over-year log appreciation, ln(HPI(t) / HPI(t - 12)), for every CBSA column at once.
# The first 12 rows have no observation 12 months earlier, so they are dropped.
hpa_cbsalevel = np.log(hpi_cbsalevel / hpi_cbsalevel.shift(12)).iloc[12:]
hpa_cbsalevel.head()

Unnamed: 0,10180,10420,10500,10580,10740,10780,10900,11020,11100,11180,...,48864,48900,49020,49180,49340,49420,49620,49660,49700,49740
Jan-76,0.079715,0.013324,-0.03335,-0.052516,0.061249,0.13714,-0.002909,0.007733,0.079811,0.19123,...,0.033226,0.05802,-0.027594,0.058117,0.012673,0.071844,0.007561,0.012186,0.132385,-0.019716
Feb-76,0.047793,0.015401,-0.03574,-0.076405,0.074452,0.130156,0.007587,0.018906,0.047729,0.155221,...,0.036543,0.042817,-0.00428,0.042844,-0.001608,0.083234,0.018904,0.015961,0.13605,-0.020773
Mar-76,0.064005,0.030153,-0.018575,-0.05344,0.06027,0.10021,0.043055,0.05496,0.064067,0.121752,...,0.038206,0.040156,0.00838,0.040166,0.018504,0.099281,0.054867,0.034365,0.1441,-0.001751
Apr-76,0.079228,0.044724,-0.001258,-0.031168,0.046205,0.071268,0.077743,0.090315,0.079408,0.09035,...,0.040355,0.037268,0.021078,0.037283,0.038637,0.1146,0.090416,0.05254,0.152378,0.01615
May-76,0.09372,0.059106,0.016004,-0.007922,0.032237,0.043485,0.112256,0.125587,0.093632,0.060448,...,0.04212,0.034392,0.034113,0.034419,0.058402,0.129514,0.125582,0.070284,0.160033,0.033027


Now I'll merge the two dataframes and convert rates to percentages.

In [9]:
# Put the state-level and CBSA-level HPA side by side: one column per region.
hpa_yoy = pd.concat([hpa_statelevel,hpa_cbsalevel],axis=1)
# Scale the log rates by 100 so they read as percentages.
hpa_yoy *= 100
hpa_yoy.head()

Unnamed: 0,US,AK,AL,AR,AZ,CA,CO,CT,DC,DE,...,48864,48900,49020,49180,49340,49420,49620,49660,49700,49740
Jan-76,5.480824,1.973388,2.030833,-6.331845,-1.970858,13.235145,0.919881,2.586679,15.192763,3.122885,...,3.322565,5.802004,-2.759384,5.811717,1.267344,7.184417,0.756052,1.218582,13.238511,-1.971637
Feb-76,4.639304,5.124668,6.353221,0.522801,-2.081703,13.600222,-0.121521,4.748443,8.062676,3.604509,...,3.65434,4.281692,-0.428022,4.284395,-0.160836,8.323386,1.890415,1.596132,13.604965,-2.077293
Mar-76,5.37764,7.70803,6.140799,5.783322,-0.158748,14.437161,3.572659,3.791205,8.215185,3.408949,...,3.820623,4.015596,0.838034,4.016604,1.850415,9.928147,5.486661,3.436461,14.410034,-0.175131
Apr-76,6.110799,10.287475,5.885228,11.238342,1.606725,15.260508,7.076606,2.861153,8.322236,3.197908,...,4.03546,3.726829,2.107806,3.728276,3.86366,11.459951,9.041624,5.253999,15.237845,1.614995
May-76,6.808912,12.882642,5.645522,16.867239,3.293007,16.010444,10.49497,1.919445,8.4976,3.051482,...,4.21203,3.439163,3.411275,3.441935,5.840194,12.951426,12.558223,7.028449,16.003305,3.302674


The above matrix is our 'target' data. We'll now format the inputs for the regression: 'core' HPA data from 9 geographical indices — a 25-MSA composite and 8 cities (Los Angeles, Miami, New York, Phoenix, Chicago, Detroit, Washington D.C., and Oakland). Another column, called 'common factor,' will model national affordability of housing / income inflation, which augments HPA across the board. 

In [9]:
# Load the core HPA series. `usecols` keeps CSV columns 0-9 and 13: the index column plus
# the ten data columns shown in the output below.
core_hpa = pd.read_csv('core_hpa.csv', index_col=0, usecols = [0,1,2,3,4,5,6,7,8,9,13])
core_hpa.head(5)

Unnamed: 0_level_0,Comp,LA,MIA,NY,PX,CHI,DET,DC,OAK,Common Factor
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Feb-75,7.163584,15.991952,112.132818,-16.503991,18.761003,-21.064721,16.301673,-11.399706,11.280149,0.0
Mar-75,9.124432,17.66939,143.93584,-21.108423,25.359636,-27.541091,17.589398,-14.983387,10.520615,0.0
Apr-75,9.028572,17.412988,128.576844,-21.583389,24.834782,-27.840286,17.335295,-15.564421,11.078004,0.0
May-75,8.946155,17.622008,116.123736,-22.188464,24.331215,-28.857696,17.08843,-15.768952,10.333756,0.0
Jun-75,-4.207556,2.666755,-122.536481,11.709936,-21.746678,13.811573,5.545431,11.059986,10.882977,0.0


For consistency's sake, we'll convert these values to a yearly rolling average.

In [10]:
# 12-month rolling mean of every core series, computed in one vectorized call.
# The first 11 rows have incomplete windows (NaN) and are dropped; `.copy()` yields an
# independent frame so that columns can be added to it later without touching `core_hpa`.
core_hpa_yoy = core_hpa.rolling(12).mean().iloc[11:].copy()
core_hpa_yoy.head()

Unnamed: 0_level_0,Comp,LA,MIA,NY,PX,CHI,DET,DC,OAK,Common Factor
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Jan-76,4.405653,15.023456,14.670379,-2.238901,-1.98346,-2.505293,6.082195,-3.88804,15.367438,0.0
Feb-76,3.951997,15.437498,5.831433,-2.299175,-2.067081,-1.140961,5.497794,-3.364924,16.595276,0.0
Mar-76,4.846395,15.931904,0.719793,-0.360874,-0.154202,2.101212,4.440813,-0.258065,16.183675,0.0
Apr-76,5.725254,16.409737,-3.548032,1.611113,1.646568,5.331964,3.403343,2.895024,15.723472,0.0
May-76,6.582856,16.833647,-7.162878,3.638787,3.345071,8.63928,2.360987,6.000196,15.323156,0.0


Let $k$ denote one of the 436 regional indices, and let $F$ denote the 'common factor' time series. The idea of this regression problem is to find coefficients $(\beta_{k1},...,\beta_{k5})$ (corresponding to the core indices) such that the estimation
$$ HPA_k(t) - F(t) \approx \sum_{i=1}^5 \beta_{ki} [HPA^{core}_i(t) - F(t)] $$
is good. The regression will return a 436 x 5 matrix of coefficients (the implementation below also fits a sixth 'alpha' column). While we could consider $F(t)$ as one of the predictor variables, the effect of this setup is to estimate regional HPA 'independently' of national affordability trends. 

In [11]:
# Common-factor series F(t) as a plain array.
common_factor = core_hpa_yoy['Common Factor'].values
# 'alpha' is 1 + F(t); the regressor cell further down subtracts F(t) from every selected
# column, 'alpha' included, which leaves a constant 1 in that column.
core_hpa_yoy['alpha'] = 1 + common_factor
core_hpa_yoy.head()

Unnamed: 0_level_0,Comp,LA,MIA,NY,PX,CHI,DET,DC,OAK,Common Factor,alpha
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Jan-76,4.405653,15.023456,14.670379,-2.238901,-1.98346,-2.505293,6.082195,-3.88804,15.367438,0.0,1.0
Feb-76,3.951997,15.437498,5.831433,-2.299175,-2.067081,-1.140961,5.497794,-3.364924,16.595276,0.0,1.0
Mar-76,4.846395,15.931904,0.719793,-0.360874,-0.154202,2.101212,4.440813,-0.258065,16.183675,0.0,1.0
Apr-76,5.725254,16.409737,-3.548032,1.611113,1.646568,5.331964,3.403343,2.895024,15.723472,0.0,1.0
May-76,6.582856,16.833647,-7.162878,3.638787,3.345071,8.63928,2.360987,6.000196,15.323156,0.0,1.0


In [29]:
# Restrict the regressors to Dec-92 .. Feb-14 so the fit starts from the same spot as the excel sheet.
core_hpa_yoy = core_hpa_yoy.loc['Dec-92':'Feb-14']
# Common factor over that same window, as a plain array.
common_factor = core_hpa_yoy.loc['Dec-92':'Feb-14', 'Common Factor'].values

In [30]:
# Regressor matrix: the five core indices plus 'alpha', each net of the common factor F(t).
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# Broadcasting subtracts F(t) from every column at once and always builds a new array,
# so core_hpa_yoy itself is left untouched.
X = core_hpa_yoy[['Comp','LA','MIA','NY','PX','alpha']].values - common_factor[:, np.newaxis]

In [32]:
# Target matrix: one column per region over the regression window.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
y = hpa_yoy.loc['Dec-92':'Feb-14'].values
# Subtract the common factor from every regional series, in place.
# NOTE(review): if `y` is a view onto hpa_yoy's data, this loop also shifts hpa_yoy itself, and
# re-running the cell subtracts the factor a second time. r2_scorer reads hpa_yoy directly,
# so confirm which behaviour is relied on before switching this to an explicit copy.
for column in y.transpose():
    column -= common_factor

We should force the coefficient corresponding to the composite index to be non-negative. The following regression does not make use of a y-intercept (additive constant). 

In [33]:
# Box constraints passed to the solver as (lower, upper): the first coefficient
# (the composite index) must be non-negative, the other five are unconstrained.
lower_bounds = [0] + [-np.inf] * 5
upper_bounds = [np.inf] * 6
bounds = (lower_bounds, upper_bounds)

In [34]:
from scipy import optimize

In [46]:
%%timeit
# Timing run of the whole pipeline: fit every region, score it, and export the table.
# NOTE(review): r2_scorer is defined in a later cell of this notebook, and the body ends with a
# to_csv call, so each timing repetition rewrites 'nikku_coefs1.csv' — confirm that is intended.
l = []
for vector in y.transpose():
    c = optimize.lsq_linear(X,vector,bounds=bounds,)
    l.append(c.x)
w = np.array(l)
coef1 = pd.DataFrame(w, columns=core_hpa_yoy.columns[[0,1,2,3,4,10]], index=hpa_yoy.columns)
regional_r2s = r2_scorer(coef1)
coef1_out = coef1.copy()
coef1_out['R^2'] = regional_r2s.values
coef1_out.to_csv('nikku_coefs1.csv')

1 loop, best of 3: 471 ms per loop


In [45]:
# Solve one bounded least-squares problem per region (each column of y) and stack the
# solution vectors into w, one row per region.
w = np.array([
    optimize.lsq_linear(X, target, bounds=bounds).x
    for target in y.transpose()
])

1 loop, best of 3: 343 ms per loop


In [36]:
# Label the coefficient matrix: one row per region, one column per regressor.
# The column labels are the same names, in the same order, that were used to build X,
# rather than positional indices into core_hpa_yoy.columns.
coef1 = pd.DataFrame(
    w,
    columns=['Comp', 'LA', 'MIA', 'NY', 'PX', 'alpha'],
    index=hpa_yoy.columns,
)
coef1.head(10)

Unnamed: 0,Comp,LA,MIA,NY,PX,alpha
US,0.7163802,-0.164209,-0.011384,-0.023717,0.08817,-0.70602
AK,1.138827e-20,-0.064405,-0.015488,0.256824,0.084164,-0.249918
AL,0.1650028,-0.188722,-0.082473,0.094904,0.231132,-1.403362
AR,0.3476132,-0.244179,0.052861,-0.012023,0.077532,-1.247501
AZ,3.034664e-21,-0.066669,0.002278,0.195552,0.83021,-0.321737
CA,1.377975,0.412167,0.116641,-0.465987,-0.088648,-0.263767
CO,2.632587,-0.941716,-0.116459,-0.817771,-0.090698,1.291653
CT,0.0562105,0.08946,-0.180179,0.770337,0.078842,-1.299448
DC,1.140099e-20,0.452662,-0.148639,0.507264,0.049288,2.833305
DE,1.1303329999999999e-20,0.091355,-0.143132,0.610165,0.170397,-1.113875


In [40]:
from sklearn import metrics
def r2_scorer(coef_frame):
    """Score every row of `coef_frame` with sklearn's R^2.

    For each region label in `coef_frame.index`, the observed series is read from the
    module-level `hpa_yoy` over 'Dec-92':'Feb-14' and the prediction is the product of
    the module-level `X` with that region's coefficient row.

    Returns a Series named "R^2", indexed like `coef_frame`.
    """
    scores = [
        metrics.r2_score(
            hpa_yoy[region].loc['Dec-92':'Feb-14'].values,
            np.matmul(X, coef_frame.loc[region].values),
        )
        for region in coef_frame.index
    ]
    return pd.Series(data=scores, index=coef_frame.index, name="R^2")

In [41]:
core_hpa_yoy[['Comp','LA','MIA','NY','PX','alpha']].head()

Unnamed: 0_level_0,Comp,LA,MIA,NY,PX,alpha
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dec-92,-0.450762,-6.381162,1.405117,0.946757,0.876515,4.530808
Jan-93,-1.06075,-6.770253,1.180669,-0.091989,1.141643,4.531541
Feb-93,-1.676955,-7.165928,0.955513,-1.138266,1.407785,4.532273
Mar-93,-1.122327,-7.138376,1.865479,-0.469494,1.993811,4.633662
Apr-93,-0.572371,-7.114759,2.770317,0.189709,2.589173,4.735051


In [42]:
regional_r2s = r2_scorer(coef1)
regional_r2s.head()

US    0.970022
AK    0.311524
AL    0.545894
AR    0.486469
AZ    0.981615
Name: R^2, dtype: float64

In [85]:
#write me own R2, mon
def ar_squared(y, y_hat):
    """Coefficient of determination, 1 - SSres / SStot.

    y     : observed values (any sequence convertible to a 1-D float array).
    y_hat : predicted values.

    If the two inputs differ in length, the residuals are taken over the leading
    elements they have in common (as a plain zip would pair them), while SStot is
    always computed over all of `y`.
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)
    paired = min(len(y), len(y_hat))
    ss_res = np.sum((y_hat[:paired] - y[:paired]) ** 2)
    # The mean of y is computed once here instead of once per element.
    ss_tot = np.sum((y - y.mean()) ** 2)
    return 1 - (ss_res / ss_tot)

In [86]:
ar_squared(hpa_yoy['US'].values[:-1],y_hat = np.matmul(X,coef1.loc['US'].values))

0.92469418705414197

In [21]:
#put this all in a spread sheet
coef1_out = coef1.copy()
coef1_out['R^2'] = regional_r2s.values
#coef1_out.to_csv('nikku_coefs.csv')

In [22]:
display("the mean, unweighted R^2 across all regions is ", regional_r2s.mean())

'the mean, unweighted R^2 across all regions is '

0.5470553190101848

In [32]:
# Regional weights: the first CSV column is the index, the data column is named 'weight'.
regional_weights = pd.read_csv('regional_weights.csv', index_col=0, header=0, names=['weight'])
# Normalizers for the two weighted averages, selected by row position.
# NOTE(review): rows 1-51 and rows 53 onward are used, so rows 0 and 52 fall in neither total.
# If this file follows the column order of hpa_yoy (US, states & DC, then CBSAs), row 52 would
# be the first CBSA — confirm the 53 is intended and not an off-by-one.
state_total = float(regional_weights['weight'].iloc[1:52].sum())
msa_total = float(regional_weights['weight'].iloc[53:].sum())

The above score may not seem very good. But we should take into account the fact that not all regions are equal in terms of real estate. We'll make use of a set of weights that match the pecuniary volume of regional mortgages.

We can compute a weighted average R^2 separately across states and across MSAs. 

In [33]:
def weighted_R2(r2_scores):
    """Return the weight-averaged R^2 as a (state-level, MSA-level) tuple.

    r2_scores : Series of per-region R^2 values. Regions are picked by row position
        and paired with the same positions of the module-level `regional_weights`;
        each group is normalized by the module-level `state_total` / `msa_total`.

    The intermediate Series are labelled from `r2_scores.index`, so the function works
    for any score Series passed in and no longer reads the module-level `coef1`.
    """
    weights = regional_weights.weight

    # Positions 1-51 form the state-level group (position 0 is skipped).
    state_data = r2_scores.iloc[1:52].values * (weights.iloc[1:52].values / state_total)
    state_r2s_weighted = pd.Series(data=state_data, index=r2_scores.index[1:52], name='WA state R^2')

    # NOTE(review): the MSA group starts at position 53, so position 52 belongs to neither
    # group — confirm this is intended and not an off-by-one.
    msa_data = r2_scores.iloc[53:].values * (weights.iloc[53:].values / msa_total)
    msa_r2s_weighted = pd.Series(data=msa_data, index=r2_scores.index[53:], name='WA msa R^2')

    return state_r2s_weighted.sum(), msa_r2s_weighted.sum()

In [34]:
weighted_R2(regional_r2s)

(0.7757006937976725, 0.7412372822497271)

Defining, somewhat arbitrarily, an acceptable R^2 score as one greater than 0.6, we can make another estimate of the regression's efficacy by measuring the percentage of loans that lie in regions with an R^2 above 0.6. 

In [35]:
# Positional slices of the R^2 scores: rows 1-51 as the state-level group, rows 53+ as the MSA group.
# NOTE(review): regression_measure takes its own slices of its argument, so nothing visible in
# this notebook appears to read these two module-level names — confirm before relying on them.
state_r2s = regional_r2s.iloc[1:52]
msa_r2s = regional_r2s.iloc[53:]

In [36]:
def regression_measure(r2_scores, threshold = 0.6):
    """Return the share of loan weight in regions whose R^2 exceeds `threshold`.

    r2_scores : Series of per-region R^2 values; rows 1-51 are taken as the state-level
        group and rows 53 onward as the MSA group.
    threshold : strict lower cut-off on R^2 (default 0.6).

    Returns a (state-level share, MSA-level share) tuple. Each share is the summed
    `regional_weights` weight of the qualifying regions divided by the module-level
    `state_total` or `msa_total` respectively.
    """
    def _share_above(group_scores, group_total):
        # Labels of the regions that clear the threshold, then their normalized weight.
        qualifying = group_scores.loc[group_scores > threshold].index
        return (regional_weights.loc[qualifying] / group_total)['weight'].sum()

    perc_state_loans_above = _share_above(r2_scores.iloc[1:52], state_total)
    perc_msa_loans_above = _share_above(r2_scores.iloc[53:], msa_total)
    return perc_state_loans_above, perc_msa_loans_above

In [37]:
regression_measure(regional_r2s)

(0.7936673138393976, 0.7555326915465153)

In [38]:
#we can adjust the threshold as we see fit
regression_measure(regional_r2s,threshold = .67)

(0.7239021613328804, 0.704758388635173)

How does our linear regression perform? Considering states and DC, its mortgage-volume-weighted mean R^2 is 78%, and 79% of loans come from regions where the regression scored above 0.6. On the MSA level, its weighted average R^2 is 74%, and high-scoring regions account for 76% of all loans.

Let's compare this to the original coefficients. 

In [48]:
coefs_orig_alpha = pd.read_csv('coefs_orig_alpha.csv', index_col=0)
coefs_orig_alpha.head()

Unnamed: 0,Comp,LA,MIA,NY,PX,alpha
US,0.727,-0.174,-0.004,-0.017,0.091,-0.006554
AK,0.0,-0.077,0.004,0.269,0.088,-0.0016
AL,0.193,-0.211,-0.069,0.11,0.236,-0.013017
AR,0.362,-0.262,0.07,0.001,0.082,-0.011495
AZ,0.0,-0.069,0.0,0.205,0.833,-0.003162


In [50]:
regional_r2s_orig = r2_scorer(coefs_orig_alpha)
regional_r2s_orig.head()

US    0.841848
AK   -0.021057
AL    0.254224
AR    0.168963
AZ    0.957375
Name: R^2, dtype: float64

In [51]:
weighted_R2(regional_r2s_orig)

(0.6570546512159562, 0.6119261942711963)

In [52]:
regression_measure(regional_r2s_orig)

(0.6490231052376182, 0.5895830005964188)

In [53]:
#Sad!

In [None]:
#to do