In [142]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [143]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [144]:
recs = pd.DataFrame.from_csv('recs2009_public.csv', index_col=None)
recs.head()

Unnamed: 0,DOEID,REGIONC,DIVISION,REPORTABLE_DOMAIN,TYPEHUQ,NWEIGHT,HDD65,CDD65,HDD30YR,CDD30YR,...,SCALEEL,KAVALNG,PERIODNG,SCALENG,PERIODLP,SCALELP,PERIODFO,SCALEFO,PERIODKR,SCALEKER
0,1,2,4,12,2,2471.679705,4742,1080,4953,1271,...,0,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,2,4,10,26,2,8599.17201,2662,199,2688,143,...,0,1,1,0,-2,-2,-2,-2,-2,-2
2,3,1,1,1,5,8969.915921,6233,505,5741,829,...,0,3,5,3,-2,-2,-2,-2,-2,-2
3,4,2,3,7,2,18003.6396,6034,672,5781,868,...,3,3,5,3,-2,-2,-2,-2,-2,-2
4,5,1,1,1,3,5999.605242,5388,702,5313,797,...,0,1,1,0,-2,-2,-2,-2,-2,-2


In [145]:
r = recs[['TOTSQFT','TOTROOMS','TOTALBTU','TOTALDOL']]
r.head()

Unnamed: 0,TOTSQFT,TOTROOMS,TOTALBTU,TOTALDOL
0,5075,9,63006,1315
1,3136,4,103460,1293
2,528,2,58716,1327
3,2023,7,76401,1398
4,1912,5,59809,1558


In [146]:
div = {'REGIONC_1':'Northeast',
       'REGIONC_2':'Midwest',
       'REGIONC_3':'South',
       'REGIONC_4':'West'}

s = recs[['REGIONC']]
s.head()

Unnamed: 0,REGIONC
0,2
1,4
2,1
3,2
4,1


In [147]:
d = pd.get_dummies(s, columns=['REGIONC'])
d.head()

Unnamed: 0,REGIONC_1,REGIONC_2,REGIONC_3,REGIONC_4
0,0,1,0,0
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0
4,1,0,0,0


In [148]:
d = d.rename(columns=div)
d = d.astype(bool)
d.head()

Unnamed: 0,Northeast,Midwest,South,West
0,False,True,False,False
1,False,False,False,True
2,True,False,False,False
3,False,True,False,False
4,True,False,False,False


In [149]:
df = r.join(d)
df.head()

Unnamed: 0,TOTSQFT,TOTROOMS,TOTALBTU,TOTALDOL,Northeast,Midwest,South,West
0,5075,9,63006,1315,False,True,False,False
1,3136,4,103460,1293,False,False,False,True
2,528,2,58716,1327,True,False,False,False
3,2023,7,76401,1398,False,True,False,False
4,1912,5,59809,1558,True,False,False,False


In [150]:
train, test = train_test_split(df)

In [151]:
df_train = train.drop('TOTALBTU',1)

In [152]:
df_test = test.drop('TOTALBTU',1)

In [153]:
regr = linear_model.LinearRegression()

In [154]:
%%time
regr.fit(df_train,train.TOTALBTU)

Wall time: 5 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [155]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [  3.87089551e+00   2.18952680e+03   3.26348061e+01  -1.74092669e+03
   2.02109874e+04  -1.63857112e+04  -2.08434952e+03]


In [156]:
print('Variance score: %.2f' % regr.score(df_train,train.TOTALBTU))

Variance score: 0.74


In [157]:
regr.score(df_train,train.TOTALBTU)

0.73745824789699599

In [158]:
regr.predict(df_test)

array([  73979.51477865,  119852.97385742,   46783.59802828, ...,
        111854.85779235,   63584.36400424,   13662.87079805])

In [159]:
recs = recs.drop(['METROMICRO','UR'],1)

In [160]:
recs = recs.drop([1387,2092], axis=0)
recs.head()

Unnamed: 0,DOEID,REGIONC,DIVISION,REPORTABLE_DOMAIN,TYPEHUQ,NWEIGHT,HDD65,CDD65,HDD30YR,CDD30YR,...,SCALEEL,KAVALNG,PERIODNG,SCALENG,PERIODLP,SCALELP,PERIODFO,SCALEFO,PERIODKR,SCALEKER
0,1,2,4,12,2,2471.679705,4742,1080,4953,1271,...,0,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,2,4,10,26,2,8599.17201,2662,199,2688,143,...,0,1,1,0,-2,-2,-2,-2,-2,-2
2,3,1,1,1,5,8969.915921,6233,505,5741,829,...,0,3,5,3,-2,-2,-2,-2,-2,-2
3,4,2,3,7,2,18003.6396,6034,672,5781,868,...,3,3,5,3,-2,-2,-2,-2,-2,-2
4,5,1,1,1,3,5999.605242,5388,702,5313,797,...,0,1,1,0,-2,-2,-2,-2,-2,-2


In [169]:
train, test = train_test_split(recs)

In [170]:
df_train = train.drop('TOTALBTU',1)
df_test = test.drop('TOTALBTU',1)
regr = linear_model.LinearRegression()

In [171]:
df = recs
cols = list(df)
count = 0

for c in cols:
    dfe = df[df[c].isnull()]

    if dfe.empty:
        count += 1
    else:
        print('NaN in ' + c)

    if count == len(cols):
        print('No NaN in DataFrame')

No NaN in DataFrame


In [164]:
recs[recs.NOCRCASH.isnull()].NOCRCASH

Series([], Name: NOCRCASH, dtype: float64)

In [165]:
recs[recs.NKRGALNC.isnull()].NKRGALNC

Series([], Name: NKRGALNC, dtype: float64)

In [172]:
%%time
regr.fit(df_train,train.TOTALBTU)

Wall time: 518 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [176]:
regr.score(df_train,train.TOTALBTU)

1.0

In [178]:
regr.coef_

array([  3.40981941e-16,   3.32911476e-12,   3.06715226e-11,
        -7.11203318e-12,  -2.43311656e-11,  -1.05471187e-15,
        -1.58761893e-14,  -1.14908083e-14,   1.31006317e-14,
         1.90819582e-15,  -1.01900096e-15,   2.36398601e-12,
         1.40157035e-11,   9.47024468e-13,   3.66470146e-13,
        -3.89359779e-12,   2.38163985e-12,   2.22236726e-11,
        -1.08932523e-11,   2.26922575e-12,  -2.29058197e-13,
        -1.88542758e-14,   9.20609712e-13,  -1.76197815e-12,
        -1.31018132e-11,   7.63227101e-14,   3.01163987e-13,
         1.50421596e-11,  -1.06545693e-11,  -1.13683638e-11,
        -2.47242802e-11,  -1.29992281e-11,   1.84601109e-11,
         1.91033238e-10,  -2.76414657e-12,  -2.18767529e-12,
        -2.16391064e-11,   4.42137455e-12,  -2.23655958e-11,
         9.18626912e-12,  -2.37106802e-12,  -2.88027704e-11,
         1.27559706e-11,  -1.16898031e-12,  -1.58102711e-11,
        -2.12633152e-10,  -1.55784567e-11,   7.93519317e-12,
        -7.51823729e-10,

In [179]:
regr.predict(df_test)

array([ 101795.,  163907.,   61244., ...,  132550.,  200678.,   63319.])

In [180]:
recs.TOTALBTU

0         63006
1        103460
2         58716
3         76401
4         59809
5        114350
6        150726
7         78230
8         52677
9         69166
10        46796
11       142273
12        33352
13        21615
14        53012
15       102132
16         7094
17        80305
18        99417
19       123923
20       123524
21       100021
22       144796
23        39555
24       190121
25       152933
26        39879
27        39999
28         8349
29        44823
          ...  
12053     32523
12054    125562
12055     84766
12056     85516
12057     44899
12058     49789
12059    279512
12060    118213
12061    121706
12062     97071
12063    175190
12064     23961
12065    129196
12066     36834
12067     61052
12068     95639
12069    137773
12070     71708
12071    245679
12072    153193
12073    101169
12074     77387
12075     93912
12076     28439
12077     73513
12078     75702
12079     25251
12080    148252
12081     81978
12082     38100
Name: TOTALBTU, dtype: i