In [26]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2

In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [3]:
recs = pd.DataFrame.from_csv('recs2009_public_clean.csv', index_col=None)
recs.head()

Unnamed: 0,DOEID,REGIONC,DIVISION,REPORTABLE_DOMAIN,TYPEHUQ,NWEIGHT,HDD65,CDD65,HDD30YR,CDD30YR,...,SCALEEL,KAVALNG,PERIODNG,SCALENG,PERIODLP,SCALELP,PERIODFO,SCALEFO,PERIODKR,SCALEKER
0,1,2,4,12,2,2471.679705,4742,1080,4953,1271,...,0,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,2,4,10,26,2,8599.17201,2662,199,2688,143,...,0,1,1,0,-2,-2,-2,-2,-2,-2
2,3,1,1,1,5,8969.915921,6233,505,5741,829,...,0,3,5,3,-2,-2,-2,-2,-2,-2
3,4,2,3,7,2,18003.6396,6034,672,5781,868,...,3,3,5,3,-2,-2,-2,-2,-2,-2
4,5,1,1,1,3,5999.605242,5388,702,5313,797,...,0,1,1,0,-2,-2,-2,-2,-2,-2


In [4]:
df = recs
cols = list(df)
count = 0
row_count = len(df)

for c in cols:
    dfe = df[df[c].isnull()]
    null_count = len(dfe)
    percent_values = 1 - (null_count / row_count)

    if dfe.empty:
        count += 1
    else:
        print('NaN in ' + c)
        print('Percent not null: ' + str(round(percent_values * 100,2)) + '%')

    if count == len(cols):
        print('No NaN in DataFrame')

No NaN in DataFrame


In [5]:
recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12083 entries, 0 to 12082
Columns: 898 entries, DOEID to SCALEKER
dtypes: float64(51), int64(847)
memory usage: 82.8 MB


In [6]:
train, test = train_test_split(recs)
X_train = train.drop('TOTALBTU',1)
X_test = test.drop('TOTALBTU',1)
y_train = train.TOTALBTU
y_test = test.TOTALBTU
regr = linear_model.LinearRegression()

In [8]:
%%time
regr.fit(X_train,y_train)

Wall time: 502 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
regr.coef_

array([ -8.58374175e-16,   1.74538162e-11,   1.15961962e-11,
        -5.92420557e-12,   1.20084498e-11,  -4.57966998e-16,
        -6.77236045e-15,   7.59114993e-15,  -2.61179967e-14,
        -1.74461140e-14,  -1.06146840e-12,  -2.32938077e-11,
         7.48027841e-12,   5.08933792e-12,   1.08333348e-12,
        -7.43408324e-12,   1.29369487e-12,   3.68042625e-11,
        -4.04518566e-11,   2.89360070e-11,  -6.44425963e-13,
        -3.53473761e-14,   2.75794724e-13,   1.33607050e-12,
        -4.77015302e-11,   1.07896543e-11,   2.60979281e-13,
        -2.07099656e-11,  -1.89124827e-11,  -1.80875061e-11,
         1.29177007e-11,  -1.15989581e-11,   1.65051943e-11,
         2.07918976e-10,  -8.77947394e-12,  -9.13425046e-13,
        -1.07610713e-11,   4.05025705e-12,  -4.41382105e-11,
         1.39478782e-11,  -2.24967165e-12,  -9.94670029e-11,
         3.60235834e-11,  -7.43551858e-12,   2.90424904e-11,
        -2.58536399e-10,  -2.37909247e-11,   1.13462983e-11,
        -1.45531304e-10,

In [15]:
regr.score(X_train,y_train)

1.0

In [16]:
regr.score(X_test,y_test)

1.0

In [12]:
regr.predict(X_test)

array([ 120812.,   49269.,  142085., ...,   15985.,   40975.,  131458.])

In [29]:
s = SelectKBest(f_classif, k=3)
s

SelectKBest(k=3, score_func=<function f_classif at 0x00000195D1CA68C8>)

In [30]:
s.fit(X_train,y_train)

 102 104 107 108 159 171 173 174 175 178 179 180 181 182 183 184 186 187
 190 192 195 197 198 199 200 201 202 205 309 314 316 322 323 324 325 326
 328 329 330 331 332 334 345 347 348 349 350 361 363 364 365 366 371 372
 376 377 380 387 388 389 390 391 392 393 394 395 396 397 398 400 401 402
 420 451 457 458 459 460 461 462 468 469 473 474 491 497 534 535 551 557
 599 601 602 603 604 605 609 610 611 618 636 648 650 653 654 657 681 695
 696 701 702 706 707 710 713 714 715 716 718 736 737 738 739 740 769 770
 771 772 773 774 775 776 777 778 792 851 855 859 862 863 866 867 870 871] are constant.
  f = msb / msw
  f = msb / msw


SelectKBest(k=3, score_func=<function f_classif at 0x00000195D1CA68C8>)

In [17]:
t = s.transform(X_train)
t[0]

array([ 0.,  0.,  0.])

In [18]:
# Compare the min and max of each column. If they ate equal, column is constant.
def constantCheck(df):
    s1 = df.max()
    s2 = df.min()
    s = s1 == s2
    s = s[s == True]
    l = s.axes[0].tolist()
    return l

In [31]:
l_train = constantCheck(X_train)
if not l_train:
    print('Empty List')
else:
    print(l_train)

['ZATTICUSE', 'ZMICRO', 'ZTOPGRILL', 'ZNOCORD', 'ZANSMACH', 'ELOTHER', 'ZOTHERWAYLPG', 'ZAGEHHMEMCAT13', 'ZAGEHHMEMCAT14']


In [32]:
l_test = constantCheck(X_test)
if not l_test:
    print('Empty List')
else:
    print(l_test)

['ZSTORIES', 'ZATTCHT2', 'ZPCTATTHT', 'ZATTCCL2', 'ZSTOVEN', 'ZSTOVENFUEL', 'ZSTOVEFUEL', 'ZOUTGRILLFUEL', 'ZNUMMEAL', 'ZNUMFRIG', 'ZTYPERFR1', 'ZICE', 'ZTYPERFR3', 'ZFREEZER2', 'ZDRYRUSE', 'ZCABLESAT1', 'ZTVAUDIOSYS1', 'ZTVONWD1', 'ZTVONWDWATCH1', 'ZTVONWE1', 'ZTVONWEWATCH1', 'ZCABLESAT2', 'ZPLAYSTA2', 'ZVCR2', 'ZDVD2', 'ZOTHERSTB2', 'ZCABLESAT3', 'ZPLAYSTA3', 'ZTVAUDIOSYS3', 'ZOTHERSTB3', 'ZTVONWDWATCH3', 'ZTVONWEWATCH3', 'ZPCTYPE2', 'ZMONITOR2', 'ZPCSLEEP2', 'ZPCTYPE3', 'ZMONITOR3', 'ZDIPSTICK', 'ZSWAMPCOL', 'ZAQUARIUM', 'ZELECCHRG', 'ZCHRGPLGE', 'ZDIFFUEL', 'ZTHERMAIN', 'ZNUMTHERM', 'ZHIGHCEIL', 'ZPOOL', 'ZRECBATH', 'ZSLDDRS', 'ZDRAFTY', 'KRWATER', 'SOLWARM', 'SOLARAUX', 'ZOTHERWAYEL', 'ZPUGCOOK', 'ZKERODEL', 'ZKEROCASH', 'GALLONKERWTH', 'BTUKERWTH', 'DOLKERWTH']


In [39]:
for l in l_train:
    if l in l_test:
        print(l + ' in l_test')
    else:
        print(l + ' NOT in l_test')

ZATTICUSE NOT in l_test
ZMICRO NOT in l_test
ZTOPGRILL NOT in l_test
ZNOCORD NOT in l_test
ZANSMACH NOT in l_test
ELOTHER NOT in l_test
ZOTHERWAYLPG NOT in l_test
ZAGEHHMEMCAT13 NOT in l_test
ZAGEHHMEMCAT14 NOT in l_test


In [40]:
X_train = X_train.drop(l_test, axis = 1)

In [41]:
s.fit(X_train,y_train)

 155 167 168 171 172 173 174 175 180 183 185 186 187 188 189 295 301 306
 307 308 309 311 319 321 322 323 332 333 334 343 350 351 352 353 354 355
 356 357 358 360 378 409 415 416 417 418 419 423 424 428 429 446 452 489
 490 506 512 555 556 557 561 562 569 586 598 600 603 606 628 642 643 648
 651 652 655 656 657 658 659 661 679 680 681 682 683 712 713 714 715 716
 717 718 719 720 721 735 794 798 802 805 808 811] are constant.
  f = msb / msw
  f = msb / msw


SelectKBest(k=3, score_func=<function f_classif at 0x00000195D1CA68C8>)