In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2

In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [3]:
recs = pd.DataFrame.from_csv('recs2009_public_clean.csv', index_col=None)
recs.head()

Unnamed: 0,DOEID,REGIONC,DIVISION,REPORTABLE_DOMAIN,TYPEHUQ,NWEIGHT,HDD65,CDD65,HDD30YR,CDD30YR,...,SCALEEL,KAVALNG,PERIODNG,SCALENG,PERIODLP,SCALELP,PERIODFO,SCALEFO,PERIODKR,SCALEKER
0,1,2,4,12,2,2471.679705,4742,1080,4953,1271,...,0,-2,-2,-2,-2,-2,-2,-2,-2,-2
1,2,4,10,26,2,8599.17201,2662,199,2688,143,...,0,1,1,0,-2,-2,-2,-2,-2,-2
2,3,1,1,1,5,8969.915921,6233,505,5741,829,...,0,3,5,3,-2,-2,-2,-2,-2,-2
3,4,2,3,7,2,18003.6396,6034,672,5781,868,...,3,3,5,3,-2,-2,-2,-2,-2,-2
4,5,1,1,1,3,5999.605242,5388,702,5313,797,...,0,1,1,0,-2,-2,-2,-2,-2,-2


In [4]:
df = recs
cols = list(df)
count = 0
row_count = len(df)

for c in cols:
    dfe = df[df[c].isnull()]
    null_count = len(dfe)
    percent_values = 1 - (null_count / row_count)

    if dfe.empty:
        count += 1
    else:
        print('NaN in ' + c)
        print('Percent not null: ' + str(round(percent_values * 100,2)) + '%')

    if count == len(cols):
        print('No NaN in DataFrame')

No NaN in DataFrame


In [5]:
recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12083 entries, 0 to 12082
Columns: 898 entries, DOEID to SCALEKER
dtypes: float64(51), int64(847)
memory usage: 82.8 MB


In [6]:
train, test = train_test_split(recs)
X_train = train.drop('TOTALBTU',1)
X_test = test.drop('TOTALBTU',1)
y_train = train.TOTALBTU
y_test = test.TOTALBTU
regr = linear_model.LinearRegression()

In [7]:
%%time
regr.fit(X_train,y_train)

Wall time: 1.63 s


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
regr.coef_

array([  1.23094051e-15,  -2.51787480e-11,  -8.44715964e-12,
         2.17514895e-12,   1.17794663e-12,  -6.66133815e-16,
         2.16493490e-15,   1.22679644e-14,  -1.82354132e-14,
        -1.59091490e-14,  -1.00442755e-12,  -8.53379307e-12,
        -6.86842710e-12,   2.07596560e-12,  -4.07529777e-13,
         7.68465934e-12,  -5.17909595e-13,   7.02188009e-12,
         2.26656350e-12,  -5.81118182e-12,   7.39563006e-13,
        -1.78207058e-14,   2.51143185e-13,   1.32689348e-13,
         7.03151128e-11,   7.53522671e-12,   6.69512406e-13,
        -6.69676621e-12,   2.87949562e-11,  -1.70449346e-11,
        -2.22380047e-11,   2.79622366e-11,  -2.63887138e-11,
         1.26236244e-10,  -2.06010958e-12,   2.84440983e-12,
        -1.79618702e-11,   5.78171714e-12,   3.41103703e-11,
        -1.03100007e-11,   1.10196446e-12,  -7.07619940e-11,
         2.56116368e-11,  -6.21169056e-12,  -1.02191587e-11,
        -1.37580667e-10,  -5.17638223e-11,   1.18980012e-11,
        -6.96806372e-10,

In [9]:
regr.score(X_train,y_train)

1.0

In [10]:
regr.score(X_test,y_test)

1.0

In [11]:
regr.predict(X_test)

array([ 103944.,  167707.,   50300., ...,   24985.,  128835.,  113740.])

In [12]:
s = SelectKBest(f_classif, k=1)
s

SelectKBest(k=1, score_func=<function f_classif at 0x0000020AFDBC58C8>)

In [13]:
s.fit(X_train,y_train)

 100 101 102 104 107 108 171 173 174 175 178 179 180 182 183 184 186 187
 190 197 198 200 201 202 205 309 313 314 316 322 326 328 329 330 331 332
 334 345 347 349 350 361 363 371 372 376 377 388 389 390 391 392 393 394
 395 396 397 398 400 401 402 420 451 457 458 459 462 464 466 473 474 491
 497 534 535 551 557 599 601 602 603 604 605 609 610 611 612 615 618 636
 648 653 654 665 666 695 696 701 702 706 707 710 713 714 715 716 718 738
 739 740 769 770 771 772 773 774 775 776 777 778 862 863 866 867 870 871] are constant.
  f = msb / msw
  f = msb / msw


SelectKBest(k=1, score_func=<function f_classif at 0x0000020AFDBC58C8>)

In [14]:
t = s.transform(X_train)
t[0]

array([ 0.])

In [15]:
# Compare the min and max of each column. If they ate equal, column is constant.
def constantCheck(df):
    s1 = df.max()
    s2 = df.min()
    s = s1 == s2
    s = s[s == True]
    l = s.axes[0].tolist()
    return l

In [16]:
l_train = constantCheck(X_train)
if not l_train:
    print('Empty List')
else:
    print(l_train)

['ZNUMFRIG', 'ZTYPERFR1', 'ZSWAMPCOL']


In [17]:
l_test = constantCheck(X_test)
if not l_test:
    print('Empty List')
else:
    print(l_test)

['ZSTORIES', 'ZBASEHT2', 'ZPCTBSTHT', 'ZPCTATTHT', 'ZATTICUSE', 'ZSTOVEN', 'ZSTOVEFUEL', 'ZMICRO', 'ZAMTMICRO', 'ZOUTGRILLFUEL', 'ZTOPGRILL', 'ZTOASTER', 'ZNUMMEAL', 'ZMONRFRI2', 'ZTYPERFR3', 'ZREFRIGT3', 'ZMONRFRI3', 'ZUPRTFRZR', 'ZFREEZER2', 'ZDRYRUSE', 'ZTVONWDWATCH1', 'ZTVONWEWATCH1', 'ZCABLESAT2', 'ZCABLESAT3', 'ZTVAUDIOSYS3', 'ZTVONWDWATCH3', 'ZCOMPUTER', 'ZNUMPC', 'ZPCTYPE1', 'ZPCTYPE2', 'ZPCTYPE3', 'ZPCSLEEP3', 'ZDIPSTICK', 'ZAQUARIUM', 'ZSTEREO', 'ZNOCORD', 'ZANSMACH', 'ZFUELNOHEAT', 'ZDIFFUEL', 'ZNUMBERAC', 'ZTREESHAD', 'ZHIGHCEIL', 'ZPOOL', 'ZRECBATH', 'ZSLDDRS', 'ELOTHER', 'KRWATER', 'ZOTHERWAYLPG', 'ZKERODEL', 'ZKEROCASH', 'AGEHHMEMCAT13', 'AGEHHMEMCAT14', 'ZAGEHHMEMCAT7', 'ZAGEHHMEMCAT8', 'ZAGEHHMEMCAT9', 'ZAGEHHMEMCAT10', 'ZAGEHHMEMCAT11', 'ZAGEHHMEMCAT12', 'ZAGEHHMEMCAT13', 'ZAGEHHMEMCAT14', 'GALLONKERWTH', 'BTUKERWTH', 'DOLKERWTH']


In [18]:
for l in l_train:
    if l in l_test:
        print(l + ' in l_test')
    else:
        print(l + ' NOT in l_test')

ZNUMFRIG NOT in l_test
ZTYPERFR1 NOT in l_test
ZSWAMPCOL NOT in l_test


In [19]:
X_train = X_train.drop(l_test, axis = 1)

In [20]:
s.fit(X_train,y_train)

 103 167 168 171 173 174 177 183 184 290 294 296 302 306 308 309 310 322
 324 326 337 342 346 356 357 358 359 360 361 363 364 365 383 419 420 421
 425 427 434 435 452 458 495 496 516 559 560 561 565 566 567 570 573 602
 607 618 619 648 649 654 655 659 660 663 664 665 666 668 688 717 718 802
 805 808] are constant.
  f = msb / msw
  f = msb / msw


SelectKBest(k=1, score_func=<function f_classif at 0x0000020AFDBC58C8>)