In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression, SelectPercentile
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import variance_threshold
np.random.seed(42)


%matplotlib inline



In [2]:
df = pd.read_csv('../Data/train.csv', index_col='Id')

In [3]:
df.drop(['PID'], axis=1, inplace=True)

In [4]:
df_obj = df.select_dtypes(include = ['object'])
df_int = df.select_dtypes(exclude = ['object'])

In [5]:
df_obj_d = pd.get_dummies(df_obj, drop_first=True)

In [6]:
df_con = pd.concat((df_int, df_obj_d), axis=1)
df_con.fillna(0, inplace=True)

In [7]:
df_con.shape

(2051, 248)

In [8]:
X = df_con.drop(['SalePrice'], axis=1)
y = df['SalePrice'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
var = VarianceThreshold(threshold = 0.15)
ss = StandardScaler()

In [11]:
X_train.shape

(1538, 247)

In [12]:
X_train_var = var.fit_transform(X_train)
X_test_var = var.transform(X_test)

# preserve the column names
X_columns = X[X.columns[var.get_support()]]
X_columns.head()

In [60]:
X_train_var_sc = ss.fit_transform(X_train_var)
X_test_var_sc = ss.transform(X_test_var)

In [61]:
kbest = SelectKBest(f_regression, k=50)

X_train_kbest = kbest.fit_transform(X_train_var_sc, y_train)

X_test_kbest = kbest.transform(X_test_var_sc)

X_columns_sc = X_columns[X_columns.columns[kbest.get_support()]]

In [17]:
# feature selection - kbest, kpercentile, var thresh, ss
# scoring/testing - LR, ridge, lasso, elastic_net

In [18]:
ls = LassoCV()

In [19]:
ls.fit(X_train_kbest, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [20]:
ls.score(X_train_kbest, y_train)

0.8372692125270058

In [21]:
ls.score(X_test_kbest, y_test)

0.8827101757787291

In [64]:
ls.coef_

array([    0.        ,  3783.42927149, 23959.03636769,  5509.26960782,
        5725.29211771,  5295.79041871,   456.30001342,    -0.        ,
        4398.66961205,     0.        ,   291.18141683, 15881.63691399,
        4306.93477209,   987.47591699,     0.        ,  2315.51193028,
        3394.40101681,    -0.        ,  6131.47513579,  1964.17524192,
        1872.89862455,     0.        ,  4963.70445724,  2261.29199971,
       -1129.83848697,  -380.25030153,    -0.        ,  3208.28263129,
         102.72362312,   192.78615597, -2299.3186793 ,     0.        ,
       -2005.49005292, -5236.14647184,    -0.        ,  2445.13383171,
       -6779.02475416, -2961.40469098, -2430.66298063,  2142.28763945,
       -2415.75648131, -2288.04140945, -8259.99308926, -8617.43555551,
        2472.22543632,     0.        ,     0.        ,   -40.25048347,
        -284.80946928,  -681.59712913])

In [23]:
#ls.alphas_

In [24]:
#coef = pd.Series(ls.coef_, index = X_train.columns)


In [38]:
kaggle = pd.read_csv('../Data/test.csv', index_col='Id')
kaggle.drop(['PID'], axis=1, inplace=True)

In [39]:
kaggle_int = kaggle.select_dtypes(include = ['object'])
kaggle_int_d = pd.get_dummies(kaggle_int, drop_first=True)
kaggle.drop(['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
       'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',
       'Central Air', 'Electrical', 'Kitchen Qual', 'Functional',
       'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',
       'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature',
       'Sale Type', 'BsmtFin SF 2'], axis=1, inplace=True)
kaggle_con = pd.concat((kaggle, kaggle_int_d), axis=1)
kaggle.fillna(0, inplace=True)

#columns = X_columns.columns
#kaggle = kaggle[columns]

In [40]:
# Get missing columns in the training test
missing_cols = set( X_columns_sc.columns ) - set( kaggle.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    kaggle[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
kaggle = kaggle[X_columns_sc.columns]

In [50]:
missing_cols = set( X_columns.columns ) - set( kaggle.columns )
for c in missing_cols:
    kaggle[c] = 0
kaggle = kaggle[X_columns.columns]

In [51]:
ss.transform(kaggle)

array([[-1.31895561e+00,  3.25920062e-01, -1.49966698e-01, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01],
       [-1.31895561e+00, -1.71210297e+00, -5.41267526e-02, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01],
       [-1.31895561e+00,  1.01784004e-03,  1.31749032e+00, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01],
       ...,
       [-1.31895561e+00, -8.75918569e-02, -3.14369067e-01, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01],
       [-1.31895561e+00,  6.00909713e-02, -1.76138376e-01, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01],
       [-1.31895561e+00,  3.55456628e-01, -2.86722929e-01, ...,
        -5.80853944e-01, -6.35046984e-01, -8.35207605e-01]])

In [52]:
# Get missing columns in the training test
missing_cols = set( X_columns_sc.columns ) - set( kaggle.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    kaggle[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
kaggle = kaggle[X_columns_sc.columns]

In [53]:
preds = ls.predict(kaggle)

In [54]:
kaggle.shape

(879, 50)

In [55]:
submission = pd.DataFrame(preds, index=kaggle.index, columns=['SalePrice'])

In [56]:
submission.sort_index(inplace=True)

In [57]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,86936360.0
4,109028000.0
6,92279120.0
7,70059280.0
17,100994300.0


In [36]:
##submission.to_csv('./numeric_only_lr_2b.csv')