In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
import math
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor

# Load some data
full_train = pd.read_csv('train.csv', squeeze=True)
test = pd.read_csv('test.csv', squeeze=True)
sample_submission = pd.read_csv('sample_submission.csv', squeeze=True)

In [3]:
# Exploratory Data Analysis
print full_train.shape
print test.shape

(1460, 81)
(1459, 80)


In [4]:
# Categorical NA
naCounts = full_train.isna().sum()
naVals = naCounts[naCounts>0].sort_values(ascending=False)
naVals

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64

In [5]:
# NA Columns we are choosing to remove
naCols = [
"PoolQC",
"MiscFeature",
"Alley",
"Fence",
"FireplaceQu",
"LotFrontage",
"GarageYrBlt",
"GarageType",
"GarageFinish",
"GarageQual",
"GarageCond",
]

In [6]:
# Selecting Low Variety Categorical Columns
currentCounts = {}
for col in full_train.columns:
    currentCounts[col] = []
    for val in full_train[col]:
        if val not in currentCounts[col]:
            currentCounts[col].append(val)
for k in currentCounts:
    currentCounts[k] = len(currentCounts[k])
print np.sort(currentCounts.values())
gen_columns = []
for k in currentCounts:
    if currentCounts[k] <= 1000:
        gen_columns.append(k)
gen_columns

[   2    2    2    3    3    3    3    3    4    4    4    4    4    4
    4    4    4    4    5    5    5    5    5    5    5    5    5    5
    5    5    5    6    6    6    6    6    6    6    6    7    7    7
    7    8    8    8    8    8    9    9    9   10   12   12   15   15
   16   20   21   24   25   61   76  112  120  144  178  202  274  335
  369  417  441  637  663  721  753  780  861 1073 1460]


['3SsnPorch',
 'MasVnrType',
 'LotConfig',
 'BsmtUnfSF',
 'GarageQual',
 'Exterior1st',
 'GarageFinish',
 '2ndFlrSF',
 'HalfBath',
 'Utilities',
 'OverallCond',
 'Fence',
 'Electrical',
 'SalePrice',
 'BsmtQual',
 'SaleCondition',
 'HouseStyle',
 'GrLivArea',
 'YearRemodAdd',
 'SaleType',
 'MasVnrArea',
 'Foundation',
 'ExterCond',
 'BsmtFullBath',
 'BsmtCond',
 'HeatingQC',
 'LotFrontage',
 'MiscVal',
 'CentralAir',
 'BsmtExposure',
 'OverallQual',
 'KitchenAbvGr',
 'BsmtFinSF1',
 'LotShape',
 'BsmtHalfBath',
 'TotalBsmtSF',
 'BsmtFinSF2',
 'TotRmsAbvGrd',
 'MiscFeature',
 'PavedDrive',
 'OpenPorchSF',
 'LandSlope',
 'GarageYrBlt',
 'PoolArea',
 'FullBath',
 'YrSold',
 'ScreenPorch',
 'Exterior2nd',
 'Heating',
 'MSSubClass',
 'PoolQC',
 'WoodDeckSF',
 '1stFlrSF',
 'YearBuilt',
 'KitchenQual',
 'BsmtFinType2',
 'BsmtFinType1',
 'Condition2',
 'Condition1',
 'GarageType',
 'LandContour',
 'RoofMatl',
 'Neighborhood',
 'Fireplaces',
 'BedroomAbvGr',
 'MoSold',
 'BldgType',
 'ExterQual',

In [7]:
full_s_fields = [
    'MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'OverallQual',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Heating',
    'HeatingQC',
    'CentralAir',
    'Electrical',
    'Electrical',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'SaleType',
    'MSSubClass',
    'SaleCondition'
]
full_n_fields = [
    'LotFrontage',
    'LotArea',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'YrSold',
    'MoSold',
    'MiscVal',
]

s_fields = [ c for c in full_s_fields if not c in naCols and c in gen_columns]
n_fields = [c for c in full_n_fields if not c in naCols and c in gen_columns]

In [8]:
# Remove outliers greater than 3 standard deviations
new_train = full_train.copy()
to_drop = []
for r_idx, row in full_train.iterrows():
    for c_idx, val in enumerate(row):
        c_name = full_train.columns[c_idx]
        if c_name in n_fields:
            c_mean = full_train[c_name].mean()
            val_diff = np.abs(val - c_mean)
            c_std = full_train[c_name].std()
            if val_diff >= 3 * c_std:
                to_drop.append(r_idx)

train = new_train.drop(new_train.index[to_drop])
train.shape

(1056, 81)

In [9]:
# Set target and predictors
target = 'SalePrice'
predictors = [c for c in train.columns if not c == target and not c in naCols and c in gen_columns]

# Train/test split
X = train[predictors]
y = train[[target]]
X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=1)
X_test = test[predictors]

In [10]:
def make_csv(name, pred):
    new_df = test.copy()
    new_df['SalePrice'] = pred
    new_df[['Id', 'SalePrice']].to_csv(name, index=False)

In [12]:
def ensure_data_type(X):
    return X.apply(lambda col: col.astype(str))

def make_dictionaries(X):
    return X.to_dict(orient='records')

def select_categorical_features(X):
    return X[s_fields]

def select_numeric_features(X):
    return X[n_fields]

numeric_selector = FunctionTransformer(select_numeric_features, validate=False)
imp = Imputer(strategy='mean')

scf = FunctionTransformer(select_categorical_features, validate=False)
ed = FunctionTransformer(ensure_data_type, validate=False)
md = FunctionTransformer(make_dictionaries, validate=False)
fh = FeatureHasher(n_features=2000)


numeric_feature_pipeline = Pipeline(steps=[
    ('select', numeric_selector),
    ('impute', imp),
])

categorical_pipeline = Pipeline(steps=[
    ('select', scf),
    ('datatype', ed),
    ('dictionaries', md),
    ('vectorize', fh)
])
fu = FeatureUnion([
    ('categorical', categorical_pipeline),
    ('numeric', numeric_feature_pipeline),
])
fr = Pipeline([
    ('union', fu),
    ('reduction', TruncatedSVD(n_components=400)),
])
def root_mean_square_error(pred, actual):
    log_actual = np.log(actual)
    log_pred = np.log(pred)
    return np.sqrt(mean_squared_error(log_actual, log_pred))

In [20]:
# Ridge Log Sales Price
rid = Ridge()
search_params = {
    'preprocess__union__categorical__vectorize__n_features': [1000, 2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [200, 400],
    'predict__alpha': [10.0],
    'predict__fit_intercept': [False],
    'predict__solver': ["cholesky"],
    'predict__random_state': [1],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', rid)
])

round_num = 2
grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print grid_search.best_params_
print grid_search.score(X_dev, np.log(y_dev).round(round_num))
print root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num))
rid_pred = grid_search.predict(X_test)

  """


{'predict__fit_intercept': False, 'preprocess__reduction__n_components': 200, 'predict__random_state': 1, 'predict__alpha': 10.0, 'preprocess__union__categorical__vectorize__n_features': 2000, 'preprocess__union__numeric__impute__strategy': 'mean', 'predict__solver': 'cholesky'}
0.9157412510520612
0.009800922538193419


In [21]:
make_csv('logPrice_ridge.csv', np.exp(rid_pred))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
# Gradient Boosting

gb = GradientBoostingRegressor(subsample=.7, max_depth=6, learning_rate = .05, 
                               n_estimators=500, max_features='auto')

search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [100],
    'predict__n_estimators': [1000],
    'predict__max_depth': [6],
    'predict__learning_rate': [0.01],
    'predict__subsample': [0.7],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', gb)
])

grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print grid_search.best_params_
print grid_search.score(X_dev, np.log(y_dev).round(round_num))
print root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num))
gb_pred = grid_search.predict(X_test)

  """
  y = column_or_1d(y, warn=True)


{'predict__max_depth': 6, 'predict__n_estimators': 1000, 'preprocess__reduction__n_components': 100, 'predict__subsample': 0.7, 'preprocess__union__categorical__vectorize__n_features': 2000, 'preprocess__union__numeric__impute__strategy': 'mean', 'predict__learning_rate': 0.01}
0.8753608441530928
0.011782268808122283


In [18]:
make_csv('logPrice_GBTest.csv', np.exp(gb_pred))

In [17]:
import xgboost

xgb = XGBRegressor(objective='reg:linear')

search_params = {
    'preprocess__union__categorical__vectorize__n_features': [2000],
    'preprocess__union__numeric__impute__strategy': ['mean'],
    'preprocess__reduction__n_components': [100],
    'predict__n_estimators': [1000],
    'predict__max_depth': [6],
    'predict__learning_rate': [0.01],
    'predict__subsample': [0.7],
    'predict__objective': ['reg:linear'],
}
model_pipe = Pipeline(steps=[
    ('preprocess', fr),
    ('predict', xgb)
])

grid_search = GridSearchCV(model_pipe, search_params, cv=5)
grid_search.fit(X_train, np.log(y_train).round(round_num))
print grid_search.best_params_
print grid_search.score(X_dev, np.log(y_dev).round(round_num))
print root_mean_square_error(grid_search.predict(X_dev), np.log(y_dev).round(round_num))
xgb_pred = grid_search.predict(X_test)



  """


{'predict__max_depth': 6, 'predict__n_estimators': 1000, 'preprocess__union__categorical__vectorize__n_features': 2000, 'predict__subsample': 0.7, 'predict__objective': 'reg:linear', 'preprocess__reduction__n_components': 100, 'preprocess__union__numeric__impute__strategy': 'mean', 'predict__learning_rate': 0.01}
0.8699604705466604
0.012047315165966077


In [19]:
make_csv('logPrice_XGBTest.csv', np.exp(xgb_pred))