In [358]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

In [359]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
null_counts = df_orig.isnull().sum()
print(null_counts[null_counts > 0].sort_values())


(1460, 81)
Null Counts:
Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtExposure      38
BsmtFinType2      38
GarageCond        81
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64


In [360]:
df = df_orig.drop(columns=["Id", "SalePrice"])
print(df.shape)

(1460, 79)


In [361]:
def fill_nulls(df):
    df["PoolQC"].fillna("None", inplace=True)
    df["MiscFeature"].fillna("None", inplace=True)
    df["Alley"].fillna("None", inplace=True)
    df["Fence"].fillna("None", inplace=True)
    df["FireplaceQu"].fillna("None", inplace=True)
    fill_median = lambda x: x.fillna(x.median())
    df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].apply(fill_median)
    df["GarageCond"].fillna("None", inplace=True)
    df["GarageQual"].fillna("None", inplace=True)
    df["GarageFinish"].fillna("None", inplace=True)
    df["GarageType"].fillna("None", inplace=True)
    df["GarageYrBlt"].fillna(0, inplace=True)
    df["BsmtQual"].fillna("None", inplace=True)
    df["BsmtCond"].fillna("None", inplace=True)
    df["BsmtFinType1"].fillna("None", inplace=True)
    df["BsmtExposure"].fillna("None", inplace=True)
    df["BsmtFinType2"].fillna("None", inplace=True)
    df["MasVnrType"].fillna(df["MasVnrType"].mode()[0], inplace=True)
    df["MasVnrArea"].fillna(0, inplace=True)
    df["Electrical"].fillna(df["Electrical"].mode()[0], inplace=True)
    df["Utilities"].fillna(df["Utilities"].mode()[0], inplace=True)
    df["Functional"].fillna("Typ", inplace=True)
    df["KitchenQual"].fillna(df["KitchenQual"].mode()[0], inplace=True)
    df["Exterior1st"].fillna(df["Exterior1st"].mode()[0], inplace=True)
    df["Exterior2nd"].fillna(df["Exterior2nd"].mode()[0], inplace=True)
    df["SaleType"].fillna(df["SaleType"].mode()[0], inplace=True)
    df["MSSubClass"].fillna(df["MSSubClass"].mode()[0], inplace=True)
    df["GarageYrBlt"].fillna(0, inplace=True)
    df["GarageArea"].fillna(0, inplace=True)
    df["GarageCars"].fillna(0, inplace=True)
    df["BsmtFinSF1"].fillna(0, inplace=True)
    df["BsmtFinSF2"].fillna(0, inplace=True)
    df["BsmtUnfSF"].fillna(0, inplace=True)
    df["TotalBsmtSF"].fillna(0, inplace=True)
    df["BsmtFullBath"].fillna(0, inplace=True)
    df["BsmtHalfBath"].fillna(0, inplace=True)
    df["MSZoning"].fillna(df["MSZoning"].mode()[0], inplace=True)
    
    
fill_nulls(df)
print("Null Counts:")
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0].sort_values())

Null Counts:
Series([], dtype: int64)


In [362]:
def numerical_to_categorical(df):
    df = df.replace({"MSSubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 
                                           50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 
                                           80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 
                                           150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"},
                        "MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun",
                                       7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"}
                          })
    return df


df = numerical_to_categorical(df)
print(df[["MSSubClass", "MoSold"]].head())

  MSSubClass MoSold
0       SC60    Feb
1       SC20    May
2       SC60    Sep
3       SC70    Feb
4       SC60    Dec


In [363]:
def standardize_numerical_features(df):
    numerical_features = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df


df = standardize_numerical_features(df)

In [364]:
def to_ordinal(df, feature, categories):
    ordinal_encoder = OrdinalEncoder([categories])
    df[feature] = ordinal_encoder.fit_transform(df[[feature]])


def categorical_to_ordinal(df):
    to_ordinal(df, "Alley", ["None", "Grvl", "Pave"])
    to_ordinal(df, "Fence", ["None", "MnWw", "GdWo", "MnPrv", "GdPrv"])
    to_ordinal(df, "Street", ["Grvl", "Pave"])
    to_ordinal(df, "Utilities", ["NoSeWa", "AllPub"])
    to_ordinal(df, "LandSlope", ["Gtl", "Mod", "Sev"])
    to_ordinal(df, "ExterQual", ["Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "ExterCond", ["Po","Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "BsmtQual", ["None", "Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "BsmtCond", ["None", "Po", "Fa", "TA", "Gd"])
    to_ordinal(df, "BsmtExposure", ["None", "No", "Mn", "Av", "Gd"])
    to_ordinal(df, "BsmtFinType1", ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"])
    to_ordinal(df, "BsmtFinType2", ["None", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"])
    to_ordinal(df, "HeatingQC", ["Po", "Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "Functional", ["Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"])
    to_ordinal(df, "FireplaceQu", ["None", "Po", "Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "KitchenQual", ["Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "GarageQual", ["None", "Po", "Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "GarageFinish", ["None", "Unf", "RFn", "Fin"])
    to_ordinal(df, "GarageCond",  ["None", "Po", "Fa", "TA", "Gd", "Ex"])
    to_ordinal(df, "PavedDrive",  ["N", "P", "Y"])
    
    
categorical_to_ordinal(df)

In [365]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 225)


In [366]:
X = df
y = df_orig.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1168, 225) (1168,)
(292, 225) (292,)


In [367]:
regr = RandomizedSearchCV(
        n_iter=20,
        random_state=1,
        estimator=SVR(kernel='rbf'),
        param_distributions={
            "gamma": uniform(0.001, 0.01),
            "C": uniform(100000, 100000),
            'epsilon': uniform(1000, 10000),
        },
        cv=5,
        n_jobs=-1,
        verbose=100,
        scoring="neg_mean_squared_log_error"
        )
regr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to new file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int64).
Pickling array (shape=(234,), dtype=int64).
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).


Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.6min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int64).
Pickling array (shape=(234,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.6min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array 

Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed:  2.0min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int64).
Pickling array (shape=(234,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.1min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array 

Pickling array (shape=(935,), dtype=int64).
Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int64).
Pickling array (shape=(234,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed:  2.7min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.1min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int64).
Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:  3.1min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array

Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int64).
Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:  3.5min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int64).
Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:  3.5min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef

[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  3.8min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int64).
Pickling array (shape=(233,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:  3.9min
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(225, 1168), dtype=float64) to old file /tmp/joblib_memmapping_folder_5213_8921051590/5213-140624014258360-02ecd1ef827d4edfb97f7f05a2faaaa8.pkl
Pickling array (shape=(225,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=20, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe594936e10>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe594936b38>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe594938d68>},
                   pre_dispatch='2*n_jobs', random_state=1, refit=True,
                   return_train_score=False,
                   scoring='neg_mean_squared_log_error', verbose=100)

In [368]:
print("Train score: ", np.sqrt(-regr.score(X_train, y_train)))
print("Test score: ", np.sqrt(-regr.score(X_test, y_test)))
print(regr.best_params_)

Train score:  0.0875988238309634
Test score:  0.13258192529959778
{'C': 167883.5532939891, 'epsilon': 3116.2811600005903, 'gamma': 0.0036554665937222623}


In [369]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["Id"])
fill_nulls(df_test)
df_test = numerical_to_categorical(df_test)
df_test = standardize_numerical_features(df_test)
categorical_to_ordinal(df_test)
df_test = encode_categorical_features(df_test, onehot)
predictions = regr.predict(df_test)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)