In [197]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform

In [198]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0].sort_values())


(1460, 81)
Null Counts:
Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtExposure      38
BsmtFinType2      38
GarageCond        81
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64


In [199]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id", "SalePrice"])
print(df.shape)

(1460, 75)


In [200]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [201]:
def to_categorical(df):
    df = df.replace({"MSSubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 
                                           50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 
                                           80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 
                                           150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"},
                           "MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun",
                                       7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"}
                          })
    return df


df = to_categorical(df)
print(df[["MSSubClass", "MoSold"]].head())

  MSSubClass MoSold
0       SC60    Feb
1       SC20    May
2       SC60    Sep
3       SC70    Feb
4       SC60    Dec


In [202]:
def standardize_numerical_features(df):
    numerical_features = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

df = standardize_numerical_features(df)

In [203]:
# Convert categorical to ordinal
def to_ordinal(df, feature, categories):
    ordinal_encoder = OrdinalEncoder([categories])
    df[feature] = ordinal_encoder.fit_transform(df[[feature]])


def to_ordinal_all(df):
    to_ordinal(df, "Street", ["Grvl", "Pave"])
    to_ordinal(df, "Utilities", ["NoSeWa", "AllPub"])
    to_ordinal(df, "LandSlope", ['Gtl', 'Mod', 'Sev'])
    to_ordinal(df, "ExterQual", ['Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "ExterCond", ['Po','Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "BsmtQual", ['Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "BsmtCond", ['Po', 'Fa', 'TA', 'Gd'])
    to_ordinal(df, "BsmtExposure", ['No', 'Mn', 'Av', 'Gd'])
    to_ordinal(df, "BsmtFinType1", ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
    to_ordinal(df, "HeatingQC", ['Po', 'Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "BsmtFinType2", ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
    to_ordinal(df, "Functional", ['Sev', 'Maj1', 'Maj2', 'Mod', 'Min2', 'Min1', 'Typ'])
    to_ordinal(df, "FireplaceQu", ['Po', 'Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "KitchenQual", ['Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "GarageQual", ['Po', 'Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "GarageFinish", ['Unf', 'RFn', 'Fin'])
    to_ordinal(df, "GarageCond",  ['Po', 'Fa', 'TA', 'Gd', 'Ex'])
    to_ordinal(df, "PavedDrive",  ['N', 'P', 'Y'])
    
    
to_ordinal_all(df)

In [204]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 217)


In [205]:
X = df
y = df_orig.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1168, 217) (1168,)
(292, 217) (292,)


In [206]:
regr = RandomizedSearchCV(
        n_iter=20,
        random_state=1,
        estimator=SVR(kernel='rbf'),
        param_distributions={
            "gamma": uniform(0.001, 0.01),
            "C": uniform(100000, 200000),
            'epsilon': uniform(1000, 20000),
        },
        cv=5,
        n_jobs=-1,
        verbose=100,
        scoring="neg_mean_squared_log_error"
        )
regr.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to new file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=ob

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.8s
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a

Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    2.2s
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.2s
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a

Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.4s
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(935,), dtype=int32).
Pickling array (shape=(233,), dtype=int32).
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    3.5s
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\

Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(1168,), dtype=int64).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(934,), dtype=int32).
Pickling array (shape=(234,), dtype=int32).
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64).
Memmapping (shape=(217, 1168), dtype=float64) to old file C:\Users\aubadmin\AppData\Local\Temp\joblib_memmapping_folder_8688_2528389046\8688-280488032-a4717bcece8e4d3f93af7a3b58fb41b7.pkl
Pickling array (shape=(217,), dtype=object).
Pickling array (shape=(1168,), dtype=int64)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=20, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000016131710>,
                                        'epsilon': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000016131518>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000161312B0>},
                   pre_dispatch='2*n_jobs', random_state=1, refit=True,
                   return_train_score=False,
                   scoring='neg_mean_squared_log_error', verbose=100)

In [207]:
print("Train score: ", np.sqrt(-regr.score(X_train, y_train)))
print("Test score: ", np.sqrt(-regr.score(X_test, y_test)))
print(regr.best_params_)

Train score:  0.10583198840720653
Test score:  0.1345613638446926
{'C': 160466.51452636794, 'epsilon': 3935.117816342261, 'gamma': 0.0019233859476879781}


In [208]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = to_categorical(df_test)
df_test = standardize_numerical_features(df_test)
to_ordinal_all(df_test)
df_test = encode_categorical_features(df_test, onehot)
predictions = regr.predict(df_test)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)