In [121]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [122]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0])


(1460, 81)
Null Counts:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [123]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id", "SalePrice"])
print(df.shape)

(1460, 75)


In [124]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [125]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 203)


In [126]:
def standardize_numerical_features(df):
    numerical_features = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

df = standardize_numerical_features(df)

In [127]:
scaler_y = StandardScaler()
y = scaler_y.fit_transform(df_orig.SalePrice.to_numpy().reshape(-1, 1))

In [128]:
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1168, 203) (1168, 1)
(292, 203) (292, 1)


In [129]:
regr = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [7, 8, 9, 10],
            'epsilon': [0.02, 0.03, 0.04, 0.05],
            'gamma': [0.007, 0.0008, 0.0009, 0.001, 0.002]
        },
        cv=5,
        n_jobs=-1)
regr.fit(X_train, y_train)
print("Train score: ", regr.score(X_train, y_train))
print("Test score: ", regr.score(X_test, y_test))
print(regr.best_params_)

  y = column_or_1d(y, warn=True)


Train score:  0.9630688259360346
Test score:  0.8879766503285257
{'C': 9, 'epsilon': 0.03, 'gamma': 0.0009}


In [130]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = encode_categorical_features(df_test, onehot)
df_test = standardize_numerical_features(df_test)
predictions = regr.predict(df_test)
predictions = scaler_y.inverse_transform(predictions)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)