In [125]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [126]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0])


(1460, 81)
Null Counts:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [127]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id",])
print(df.shape)

(1460, 76)


In [128]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [129]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 204)


In [130]:
def standardize_numerical_features(df):
    numerical_features = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

df = standardize_numerical_features(df)

In [131]:
y = df.SalePrice
X = df.drop(columns=["SalePrice"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1095, 203) (1095,)
(365, 203) (365,)


In [132]:
regr = SVR(gamma=0.0001, C=10.0, epsilon=0.001, kernel="rbf")
#regr = SVR(gamma="scale", C=1.0, epsilon=0.2, kernel="rbf")
regr.fit(X_train, y_train)
print("Train score: ", regr.score(X_train, y_train))
print("Test score: ", regr.score(X_test, y_test))

Train score:  0.877479044783482
Test score:  0.868113133100851


In [133]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = encode_categorical_features(df_test, onehot)
df_test = standardize_numerical_features(df_test)
predictions = regr.predict(df_test)
print(predictions)
#df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
#df_submit.to_csv('submission.csv', index=False)

[-0.65140524 -0.14765323  0.09547955 ...  0.06193185 -0.79110928
  0.64962515]
