In [106]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# predictors
from sklearn import svm
from sklearn import linear_model

import pandas as pd
# Load the training and test tables from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Step 1: Preprocessing

In [107]:
# Keep references to both frames in one list so that a feature-engineering
# step can be applied to train and test in a single loop.
combine = [train, test]

In [108]:
# Display the column labels of the training frame.
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [136]:
# Summary statistics of the training frame; kept for later per-column lookups.
train_desc = train.describe()
# Show the dtype of every column as a plain {column: dtype} dict.
dict(train.dtypes.items())

{'Id': dtype('int64'),
 'MSSubClass': dtype('int64'),
 'MSZoning': dtype('O'),
 'LotFrontage': dtype('float64'),
 'LotArea': dtype('int64'),
 'Street': dtype('O'),
 'Alley': dtype('O'),
 'LotShape': dtype('O'),
 'LandContour': dtype('O'),
 'Utilities': dtype('O'),
 'LotConfig': dtype('O'),
 'LandSlope': dtype('O'),
 'Neighborhood': dtype('O'),
 'Condition1': dtype('O'),
 'Condition2': dtype('O'),
 'BldgType': dtype('O'),
 'HouseStyle': dtype('O'),
 'OverallQual': dtype('int64'),
 'OverallCond': dtype('int64'),
 'YearBuilt': dtype('int64'),
 'YearRemodAdd': dtype('int64'),
 'RoofStyle': dtype('O'),
 'RoofMatl': dtype('O'),
 'Exterior1st': dtype('O'),
 'Exterior2nd': dtype('O'),
 'MasVnrType': dtype('O'),
 'MasVnrArea': dtype('float64'),
 'ExterQual': dtype('O'),
 'ExterCond': dtype('O'),
 'Foundation': dtype('O'),
 'BsmtQual': dtype('O'),
 'BsmtCond': dtype('O'),
 'BsmtExposure': dtype('O'),
 'BsmtFinType1': dtype('O'),
 'BsmtFinSF1': dtype('int64'),
 'BsmtFinType2': dtype('O'),
 'BsmtF

Now looking at all features to find the best ones:

In [110]:
# Number of non-missing MiscVal entries, read from the describe() summary.
train_desc.loc["count", "MiscVal"]

1460.0

## Find the relevant features
For every feature we group the rows by the feature's value and compute the standard deviation of SalePrice within each group.

Assumption: If a feature has a low standard deviation here, it correlates strongly with SalePrice and is thus important.

This is only meaningful for features with few distinct values. Features with dtype float64 take a different value in almost every row, so each group contains very few rows and its standard deviation is naturally low, whatever the feature's relation to the price.

We will not combine features for now.
So features with the following can be excluded:
- low count (only a few rows have a value for the feature)
- many distinct values (but will be used later)
- high standard deviation mean

In [160]:
# Build one summary row per feature describing how tightly SalePrice clusters
# within each distinct value of that feature.
#
# Columns of the resulting `std_mean` table:
#   feature            column name in `train`
#   std_mean           plain mean of the per-value SalePrice standard deviations
#   weighted_std_mean  the same mean, weighted by how many rows carry each value
#   count              rows that have a value for the feature (groupby skips
#                      missing keys by default)
#   distinct_values    number of distinct values (= number of groups)
#   datatype           dtype of the column
#
# Rows are collected in a list and turned into a DataFrame once: growing a
# frame with DataFrame.append inside a loop is quadratic, and both
# DataFrame.append and Series.iteritems were removed in pandas 2.0.
summary_columns = ['feature', 'std_mean', 'weighted_std_mean', 'count', 'distinct_values', 'datatype']
records = []
for feature, datatype in train.dtypes.items():
    if feature == "Id":
        # Row identifier, not a predictor.
        continue
    # Row count and SalePrice standard deviation for each distinct value.
    groups = train.groupby(feature)['SalePrice'].agg(['count', 'std'])
    count = groups["count"].sum()
    records.append({
        "feature": feature,
        "std_mean": groups["std"].mean(),
        # Groups whose std is NaN (e.g. a single row) add nothing to the
        # numerator but still count in the denominator, as before.
        "weighted_std_mean": (groups["count"] * groups["std"]).sum() / count,
        "count": count,
        "distinct_values": groups.shape[0],
        "datatype": datatype,
    })
std_mean = pd.DataFrame(records, columns=summary_columns)

In [161]:
# Thresholds a feature must pass to stay in the ranking.
min_count = 750            # must have a value in more than this many rows
max_distinct_values = 50   # must have fewer than this many distinct values

# One boolean mask drives both the exclusion list and the filtered table, so
# the two can never disagree.
keep = (std_mean["count"] > min_count) & (std_mean["distinct_values"] < max_distinct_values)

# Features dropped because they have too few rows or too many distinct values.
excluded_features = std_mean.loc[~keep, "feature"].tolist()

# NOTE: std_mean is overwritten with the filtered table, so re-running this
# cell on its own reports no exclusions; rebuild std_mean first.
std_mean = std_mean[keep]

print("Excluded features: {}".format(excluded_features))
std_mean.sort_values(by='weighted_std_mean').head(25)

Excluded features: ['LotFrontage', 'LotArea', 'Alley', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice']


Unnamed: 0,feature,std_mean,weighted_std_mean,count,distinct_values,datatype
16,OverallQual,49593.533067,40742.442685,1460.0,10.0,int64
80,Neighborhood_mean,48571.130846,49039.324204,1460.0,25.0,float64
11,Neighborhood,48571.130846,49039.324204,1460.0,25.0,object
60,GarageCars,54859.864044,52063.413232,1460.0,5.0,int64
52,KitchenQual,64945.913412,54517.157369,1460.0,4.0,object
26,ExterQual,67472.218149,54678.135679,1460.0,4.0,object
29,BsmtQual,62402.133566,55565.22164,1423.0,4.0,object
48,FullBath,96320.287756,61001.593173,1460.0,4.0,int64
53,TotRmsAbvGrd,86341.239777,62242.638262,1460.0,12.0,int64
59,GarageFinish,68998.447987,64593.201079,1379.0,3.0,object


**What is done here:**
- for every feature:
    - the values of the feature get grouped together and a new column for the standard deviation is created
    - the average of the standard deviation is listed as "std_mean" in the table above
    - the weighted mean of the standard deviation (taking the count of each value into account) is listed under "weighted_std_mean" above

Lower values in the "weighted_std_mean" column indicate more important features as the correlation between these features and the price is higher.

In the following sections we will look at the most important features and take them into account when building the model.

## Neighborhood

In [118]:
# Per-neighborhood SalePrice statistics, most expensive neighborhoods first.
neighborhood_prices = train.groupby('Neighborhood')['SalePrice']
ov = (
    neighborhood_prices
    .agg(['count', 'mean', 'std'])
    .sort_values(by='mean', ascending=False)
    .reset_index()
)
ov.head()

Unnamed: 0,Neighborhood,count,mean,std
0,NoRidge,41,335295.317073,121412.65864
1,NridgHt,77,316270.623377,96392.544954
2,StoneBr,25,310499.0,112969.67664
3,Timber,38,242247.447368,64845.651549
4,Veenker,11,238772.727273,72369.317959


The standard deviation is quite low for some neighborhoods, so the neighborhood gives a good hint of how high the price there is. For now we replace the neighborhood name by the mean SalePrice of that neighborhood. For neighborhoods where the deviation is low, this feature is very informative.

In [119]:
# Lookup table: neighborhood name -> mean SalePrice of that neighborhood.
neighborhood_dict = ov.set_index('Neighborhood')['mean'].to_dict()

In [120]:
# Add the mean-price encoding of Neighborhood to every frame in `combine`;
# names that are missing from neighborhood_dict map to NaN.
for frame in combine:
    encoded = frame['Neighborhood'].map(neighborhood_dict)
    frame['Neighborhood_mean'] = encoded

## Create the sets and normalize

In [123]:
# Small hand-picked feature set for a first baseline (kept simple for now).
features = ["LotArea","YearRemodAdd","PoolArea","OverallQual","OverallCond","Neighborhood_mean","GarageCars"]
X = train[features]
y = train["SalePrice"]
# Fix the shuffle seed so the split, and every error reported for the models
# below, is reproducible across kernel restarts instead of changing per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 2: test different models and find the best one

## Linear SVR

In [126]:
# Support vector regression with a linear kernel; fit() returns the estimator.
clf = svm.SVR(kernel='linear').fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

# Hold-out errors: mean absolute error first, then mean squared error.
absolute_error = mean_absolute_error(y_test, clf_predictions)
squared_error = mean_squared_error(y_test, clf_predictions)
print(absolute_error, squared_error, sep="\n")

31872.499419771044
2542250266.8572183


## Ridge Regression

In [124]:
# Ridge regression with regularization strength 0.5; fit() returns the estimator.
reg = linear_model.Ridge(alpha=.5).fit(X_train, y_train)
reg_predictions = reg.predict(X_test)

# Hold-out errors: mean absolute error first, then mean squared error.
absolute_error = mean_absolute_error(y_test, reg_predictions)
squared_error = mean_squared_error(y_test, reg_predictions)
print(absolute_error, squared_error, sep="\n")

28946.226093075005
1752887928.4598505
