In [5]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# predictors
from sklearn import svm
from sklearn import linear_model

import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Step 1: Preprocessing

In [6]:
combine = [train, test]

In [7]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [77]:
train_desc = train.describe()
train.mean()

Id                      730.500000
MSSubClass               56.897260
LotFrontage              70.049958
LotArea               10516.828082
OverallQual               6.099315
OverallCond               5.575342
YearBuilt              1971.267808
YearRemodAdd           1984.865753
MasVnrArea              103.685262
BsmtFinSF1              443.639726
BsmtFinSF2               46.549315
BsmtUnfSF               567.240411
TotalBsmtSF            1057.429452
1stFlrSF               1162.626712
2ndFlrSF                346.992466
LowQualFinSF              5.844521
GrLivArea              1515.463699
BsmtFullBath              0.425342
BsmtHalfBath              0.057534
FullBath                  1.565068
HalfBath                  0.382877
BedroomAbvGr              2.866438
KitchenAbvGr              1.046575
TotRmsAbvGrd              6.517808
Fireplaces                0.613014
GarageYrBlt            1978.506164
GarageCars                1.767123
GarageArea              472.980137
WoodDeckSF          

Now looking at all features to find the best ones:

In [55]:
train_desc["MiscVal"]["count"]

1460.0

## Find the relevant features

### Looking at all features
Here we find the standard deviation of all features concerning the SalePrice.

Assumption: If a feature has a low standard deviation here, it correleates strongly with SalePrice and is thus important.

In [80]:
std_mean = pd.DataFrame(data={'feature':[], 'std_mean':[], 'count':[]})
std_mean.set_index("feature")
print(std_mean)
for feature in train.columns:
    if feature == "Id":
        continue
    ov = train.groupby(feature)['SalePrice'].agg(['count','std'])
    # TODO: weighted mean
    mean = ov['std'].mean()
    count = train[feature].count()
    std_mean = std_mean.append({"feature":feature,"std_mean":mean,"count":count},ignore_index=True)
std_mean.sort_values(by='std_mean').head(25)

Empty DataFrame
Columns: [feature, std_mean, count]
Index: []


Unnamed: 0,feature,std_mean,count
79,SalePrice,0.0,1460.0
44,LowQualFinSF,30323.979345,1460.0
3,LotArea,33415.205623,1460.0
45,GrLivArea,34078.409223,1460.0
5,Alley,36575.577988,91.0
43,2ndFlrSF,36802.833769,1460.0
51,KitchenAbvGr,38899.522111,1460.0
38,Heating,39138.061841,1460.0
63,GarageCond,41581.133819,1379.0
61,GarageArea,42072.13454,1460.0


Lower values here indicate a lower mean in the feature concerning the standard deviation. We will now look at 

### Neighborhood

In [13]:
# train[['Neighborhood', 'SalePrice']].groupby(['Neighborhood'], as_index=False).mean().sort_values(by='SalePrice', ascending=False)
ov = train.groupby('Neighborhood')['SalePrice'].agg(['count','mean','std']).sort_values(by='mean', ascending=False).reset_index()
print(ov)

   Neighborhood  count           mean            std
0       NoRidge     41  335295.317073  121412.658640
1       NridgHt     77  316270.623377   96392.544954
2       StoneBr     25  310499.000000  112969.676640
3        Timber     38  242247.447368   64845.651549
4       Veenker     11  238772.727273   72369.317959
5       Somerst     86  225379.837209   56177.555888
6       ClearCr     28  212565.428571   50231.538993
7       Crawfor     51  210624.725490   68866.395472
8       CollgCr    150  197965.773333   51403.666438
9       Blmngtn     17  194870.882353   30393.229219
10      Gilbert     79  192854.506329   35986.779085
11       NWAmes     73  189050.068493   37172.218106
12      SawyerW     59  186555.796610   55651.997820
13      Mitchel     49  156270.122449   36486.625334
14        NAmes    225  145847.080000   33075.345450
15      NPkVill      9  142694.444444    9377.314529
16        SWISU     25  142591.360000   32622.917679
17      Blueste      2  137500.000000   19091.

The standard deviatiation is quite low for some neighborhoods, so this gives a good hint of how high the price there is. Will replace the Neighborhood name by the mean for now. In some cases this feature is very good because the deviation is not that high.

In [14]:
# series to dict:
neighborhood_dict = ov[["Neighborhood","mean"]].set_index('Neighborhood').to_dict()['mean']

In [15]:
for dataset in combine:
    dataset['Neighborhood_mean'] = dataset['Neighborhood'].map(neighborhood_dict)

    MoSold  count           mean            std
0        9     63  195683.206349   83149.017391
1       11     79  192210.911392   83517.711202
2       12     59  186518.966102   69495.155055
3        7    234  186331.192308   91772.125395
4        8    122  184651.827869   73215.986647
5        1     58  183256.258621  121381.083473
6        3    106  183253.924528   86488.445599
7       10     89  179563.977528   75736.008303
8        2     52  177882.000000   52960.863135
9        6    253  177395.735178   69453.085881
10       5    204  172307.269608   68614.545198
11       4    141  171503.262411   77147.323106


## Create the sets and normalize

In [125]:
# for now (simplicity):
features = ["LotArea","YearRemodAdd","PoolArea","OverallQual","OverallCond","Neighborhood_mean"]
X = train[features]
y = train["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Step 2: test different models and find the best one

## Linear SVR

In [126]:
clf = svm.SVR(kernel = 'linear')
clf.fit(X_train,y_train)
clf_predictions = clf.predict(X_test)
absolute_error = mean_absolute_error(y_test,clf_predictions)
squared_error = mean_squared_error(y_test,clf_predictions)
print(absolute_error)
print(squared_error)

31872.499419771044
2542250266.8572183


## Ridge Regression

In [128]:
reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train,y_train)
reg_predictions = reg.predict(X_test)
absolute_error = mean_absolute_error(y_test,reg_predictions)
squared_error = mean_squared_error(y_test,reg_predictions)
print(absolute_error)
print(squared_error)

27587.501003060384
1668361830.5254564
