In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
train_df = pd.read_csv("train_2016_v2.csv", parse_dates=["transactiondate"])

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90275 entries, 0 to 90274
Data columns (total 3 columns):
parcelid           90275 non-null int64
logerror           90275 non-null float64
transactiondate    90275 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 2.1 MB


In [4]:
test_df = pd.read_csv("sample_submission.csv")

In [5]:
test_df = test_df.rename(index=str, columns={"ParcelId": "parcelid"})

In [6]:
properties_2016_df = pd.read_csv("properties_2016.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
train_df = pd.merge(train_df, properties_2016_df, on='parcelid', how='left')

In [8]:
test_df = pd.merge(test_df, properties_2016_df, on='parcelid', how='left')

In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2985217 entries, 0 to 2985216
Data columns (total 64 columns):
parcelid                        int64
201610                          int64
201611                          int64
201612                          int64
201710                          int64
201711                          int64
201712                          int64
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6  

In [10]:
# to make this notebook's output identical at every run
np.random.seed(42)

### Prepare the data for Machine Learning algorithms

In [11]:
#remove categorical variables
train_df = train_df.drop(['propertyzoningdesc', 'propertycountylandusecode', 'transactiondate', 'parcelid'], axis = 1)

In [12]:
train_df = train_df.drop(['taxdelinquencyflag', 'fireplaceflag'], axis = 1)

In [13]:
test_df = test_df.drop(['propertyzoningdesc', 'propertycountylandusecode', 
                        'parcelid','taxdelinquencyflag', 'fireplaceflag'], axis = 1)

In [14]:
num_attribs = list(train_df)
num_attribs

['logerror',
 'airconditioningtypeid',
 'architecturalstyletypeid',
 'basementsqft',
 'bathroomcnt',
 'bedroomcnt',
 'buildingclasstypeid',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'decktypeid',
 'finishedfloor1squarefeet',
 'calculatedfinishedsquarefeet',
 'finishedsquarefeet12',
 'finishedsquarefeet13',
 'finishedsquarefeet15',
 'finishedsquarefeet50',
 'finishedsquarefeet6',
 'fips',
 'fireplacecnt',
 'fullbathcnt',
 'garagecarcnt',
 'garagetotalsqft',
 'hashottuborspa',
 'heatingorsystemtypeid',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'poolsizesum',
 'pooltypeid10',
 'pooltypeid2',
 'pooltypeid7',
 'propertylandusetypeid',
 'rawcensustractandblock',
 'regionidcity',
 'regionidcounty',
 'regionidneighborhood',
 'regionidzip',
 'roomcnt',
 'storytypeid',
 'threequarterbathnbr',
 'typeconstructiontypeid',
 'unitcnt',
 'yardbuildingsqft17',
 'yardbuildingsqft26',
 'yearbuilt',
 'numberofstories',
 'structuretaxvaluedollarcnt',
 'taxvaluedollarcnt',
 'asses

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [16]:
train_df_labels = train_df['logerror'].values

In [17]:
train_df = train_df.drop(['logerror'], axis = 1)

In [29]:
train_df_labels

array([ 0.0276, -0.1684, -0.004 , ..., -0.2679,  0.0602,  0.4207])

In [18]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split = \
    train_test_split(train_df, train_df_labels, test_size=0.3, random_state=42)



In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, confusion_matrix, recall_score
from sklearn.ensemble import RandomForestRegressor

num_attribs = list(train_df)

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),        
        ('KBest', SelectKBest(k = 10)),
        ('pca', PCA(n_components = 5)),
        ('reg', RandomForestRegressor(random_state=42))
         ])
        

In [20]:
param_grid = dict(KBest__k = range(1,35),
                  pca__n_components=range(1, 20),
                  reg__min_samples_split = range(1,100))

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
sss = StratifiedShuffleSplit(100, test_size=0.5, random_state=42)

grid_search = GridSearchCV(num_pipeline, param_grid=param_grid, scoring='neg_mean_squared_error', cv = sss)

In [30]:
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
sss = StratifiedShuffleSplit(100, test_size=0.5, random_state=42)
param_distribs = {
    'KBest__k' : randint(low=1, high=53),
    'pca__n_components' : randint(low=1, high=30),
    'reg__min_samples_split': randint(low=1, high=100)
    }

In [33]:
from sklearn.model_selection import RandomizedSearchCV
#Try from sklearn.cross_validation import StratifiedKFold... keeps balancing constant

rnd_search = RandomizedSearchCV(num_pipeline, param_distributions=param_distribs,
                                n_iter=5, cv=5, scoring='neg_mean_squared_error', random_state=42)

In [34]:
rnd_search_fit = rnd_search.fit(X_train, y_train)

  f = msb / msw


In [35]:
rnd_search.best_params_

{u'KBest__k': 21, u'pca__n_components': 7, u'reg__min_samples_split': 83}

In [36]:
best_parameters = rnd_search.best_params_

In [37]:
rnd_search.best_estimator_

Pipeline(steps=[(u'selector', DataFrameSelector(attribute_names=['airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid', 'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'fi...estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False))])

In [38]:
final_model = rnd_search.best_estimator_

In [39]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.163406453547 {u'KBest__k': 39, u'pca__n_components': 20, u'reg__min_samples_split': 93}
0.163378292733 {u'KBest__k': 15, u'pca__n_components': 11, u'reg__min_samples_split': 72}
0.162737089699 {u'KBest__k': 21, u'pca__n_components': 7, u'reg__min_samples_split': 83}
0.163112535743 {u'KBest__k': 23, u'pca__n_components': 11, u'reg__min_samples_split': 75}
0.166514939282 {u'KBest__k': 24, u'pca__n_components': 21, u'reg__min_samples_split': 24}


In [43]:
from sklearn.externals import joblib

joblib.dump(final_model, "final_model_zillow5.pkl")

[u'final_model_zillow5.pkl']

In [44]:
joblib.dump(best_parameters, "best_param_zillow5.pkl")

[u'best_param_zillow5.pkl']

In [None]:
RF_rand_final_predictions = final_model.predict(test_df)

In [153]:
submission_file = pd.read_csv('sample_submission.csv') 
for column in submission_file.columns[submission_file.columns != 'ParcelId']:
    submission_file[column] = RF_rand_final_predictions

In [154]:
submission_file.to_csv('RF_final_predictions.csv', index=False, float_format='%.4f')