In [1]:
import os
import pandas as pd
os.getcwd()
os.chdir('/home/ghk829/zillow')
print('Loading Properties ...')
properties2016 = pd.read_csv('./properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv('./properties_2017.csv', low_memory = False)

print('Loading Train ...')
train2016 = pd.read_csv('./train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv('./train_2017.csv', parse_dates=['transactiondate'], low_memory=False)
test_df = pd.read_csv('./sample_submission.csv', low_memory=False)
properties = pd.read_csv('./properties_2016.csv', low_memory=False)
# field is named differently in submission
test_df['parcelid'] = test_df['ParcelId']

Loading Properties ...
Loading Train ...


In [2]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 +df["transactiondate"].dt.quarter
    return df

In [3]:
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

In [4]:
import numpy as np
print('Merge Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

print('Tax Features 2017  ...')
train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan

print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(test_df[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')


Merge Train with Properties ...
Tax Features 2017  ...
Concat Train 2016 & 2017 ...


In [5]:
import gc
gc.collect()

75

In [6]:
print('Remove missing data fields ...')

missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))

del num_rows, missing_perc_thresh
gc.collect();

Remove missing data fields ...
We exclude: 15


In [7]:
print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))


Remove features with one unique value !!
We exclude: 9


In [8]:
print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))

Define training features !!
We use these for training: 43


In [15]:
print ("Replacing NaN values by 0 !!")
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)


print ("remove outliers")
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.419 ]

train_df=train_df.assign(diff_cal_fin=lambda x: x.calculatedfinishedsquarefeet-x.finishedfloor1squarefeet)
new_cols=['finishedfloor1squarefeet',
'garagetotalsqft',
'lotsizesquarefeet',
'poolsizesum']
for i,value in enumerate(new_cols):
    print(str(i)+":"+value)
    train_df=eval("train_df.assign(new_{}=lambda x: x.{} /x.calculatedfinishedsquarefeet)".format(value,value))

VAL_SPLIT_DATE = '2016-09-15'   # Cutoff date for validation split
select_qtr4 = train_df["transactiondate"] >= VAL_SPLIT_DATE
valid = train_df[select_qtr4]
train = train_df[~select_qtr4]
valid.drop(["transactiondate"], inplace=True, axis=1)
train.drop(["transactiondate"], inplace=True, axis=1)
print("Train: ", train.shape)
print("Test: ", valid.shape)

test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)

Replacing NaN values by 0 !!
remove outliers
0:finishedfloor1squarefeet
1:garagetotalsqft
2:lotsizesquarefeet
3:poolsizesum


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


('Train: ', (74478, 68))
('Test: ', (89999, 68))


In [10]:
train_features.remove('transactiondate')
print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])


Define categorial features !!
Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear']


In [11]:
train_features_new = train_features+['finishedfloor1squarefeet',
'garagetotalsqft',
'lotsizesquarefeet',
'poolsizesum','diff_cal_fin']

In [12]:
from catboost import CatBoostRegressor
from tqdm import tqdm
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [17]:
X_train = train[train_features_new]
y_train = train.logerror
X_valid = valid[train_features_new]
y_valid = valid.logerror
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

((74478, 47), (74478,))
((89999, 47), (89999,))


In [18]:
num_ensembles = 2
tree_counts = []
MAEs = []
for i in range(num_ensembles):
    # TODO(you): Use CV, tune hyperparameters
    model = CatBoostRegressor(
        iterations=10, learning_rate=0.004,
        depth=6, l2_leaf_reg=15,
        bagging_temperature=8,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        X_train, y_train,
        eval_set=[X_valid, y_valid],
        cat_features=cat_feature_inds,
#        verbose=True,
        use_best_model=True
        )
    tree_counts.append( model.tree_count_ )
    MAEs.append( mean_absolute_error(y_valid, model.predict(X_valid)) )   

In [20]:
test_df=test_df.assign(diff_cal_fin=lambda x: x.calculatedfinishedsquarefeet-x.finishedfloor1squarefeet)
new_cols=['finishedfloor1squarefeet',
'garagetotalsqft',
'lotsizesquarefeet',
'poolsizesum']
for i,value in enumerate(new_cols):
    print(str(i)+":"+value)
    test_df=eval("test_df.assign(new_{}=lambda x: x.{} /x.calculatedfinishedsquarefeet)".format(value,value))

0:finishedfloor1squarefeet
1:garagetotalsqft
2:lotsizesquarefeet
3:poolsizesum


In [21]:
X_test=test_df[train_features_new]
y_pred=model.predict(X_test)

In [22]:
submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201610': pd.Timestamp('2016-10-01'),
    '201611': pd.Timestamp('2016-11-01'),
    '201612': pd.Timestamp('2016-12-01'),
    '201710': pd.Timestamp('2017-10-01'),
    '201711': pd.Timestamp('2017-11-01'),
    '201712': pd.Timestamp('2017-12-02')
}
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred

Predicting for: 201612 ... 
Predicting for: 201610 ... 
Predicting for: 201611 ... 
Predicting for: 201712 ... 
Predicting for: 201711 ... 
Predicting for: 201710 ... 


In [26]:
submission.to_csv('./final_solution_0.csv', float_format='%.6f',index=False)

IOError: [Errno 28] No space left on device