A Primer for the Zillow Award - Competition Prediction Approach
=====================================================

In [None]:
import pandas as pd; pd.options.mode.chained_assignment = None
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.preprocessing import LabelEncoder
import time; start_time = time.time()
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import datetime as dt
import xgboost as xgb
import numpy as np
import random
import math

# Pin both the NumPy and the stdlib random generators so that reruns of the
# notebook draw the same random numbers.
np.random.seed(17)
random.seed(17)

# Read the three input tables. Only the training file has its
# "transactiondate" column parsed into datetimes at load time.
train = pd.read_csv(
    "../input/train_2016.csv",
    parse_dates=["transactiondate"],
)
properties = pd.read_csv("../input/properties_2016.csv")
submission = pd.read_csv("../input/sample_submission.csv")

# Row counts of train, properties and submission, space separated.
print(*(len(frame) for frame in (train, properties, submission)))

def get_features(df):
    """Derive calendar features from ``transactiondate`` and fill gaps.

    The frame passed in is modified in place: ``transactiondate_year`` and
    ``transactiondate_month`` columns are written, and ``transactiondate``
    itself is overwritten with the quarter number. Because the date column no
    longer holds dates afterwards, it has to be reset to real dates before
    this function is applied to the same frame again.

    Args:
        df: DataFrame with a ``transactiondate`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the columns described above and every missing
        value replaced by ``0.0``.
    """
    parsed = pd.to_datetime(df["transactiondate"])
    calendar = parsed.dt
    df["transactiondate_year"] = calendar.year
    df["transactiondate_month"] = calendar.month
    # The original column name is kept, but it now carries the quarter.
    df["transactiondate"] = calendar.quarter
    return df.fillna(0.0)

def MAE(y, ypred):
    """Return the mean absolute error between ``y`` and ``ypred``.

    The target scored here is
    logerror = log(Zestimate) - log(SalePrice).

    Both inputs are converted to NumPy arrays and compared position by
    position, so a pandas Series is handled correctly whatever its index
    labels are (indexing a Series with ``y[i]`` would look up the *label*
    ``i`` rather than the i-th element).

    Args:
        y: Sequence, array or Series of observed values.
        ypred: Sequence, array or Series of predictions, same shape as ``y``.

    Returns:
        The mean of ``|y - ypred|`` as a NumPy float. Empty input yields
        ``nan``.

    Raises:
        ValueError: If ``y`` and ``ypred`` do not have the same shape.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(ypred, dtype=float)
    # Refuse mismatched inputs explicitly instead of letting NumPy broadcast
    # them or silently scoring only a prefix.
    if actual.shape != predicted.shape:
        raise ValueError(
            "y and ypred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )
    return np.abs(actual - predicted).mean()

# Left-join the property table onto the training rows and onto the submission
# rows; the submission file spells its key 'ParcelId', the property table
# 'parcelid'.
train = train.merge(properties, how='left', on='parcelid')
test = submission.merge(
    properties,
    how='left',
    left_on='ParcelId',
    right_on='parcelid',
)
# Rebind the name so this script no longer references the property table
# (memory saving).
properties = []

# Every column except the target. NOTE: this list is rebuilt further down,
# after get_features() has added the date-derived columns, and is not read
# before that point.
col = [c for c in train.columns if c not in ['logerror']]

# Names of the object-dtype columns. Pairing each label with its dtype via
# .items() avoids indexing the label-indexed dtypes Series with an integer
# position, which pandas has deprecated.
cat = [name for name, dtype in train.dtypes.items() if dtype == 'O']

# A single encoder is refitted per column on the union of train and test
# values (missing values mapped to '') so that both frames share one mapping
# and transform() never meets a value it was not fitted on.
lbl = LabelEncoder()
for c in cat:
    lbl.fit(list(train[c].fillna('').values)+list(test[c].fillna('').values))
    train[c] = lbl.transform(list(train[c].fillna('').values))
    test[c] = lbl.transform(list(test[c].fillna('').values))
    # Column name and the number of distinct encoded classes.
    print(c, len(lbl.classes_))

# Date-derived features for the training rows.
train = get_features(train)

# The test frame has no transaction date of its own, so a fixed placeholder is
# written before the same feature step is applied.
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test)

# Model inputs: every training column other than the target.
col = [name for name in train.columns if name != 'logerror']

reg = PassiveAggressiveRegressor(warm_start=True, random_state=123)
reg.fit(train[col], train['logerror'])
print('fit...')

# Error measured on the very rows the model was fitted on.
print(MAE(train['logerror'], reg.predict(train[col])))

# Rebind the name so this script no longer references the training frame
# (memory saving).
train = []

# Transaction date assigned to the test rows for each submission column,
# listed in matching order.
test_dates = [
    '2016-10-01', '2016-11-01', '2016-12-01',
    '2017-10-01', '2017-11-01', '2017-12-01',
]
test_columns = [
    '201610', '201611', '201612',
    '201710', '201711', '201712',
]

# Only the first two periods are predicted, to stay within Kernel memory; the
# remaining submission columns are written out as loaded.
for i, (period_date, period_column) in enumerate(zip(test_dates[:2], test_columns[:2])):
    # get_features() replaces the date column with a quarter number, so a
    # real date has to be written back before every call.
    test['transactiondate'] = period_date
    test = get_features(test)
    submission[period_column] = reg.predict(test[col])
    print('predict...', i)

submission.to_csv('submission.csv', index=False, float_format='%.4f')