In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
from tqdm import tqdm

# Input data files are read from the "./input/" directory (relative to the notebook's working directory).
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# List the contents of the local input directory. This shells out to `ls`,
# so it only works where that command is available.
print(check_output(["ls", "./input"]).decode("utf8"))

# Training transactions. 'transactiondate' is parsed to datetime so that
# add_date_features() can use the .dt accessor on it.
train_df = pd.read_csv('./input/train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
# The sample submission provides the rows that predictions are produced for.
test_df = pd.read_csv('./input/sample_submission.csv', low_memory=False)
# Per-parcel property attributes; merged into both frames on 'parcelid' later on.
properties = pd.read_csv('./input/properties_2016.csv', low_memory=False)
# The id column is spelled 'ParcelId' in the submission file; add a 'parcelid'
# copy so the test frame can be merged with `properties` on the same key.
test_df['parcelid'] = test_df['ParcelId']

# similar to the1owl
def add_date_features(df):
    """Expand ``transactiondate`` into calendar-part columns.

    Adds ``transaction_year``, ``transaction_month``, ``transaction_day`` and
    ``transaction_quarter`` (in that order) and then removes the original
    ``transactiondate`` column.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``transactiondate`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the new columns and without ``transactiondate``.
    """
    when = df["transactiondate"].dt
    for part in ("year", "month", "day", "quarter"):
        df["transaction_" + part] = getattr(when, part)
    # The raw timestamp is not used as a feature once it has been decomposed.
    df.drop(columns="transactiondate", inplace=True)
    return df
# Replace the raw transaction date on the training rows with its year/month/day/quarter parts.
train_df = add_date_features(train_df)
# The test frame has no 'transactiondate' column at this point; it is given a
# dummy date and run through add_date_features() right before prediction.
#test_df = add_date_features(test_df)

In [None]:
def add_pro_features(prop_df):
    """Add engineered ``N-*`` columns to the properties table.

    The frame is modified in place and also returned. New columns, in the
    order they are created:

    * ``N-zip_count`` / ``N-city_count`` -- number of rows sharing the same
      ``regionidzip`` / ``regionidcity``.
    * ``N-GarPoolAC`` -- 1 when the property has a garage, a ``pooltypeid10``
      pool and an air-conditioning type id other than 5, else 0.
    * ``N-ValueRatio`` -- ``taxvaluedollarcnt / taxamount``.
    * ``N-LivingAreaProp`` -- ``calculatedfinishedsquarefeet / lotsizesquarefeet``.
    * ``N-ValueProp`` -- ``structuretaxvaluedollarcnt / landtaxvaluedollarcnt``.
    * ``N-Avg-structuretaxvaluedollarcnt`` -- mean structure tax value of the
      property's city.
    * ``N-Dev-structuretaxvaluedollarcnt`` -- absolute relative deviation of
      the property's structure tax value from that city mean.
    * ``N-TaxScore`` -- ``taxvaluedollarcnt * taxamount``.
    * ``N-county_count`` -- number of rows sharing the same ``regionidcounty``.

    Ratios are plain element-wise divisions, so a zero or missing denominator
    yields ``inf`` / ``NaN``. Rows whose region id is missing get ``NaN`` for
    the corresponding count / average.

    Parameters
    ----------
    prop_df : pandas.DataFrame
        Properties table containing the raw columns referenced above.

    Returns
    -------
    pandas.DataFrame
        ``prop_df`` itself with the additional columns.
    """
    # Number of properties in the zip / in the city. Each is computed once
    # (the original derived both twice with identical results).
    prop_df['N-zip_count'] = prop_df['regionidzip'].map(prop_df['regionidzip'].value_counts())
    prop_df['N-city_count'] = prop_df['regionidcity'].map(prop_df['regionidcity'].value_counts())

    # Garage + pool + AC indicator. NaN compares False for `> 0` and True for
    # `!= 5`, so a missing AC type id counts as "other than 5".
    # NOTE(review): presumably type id 5 encodes "no air conditioning" --
    # confirm against the data dictionary.
    has_all = (
        (prop_df['garagecarcnt'] > 0)
        & (prop_df['pooltypeid10'] > 0)
        & (prop_df['airconditioningtypeid'] != 5)
    )
    prop_df['N-GarPoolAC'] = has_all * 1

    prop_df['N-ValueRatio'] = prop_df['taxvaluedollarcnt'] / prop_df['taxamount']
    prop_df['N-LivingAreaProp'] = prop_df['calculatedfinishedsquarefeet'] / prop_df['lotsizesquarefeet']
    prop_df['N-ValueProp'] = prop_df['structuretaxvaluedollarcnt'] / prop_df['landtaxvaluedollarcnt']

    # City-level average structure tax value, broadcast back to every row.
    city_avg = prop_df.groupby('regionidcity')['structuretaxvaluedollarcnt'].transform('mean')
    prop_df['N-Avg-structuretaxvaluedollarcnt'] = city_avg

    # Deviation away from the city average, relative to that average.
    prop_df['N-Dev-structuretaxvaluedollarcnt'] = (
        (prop_df['structuretaxvaluedollarcnt'] - city_avg).abs() / city_avg
    )
    prop_df['N-TaxScore'] = prop_df['taxvaluedollarcnt'] * prop_df['taxamount']

    # Number of properties in the county. This must be looked up in the
    # *county* counts: mapping county ids through the city counts (as the
    # previous version did) matches almost no keys and yields NaN.
    county_count = prop_df['regionidcounty'].value_counts()
    prop_df['N-county_count'] = prop_df['regionidcounty'].map(county_count)

    return prop_df

# Enrich the property table once, before it is merged into the train and test frames.
properties = add_pro_features(properties)

In [None]:
# Left-join the (feature-enriched) property table onto the training rows by parcel id.
# NOTE: this cell is not idempotent -- running it a second time merges the
# property columns onto the already-merged frames again.
train_df = train_df.merge(properties, how='left', on='parcelid')
# Disabled: trimming of extreme target values before training.
# train_df= train_df[ train_df.logerror > -0.4]
# train_df= train_df[ train_df.logerror < 0.419]
# Same join for the rows that will be predicted.
test_df = test_df.merge(properties, how='left', on='parcelid')
# Quick sanity check of the resulting frame sizes.
print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

In [None]:
# --- Columns that are almost entirely missing -------------------------------
# A column is dropped when more than `missing_perc_thresh` of its training
# rows are null. Columns without any nulls are never candidates.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
exclude_missing = [
    col for col in train_df.columns
    if train_df[col].isnull().sum() != 0
    and train_df[col].isnull().sum() / float(num_rows) > missing_perc_thresh
]
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

# --- Columns that carry a single distinct value ------------------------------
# exclude where we only have one unique value :D
# `unique()` reports the null marker as a value of its own, so one is
# subtracted whenever the column contains nulls.
exclude_unique = [
    col for col in train_df.columns
    if len(train_df[col].unique()) - int(train_df[col].isnull().any()) == 1
]
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

# --- Columns excluded by hand -------------------------------------------------
# 'parcelid' and 'logerror' are for indexing/training only.
# 'propertyzoningdesc': do not know what this is LARS, 'SHCG' 'COR2YY' 'LNR2RPD-R3' ?!?
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']

# --- Final feature list: every remaining column, in frame order ---------------
excluded = set(exclude_missing) | set(exclude_other) | set(exclude_unique)
train_features = [col for col in train_df.columns if col not in excluded]
print("We use these for training: %s" % train_features)
print(len(train_features))

In [None]:
# A feature is treated as categorical when it has fewer than
# `cat_unique_thresh` distinct values (nulls count as one value here, because
# this runs before the fill below) and its name does not contain one of the
# substrings that mark size/count-like columns.
cat_unique_thresh = 1000
numeric_name_markers = ('sqft', 'cnt', 'nbr', 'number')
cat_feature_inds = [
    idx for idx, feat in enumerate(train_features)
    if len(train_df[feat].unique()) < cat_unique_thresh
    and not any(marker in feat for marker in numeric_name_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

# Missing values are replaced by an out-of-range integer sentinel in both frames.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

y_train = train_df.logerror
X_train = train_df[train_features]

# Mask for transactions from October onwards. It is only consumed by the
# disabled hold-out split below, so it is currently unused.
sel_month = X_train.transaction_month >= 10
# X_val = X_train[sel_month]
# y_val = y_train[sel_month]
# X_train = X_train[~sel_month]
# y_train = y_train[~sel_month]

print(X_train.shape, y_train.shape)

In [None]:
# from sortedcontainers import SortedList
# import copy
# import collections
# import numpy as np
# from itertools import product,chain
# import pandas
# from sklearn.model_selection import KFold
# import catboost as cb

# ''' a class for doing grid search on a set of parameters provided in a dict. 'pdict' should be a dictionary like the following:
# pdict = {'depth':[1,2], 'iterations':[250,100,500], 'thread_count':4}

# when grid_search is called it will return an iterator that provides samples from the dictionary e.g.
# {'depth':1, 'iterations':250, 'thread_count':4}
# {'depth':2, 'iterations':250, 'thread_count':4}
# {'depth':1, 'iterations':100, 'thread_count':4}
# etc.
# after calling an iteration of grid_search, you need to test the classifier and run 'register_result'
# This will update the internal list of results, so that the next call to grid_search will use the best
# parameters for all the parameters not currently being updated.

# grid_search can be provided a list e.g. grid_search(['depth']) this will use the current best parameters for all
# the other arguments and only search over 'depth'. You can then call e.g. grid_search(['iterations']) and it will use
# the best depth found previously and cycle through all the 'iterations'. Searching incrementally can be much faster
# than doing a full grid search, but may miss the global optimum. '''
# class Paramsearch:
#     def __init__(self,pdict):    
#         self.pdict = {}
#         # if something is not passed in as a sequence, make it a sequence with 1 element
#         #   don't treat strings as sequences
#         for a,b in pdict.items():
#             if isinstance(b, collections.Sequence) and not isinstance(b, str): self.pdict[a] = b
#             else: self.pdict[a] = [b]
#         # our results are a sorted list, so the best score is always the final element
#         self.results = SortedList()       
                    
#     def grid_search(self,keys=None):
#         # do grid search on only the keys listed. If none provided, do all
#         if keys==None: keylist = self.pdict.keys()
#         else: keylist = keys
 
#         listoflists = [] # this will be list of lists of key,value pairs
#         for key in keylist: listoflists.append([(key,i) for i in self.pdict[key]])
#         for p in product(*listoflists):
#             # do any changes to the current best parameter set
#             if len(self.results)>0: template = self.results[-1][1]
#             else: template = {a:b[0] for a,b in self.pdict.items()}
#             # if our updates are the same as current best, don't bother
#             if self.equaldict(dict(p),template): continue
#             # take the current best and update just the ones to change
#             yield self.overwritedict(dict(p),template)
                              
#     def equaldict(self,a,b):
#         for key in a.keys(): 
#             if a[key] != b[key]: return False
#         return True            
                              
#     def overwritedict(self,new,old):
#         old = copy.deepcopy(old)
#         for key in new.keys(): old[key] = new[key]
#         return old            
    
#     # save a (score,params) pair to results. Since 'results' is a sorted list,
#     #   the best score is always the final element. A small amount of noise is added
#     #   because sorted lists don't like it when two scores are exactly the same    
#     def register_result(self,result,params):
#         self.results.add((result+np.random.randn()*1e-10,params))    
        
#     def bestscore(self):
#         return self.results[-1][0]
        
#     def bestparam(self):
#         return self.results[-1][1]

In [None]:
# import paramsearch
# from sklearn import metrics
# test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
# test_df = add_date_features(test_df)
# X_test = test_df[train_features]
# print(X_test.shape)

# num_ensembles = 5
# y_pred = 0.0

# params = {'depth':[5,6],
#           'iterations':[250,500],
#           'learning_rate':[0.01,0.03], 
#           'l2_leaf_reg':[3],
#           'thread_count':4}

# def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
#     kf = KFold(n_splits=n_splits,shuffle=True) 
#     res = []
#     for train_index, test_index in kf.split(train_set):
#         train = train_set.iloc[train_index,:]
#         test = train_set.iloc[test_index,:]

#         labels = train_label.ix[train_index]
#         test_labels = train_label.ix[test_index]

#         clf = cb.CatBoostRegressor(**params)
#         clf.fit(train, labels, cat_features=cat_feature_inds)

#         res.append((metrics.mean_absolute_error(clf.predict(test) , test_labels)))
#     return np.mean(res)


# # this function runs grid search on several parameters
# def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=2):
#     ps = Paramsearch(params)
#     # search 'border_count', 'l2_leaf_reg' etc. individually 
#     #   but 'iterations','learning_rate' together
#     for prms in chain(ps.grid_search(['l2_leaf_reg']),
#                       ps.grid_search(['iterations','learning_rate']),
#                       ps.grid_search(['depth'])):
#         res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
#         # save the crossvalidation result so that future iterations can reuse the best parameters
#         ps.register_result(res,prms)
#         print(res,prms,'best:',ps.bestscore(),ps.bestparam())
#     return ps.bestparam()

# bestparams = catboost_param_tune(params,X_train,y_train,cat_feature_inds)

In [None]:
# bestparams

In [None]:
# Fixed CatBoost settings used for the final fit.
# NOTE(review): presumably the outcome of the (now commented-out) grid search
# in the cells above -- confirm before changing.
# 'thread_count' is kept here for reference; the training cell only forwards
# iterations, learning_rate, depth and l2_leaf_reg to the regressor.
bestparams = dict(
    depth=6,
    iterations=500,
    l2_leaf_reg=3,
    learning_rate=0.03,
    thread_count=4,
)

In [None]:
# bestparams

In [None]:
# Train one CatBoost regressor on the full training frame with the tuned
# settings, then predict the log error for every row of the test frame.
# TODO(you): Use CV, tune hyperparameters
#
# Disabled variants kept for reference:
#   * averaging an ensemble of seeds:  for i in tqdm(range(num_ensembles)) ... random_seed=i
#   * optimising MAE directly:         loss_function='MAE', eval_metric='MAE'
#   * scoring on a hold-out split:     eval_set=(X_val, y_val) and
#                                      metrics.mean_absolute_error(model.predict(X_val), y_val)

# Only these entries of `bestparams` are forwarded to the model.
tuned_names = ('iterations', 'learning_rate', 'depth', 'l2_leaf_reg')
model = CatBoostRegressor(**{name: bestparams[name] for name in tuned_names})
model.fit(X_train, y_train, cat_features=cat_feature_inds)

# The test rows have no transaction date, so a single placeholder date is
# assigned to all of them before the date-part features are derived.
test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
test_df = add_date_features(test_df)
X_test = test_df[train_features]

y_pred = model.predict(X_test)

In [None]:
# Start the submission from the parcel ids, under the column name the
# submission file uses.
submission = test_df[['parcelid']].rename(columns={'parcelid': 'ParcelId'})

# One output column per required period.
# https://www.kaggle.com/c/zillow-prize-1/discussion/33899, Oct,Nov,Dec
test_dates = dict([
    ('201610', pd.Timestamp('2016-09-30')),
    ('201611', pd.Timestamp('2016-10-31')),
    ('201612', pd.Timestamp('2016-11-30')),
    ('201710', pd.Timestamp('2017-09-30')),
    ('201711', pd.Timestamp('2017-10-31')),
    ('201712', pd.Timestamp('2017-11-30')),
])
for label in test_dates:
    print("Predicting for: %s ... " % (label))
    # TODO(you): predict for every `test_date`
    # For now the same prediction vector is written into every period column.
    submission[label] = y_pred

# Write the CSV; the major number is zero-padded to three digits in the name.
submission_major = 1
out_name = 'submission_%03d.csv' % (submission_major)
submission.to_csv(out_name, float_format='%.4f', index=False)
print("Done! Good luck with submission #%d :)" % submission_major)

In [1]:
!pwd

/home/gaolingnan/assignment1/zillow


In [None]:
#import paramsearch


In [None]:
# test_df['transactiondate'] = pd.Timestamp('2016-12-01')  # Dummy
# test_df = add_date_features(test_df)
# X_test = test_df[train_features]
# print(X_test.shape)

In [None]:
#y_pred = model.predict(X_test)