In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue May  2 10:35:34 2017

@author: oakleye
"""

import sys
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from datetime import datetime
from sklearn.linear_model import Ridge, LassoCV, Lasso, ElasticNet, LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
def rmsle(estimator, X, y):
    """Root mean squared logarithmic error of ``estimator`` on ``(X, y)``.

    Parameters
    ----------
    estimator : object exposing ``predict(X)``
    X : feature matrix handed to ``estimator.predict``
    y : true target values

    Returns
    -------
    float
        ``sqrt(mean((log1p(y) - log1p(prediction)) ** 2))``. This is an error,
        so smaller is better.
    """
    predicted = estimator.predict(X)
    log_actual = np.log1p(y)
    log_predicted = np.log1p(predicted)
    squared_log_error = mean_squared_error(log_actual, log_predicted)
    return np.sqrt(squared_log_error)

np.random.seed(1)
# Use the fully-qualified option name: the bare "max_rows" shorthand can match
# more than one pandas option and is rejected as ambiguous by recent releases.
pd.set_option("display.max_rows", 500)

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
macro = pd.read_csv("../input/macro.csv")

# Shuffle the training rows with a fixed seed; later cells take positional
# slices of the training data for fitting/validation.
train = train.sample(frac=1, random_state=2)

# Stack train on top of test so every transformation below is applied to both.
alldata = pd.concat((train,
                     test))

# The target is kept only in `train`; it must not leak into the features.
alldata = alldata.drop(columns="price_doc")

# DataFrame.join(on=...) matches the caller's column against the *index* of
# `other`. `macro` is read with a default integer index, so joining on it
# directly compares date strings with row numbers and never matches. Index
# `macro` by its own 'timestamp' column so rows are matched by date.
df = alldata.join(macro.set_index('timestamp'), on='timestamp', how='left',
                  rsuffix='macro')
# A left join must not add rows, otherwise the positional train/test split at
# the end of this cell would be misaligned (would happen if `macro` had
# duplicated timestamps).
assert len(df) == len(alldata), "macro join changed the number of rows"

# yes/no flags -> booleans: everything that is not the string 'no' is True
# (NOTE: this includes missing values, exactly as in the original lambda).
yesNoCols = ['big_market_raion', 'big_road1_1line', 'culture_objects_top_25',
             'detention_facility_raion', 'incineration_raion', 'nuclear_reactor_raion',
             'oil_chemistry_raion', 'radiation_raion', 'railroad_1line',
             'railroad_terminal_raion', 'thermal_power_plant_raion',
             'water_1line']
for col in yesNoCols:
    df[col] = df[col].ne('no')

# Ecology rating -> ordinal score; 'no data' is treated as missing.
_ecology_scores = {
    'poor': 0,
    'satisfactory': 1,
    'good': 2,
    'excellent': 3,
    'no data': np.nan,
}


def ecoNum(s):
    """Return the ordinal score for ecology label ``s`` (None if unknown)."""
    return _ecology_scores.get(s)


# Mapping with the dict gives a float column with NaN for 'no data' and for
# any label that is not in the table.
df['ecology'] = df['ecology'].map(_ecology_scores)

# Categorical columns -> integer codes (missing values get code -1).
categoricals = ['product_type', 'sub_area']
for category in categoricals:
    df[category] = df[category].astype('category').cat.codes

# Timestamp -> number of days before a fixed reference date ("now").
timestamp_max = datetime.strptime('2016-05-30', '%Y-%m-%d')
sale_dates = pd.to_datetime(df['timestamp'], format='%Y-%m-%d')
df['timestamp'] = (timestamp_max - sale_dates).dt.days

# Columns excluded from the feature set. 'timestampmacro' only exists when the
# joined frame carries its own 'timestamp' column (it no longer does now that
# `macro` is indexed by timestamp), so only drop what is actually present.
drops = ['modern_education_share', 'old_education_build_share', 'child_on_acc_pre_school',
         'timestampmacro']
df = df.drop(columns=[name for name in drops if name in df.columns])

# Drop columns that are entirely NA (keyword arguments: positional axis/how
# are not accepted by current pandas).
df = df.dropna(axis='columns', how='all')

# Remaining missing values are encoded as -1.
df = df.fillna(-1)

# Matrices for sklearn/xgboost: the first len(train) rows of `df` are the
# (shuffled) training rows, the rest are the test rows.
n_train = train.shape[0]
X_train = df.iloc[:n_train]
X_test = df.iloc[n_train:]
y = train['price_doc']



In [None]:
# XGBoost regressor: hyper-parameters are collected in one place, then a quick
# early-stopping fit is run on a small slice of the training data.
xgb_params = dict(
    seed=1,
    silent=0,
    min_child_weight=1,
    subsample=1.0,         # fraction of the samples used for training
    colsample_bytree=1.0,  # fraction of the features used in each tree
    gamma=0.3,
    reg_alpha=2.0,
    reg_lambda=1.5,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
# Original author's note: "learn 0.2 = 250" (meaning not evident from this file).
model_xgb = xgb.XGBRegressor(**xgb_params)

# The first `percent` of the training rows is used for fitting and the next
# block of the same size for validation.
percent = 0.06
maxn = int(len(X_train) * percent)

fit_rows = slice(0, maxn)
holdout_rows = slice(maxn, 2 * maxn)

validation_x = X_train[holdout_rows]
validation_y = y[holdout_rows]

model = model_xgb.fit(
    X_train[fit_rows],
    y[fit_rows],
    eval_set=[(validation_x, validation_y)],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=True,
)


In [None]:
# Refit the regressor configured above, this time on every training row.
model = model_xgb
model.fit(X_train, y, eval_metric='rmse', verbose=True)

# Predict the target for the test rows.
preds = model.predict(X_test)

# Submission file: one row per test id, columns in the order id, price_doc.
solution = pd.DataFrame(
    {"id": test["id"], "price_doc": preds},
    columns=["id", "price_doc"],
)
solution.to_csv("sol_xgb1.csv", index=False)