# Trying out XGBoost to predict `price_doc` after some data cleanup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import model_selection, preprocessing
import xgboost as xgb
import math
#import datetime

In [None]:
# Read the three input tables; `timestamp` is parsed as a datetime in each one
# because it is the key used later to join the macro table onto train/test.
train_df, test_df, macro = [
    pd.read_csv(csv_path, parse_dates=['timestamp'])
    for csv_path in ('../input/train.csv', '../input/test.csv', '../input/macro.csv')
]

## Cleanup and corrections on the most important features

### Cleaning up `floor` and `max_floor`

In [None]:
# Only a few rows are affected, so dropping them has little impact.
# Keep rows whose `floor` is present and non-zero; the original index labels
# are preserved (no reset).
has_real_floor = train_df['floor'].notna() & (train_df['floor'] != 0)
train_df = train_df.loc[has_real_floor].copy()

In [None]:
# When max_floor is missing or lower than the flat's own floor, assume the
# building is exactly as tall as `floor`.
max_floor_filled = train_df['max_floor'].fillna(0)
train_df['max_floor'] = max_floor_filled.mask(
    max_floor_filled < train_df['floor'],  # inconsistent rows ...
    train_df['floor'],                     # ... take the floor value instead
)

## Cleaning up `full_sq` and `life_sq`

In [None]:
# Weird values corrected: state 33 is treated as 4, and build_year 20052009
# as 2007.
# Boolean-mask assignment through .loc replaces DataFrame.set_value, which was
# deprecated in pandas 0.21 and removed in 1.0 (and was meant for scalar labels).
train_df.loc[train_df['state'] == 33, 'state'] = 4
train_df.loc[train_df['build_year'] == 20052009, 'build_year'] = 2007

# One outlier in full_sq (> 2000) is simply deleted for the moment.
train_df.drop(train_df[train_df['full_sq'] > 2000].index, inplace=True)

In [None]:
# Drop rows where life_sq is greater than full_sq (22 records per the
# original author's note). Rows with a missing life_sq give NaN here and are kept.
life_gap = train_df['full_sq'] - train_df['life_sq']
train_df.drop(life_gap[life_gap < 0].index, inplace=True)

# Mean share of full_sq that is NOT living area, computed over the remaining
# rows; NaN ratios (e.g. missing life_sq) are skipped by mean().
mean_ratio = ((train_df['full_sq'] - train_df['life_sq']) / train_df['full_sq']).mean()

# Estimate of life_sq derived from full_sq, shared by both repairs below.
estimated_life_sq = train_df['full_sq'] * (1 - mean_ratio)

# Fill missing life_sq with the estimate. The result is assigned back to the
# frame: calling fillna(inplace=True) on `train_df.life_sq` is a chained
# in-place operation that is not guaranteed to write through to train_df.
train_df['life_sq'] = train_df['life_sq'].fillna(estimated_life_sq)

# Replace implausibly small life_sq (<= 5) with the same estimate.
train_df['life_sq'] = np.where(train_df['life_sq'] <= 5, estimated_life_sq, train_df['life_sq'])

## Merging macroeconomic data

In [None]:
# Left-join the macro table onto each set by timestamp, so every train/test
# row is kept even when no macro row matches its timestamp.
dftrain = train_df.merge(macro, how='left', on='timestamp')
dftest = test_df.merge(macro, how='left', on='timestamp')

In [None]:
# Build the model inputs by removing the identifier and the join key.
# `price_doc` deliberately stays in x_train here; it is split off as the
# target in the training cell.
x_train, x_test = (
    merged.drop(["id", "timestamp"], axis=1)
    for merged in (dftrain, dftest)
)

In [None]:
# Macro columns excluded from the model: they are empty in the test set or
# their feature importance is assumed to be small at the moment.
# To be revisited later; the list will probably grow.
# Each column is listed exactly once.
list_empty = [
    'grp', 'grp_growth', 'real_dispos_income_per_cap_growth', 'profitable_enterpr_share',
    'unprofitable_enterpr_share', 'share_own_revenues', 'overdue_wages_per_cap', 'fin_res_per_cap',
    'marriages_per_1000_cap', 'divorce_rate', 'construction_value', 'invest_fixed_assets_phys',
    'pop_migration', 'pop_total_inc', 'housing_fund_sqm', 'lodging_sqm_per_cap',
    'water_pipes_share', 'baths_share', 'sewerage_share', 'gas_share', 'hot_water_share',
    'electric_stove_share', 'heating_share', 'old_house_share',
    'infant_mortarity_per_1000_cap', 'perinatal_mort_per_1000_cap', 'incidence_population',
    'load_of_teachers_preschool_per_teacher', 'child_on_acc_pre_school', 'provision_doctors',
    'power_clinics', 'hospital_beds_available_per_cap', 'hospital_bed_occupancy_per_year',
    'provision_retail_space_sqm', 'provision_retail_space_modern_sqm',
    'theaters_viewers_per_1000_cap', 'museum_visitis_per_100_cap',
    'population_reg_sports_share', 'students_reg_sports_share', 'apartment_build',
    'modern_education_share', 'old_education_build_share',
]

In [None]:
# Remove the excluded macro columns from both feature frames, in place.
for feature_frame in (x_train, x_test):
    feature_frame.drop(list_empty, axis=1, inplace=True)

In [None]:
# Label-encode text columns with ONE encoder per column, fitted on the union
# of the train and test values. Fitting a separate encoder on each frame (as
# before) can assign different integer codes to the same category, so the
# model would see inconsistent features at prediction time.
object_cols = [c for c in x_train.columns if x_train[c].dtype == 'object']
object_cols += [c for c in x_test.columns
                if x_test[c].dtype == 'object' and c not in object_cols]

for c in object_cols:
    frames_with_col = [frame for frame in (x_train, x_test) if c in frame.columns]
    lbl = preprocessing.LabelEncoder()
    # Values are cast to str before fitting so that a column mixing text with
    # missing values can still be sorted by the encoder.
    lbl.fit(pd.concat([frame[c].astype(str) for frame in frames_with_col]).values)
    for frame in frames_with_col:
        frame[c] = lbl.transform(frame[c].astype(str).values)

In [None]:
def rmsle(preds, dtrain):
    """Root mean squared logarithmic error as a custom evaluation metric.

    Args:
        preds: array-like of predicted values. Negative predictions are
            clipped to 0 before the logarithm is taken.
        dtrain: object exposing ``get_label()`` that returns the true values
            (this notebook passes the function as ``feval`` to ``xgb.train``).

    Returns:
        Tuple ``('rmsle', score)`` where ``score`` is a Python float.

    Raises:
        AssertionError: if ``preds`` and the labels differ in length.
    """
    # Work in float64 regardless of the incoming dtype so the result matches
    # the precision of the previous pure-Python implementation.
    labels = np.asarray(dtrain.get_label(), dtype=np.float64)
    preds = np.asarray(preds, dtype=np.float64)
    assert len(preds) == len(labels)
    # log1p(x) == log(x + 1); vectorized instead of a Python-level loop.
    log_diff = np.log1p(labels) - np.log1p(np.maximum(preds, 0))
    return 'rmsle', float(np.sqrt(np.mean(log_diff ** 2)))

In [None]:
# Booster settings passed to xgb.train.
xgb_params = dict(
    eta=0.05,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    silent=1,
)

# Split the target off the feature frame (pop removes the column in place).
y_train = x_train.pop("price_doc")

# Train/valid split by row position: the first `split` rows train the model,
# the remainder is held out for validation.
split = 27000
xx_train, xx_valid = x_train.iloc[:split], x_train.iloc[split:]
yy_train, yy_valid = y_train.iloc[:split], y_train.iloc[split:]

dtrain = xgb.DMatrix(xx_train, label=yy_train, feature_names=xx_train.columns.values)
dvalid = xgb.DMatrix(xx_valid, label=yy_valid, feature_names=xx_valid.columns.values)

# Both sets are scored every round with the custom rmsle metric; training is
# capped at 600 rounds with early_stopping_rounds=100.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(
    dict(xgb_params),
    dtrain,
    num_boost_round=600,
    evals=watchlist,
    feval=rmsle,
    early_stopping_rounds=100,
)

In [None]:
# Tabulate the booster's per-feature scores, highest first.
featureImportance = model.get_fscore()
features = pd.DataFrame({
    'features': list(featureImportance.keys()),
    'importance': list(featureImportance.values()),
})
features.sort_values(by=['importance'], ascending=False, inplace=True)

# Horizontal bar chart of the 50 highest-scoring features.
fig, ax = plt.subplots()
fig.set_size_inches(20, 25)
plt.xticks(rotation=60)
sns.set(font_scale=1.5)
sns.barplot(x="importance", y="features", data=features.head(50), orient="h", ax=ax)

In [None]:
# Score the test features with the trained booster.
p_test = model.predict(xgb.DMatrix(x_test))

# Submission file: one row per test id with its predicted price_doc.
sub = pd.DataFrame({
    'id': dftest['id'].values,
    'price_doc': p_test,
})
sub.to_csv('xgb.csv', index=False)

In [None]:
# Preview only the first rows of the submission instead of displaying the
# whole frame.
sub.head()