Quick data exploration of Sberbank Russian Housing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

import xgboost as xgb

from time import mktime
from datetime import datetime

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from subprocess import check_output  # not used by this cell any more; kept in case other cells rely on the name

# List the files available in the input directory. os.listdir is used instead
# of shelling out to `ls`, so the listing does not depend on a Unix userland
# or on spawning a subprocess.
print("\n".join(sorted(os.listdir("../input"))))

# Any results you write to the current directory are saved as output.



In [None]:
# Read the training data, print its (rows, columns) shape and preview the
# first five rows.
train_df = pd.read_csv(filepath_or_buffer='../input/train.csv')
print(train_df.shape)
train_df.head(n=5)

In [None]:
# Display pandas' summary statistics for the training frame.
train_df.describe()

In [None]:
# Distribution of the sale price (price_doc): a 100-bin histogram drawn on a
# logarithmic x-axis.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) -- confirm against the installed version.
plt.figure(figsize=(10, 8))
dp = sns.distplot(train_df['price_doc'].values, bins=100)
# distplot hands back the Axes it drew on; scale and label it directly.
dp.set_xscale('log')
dp.set_xlabel('House price')
plt.show()

In [None]:
# Prepare dataset: drop the row identifier, add a numeric date feature and
# integer-encode the string columns so that every column is numeric.
# NOTE: train_df is reassigned below, so re-run the loading cell before
# executing this cell a second time.

train_df = train_df.drop(columns=['id'])

# Convert the 'YYYY-MM-DD' timestamp strings to proleptic Gregorian ordinals
toordinal = lambda x : datetime.strptime(x, '%Y-%m-%d').toordinal()
train_df['gregorian'] = train_df['timestamp'].apply(toordinal)

# Convert categorical values: split off the object-dtype columns and replace
# each distinct value in them with an integer code.
# NOTE(review): 'timestamp' presumably is still an object column here, so it
# gets integer-encoded too, alongside the 'gregorian' feature derived from it
# -- confirm that keeping both is intended.
num_df = train_df.select_dtypes(exclude = ['object'])
obj_df = train_df.select_dtypes(include = ['object']).copy()

for c in obj_df:
    obj_df[c] = pd.factorize(obj_df[c])[0]

# Numeric columns first, then the encoded object columns.
train_df = pd.concat([num_df, obj_df], axis=1)

# Preview the first rows only instead of rendering the entire frame.
train_df.head()

In [None]:
# Set up a boosted-tree regression with xgboost: separate the features from the
# price_doc target, define the booster parameters and build the DMatrix that
# the cross-validation step consumes. No model is fitted in this cell.
X_train = train_df.drop(columns=['price_doc'])
y_train = train_df['price_doc']

xgb_params = {
    'eta': 0.1,
    'max_depth': 7,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # Squared-error regression. 'reg:linear' is the deprecated name of this
    # same objective; only very old xgboost releases lack the new name.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
}

# Pass the target explicitly as the label.
dtrain = xgb.DMatrix(X_train, label=y_train)

In [None]:
# Cross-validate the booster: at most 1000 boosting rounds with a 50-round
# early-stopping window, logging progress without the fold standard deviation.
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
    verbose_eval=True,
    show_stdv=False,
)