In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show which files are available in the input directory by shelling out to `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# Libs

In [None]:
# Processing logic
import pandas as pd
import numpy as np
import xgboost as xgb

# System utils

import os

# Visualizations

from matplotlib import pyplot as plt

In [None]:
%matplotlib inline

# Exploration

In [None]:
# Directory containing the input CSV files; file names are joined onto it when loading.
data_dir = '../input/'

In [None]:
def quick_show(data, column, by=None):
    """Draw a histogram and a box plot of one column side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` (and ``by``, when given).
    column : str
        Name of the column to visualise.
    by : str or list of str, optional
        Column(s) to group the box plot by; ``None`` draws a single box.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    # plt.subplots() already creates the figure. A preceding plt.figure() call
    # would only leave a second, empty figure behind in the cell output.
    _, (hist_ax, box_ax) = plt.subplots(1, 2)
    data[column].hist(ax=hist_ax)
    data.boxplot(column=column, by=by, ax=box_ax)

In [None]:
def _read_timestamped_csv(file_name):
    """Load one CSV from ``data_dir``, parsing its ``timestamp`` column as dates."""
    csv_path = os.path.join(data_dir, file_name)
    return pd.read_csv(csv_path, parse_dates=['timestamp'])


# Same three files, read in the same order: train, macro, test.
training_data, macro_economy_data, testing_data = [
    _read_timestamped_csv(file_name)
    for file_name in ("train.csv", "macro.csv", "test.csv")
]

In [None]:
# Show the first training row transposed, so every column is listed vertically.
training_data.iloc[:1].transpose()

In [None]:
# Histogram + ungrouped box plot of life_sq (`by` keeps its default of None).
quick_show(training_data, column='life_sq')

In [None]:
# Summary statistics of the training columns, displayed as the cell output.
training_data.describe()

# Data prep

Approach 1: naive outlier removal. For every numeric feature, drop the training rows whose value lies above a fixed percentile threshold.
TODO (later): try more sophisticated methods, e.g. Mahalanobis distance.

In [None]:
# Percentile (0-100) handed to np.nanpercentile as the per-column outlier threshold.
cutoff = 99

In [None]:
# Names of all numeric-dtype columns in the training frame, as a plain list.
numeric_frame = training_data.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns.tolist()
numeric_columns

In [None]:
# Maps column name -> percentile threshold; filled by the percentile loop below.
col_percentiles = {}

In [None]:
# For every numeric column except the row id and the target, store the value
# at the `cutoff`-th percentile (NaNs are ignored by nanpercentile).
excluded_columns = ('id', 'price_doc')
col_percentiles.update({
    name: np.nanpercentile(training_data[name], cutoff)
    for name in numeric_columns
    if name not in excluded_columns
})

col_percentiles

In [None]:
# Drop every training row that exceeds the stored threshold in at least one column.
# The thresholds were computed beforehand, so removing the union of offending rows
# in a single pass yields the same rows as dropping column by column.
thresholds = pd.Series(col_percentiles, dtype='float64')
above_threshold = training_data[thresholds.index].gt(thresholds, axis=1).any(axis=1)
training_data.drop(training_data.index[above_threshold.values], inplace=True)

## Training prep

In [None]:
# Keep the regression target and the test ids before those columns are removed.
y_train = training_data['price_doc']
id_test = testing_data['id']

In [None]:
# Remove identifier / target columns in place so only features remain.
for frame, unwanted in ((training_data, ['id', 'price_doc']), (testing_data, ['id'])):
    frame.drop(unwanted, axis=1, inplace=True)

In [None]:
# Stack train and test so features are engineered identically for both;
# num_train marks where the training rows end.
num_train = len(training_data)
df_all = pd.concat([training_data, testing_data])

# DataFrame.join(other, on=...) matches the caller's column against *other's index*.
# macro_economy_data comes straight from read_csv and so carries a positional
# integer index; joining against it never matches a date. Index the macro frame
# by its timestamp first. drop=False keeps the macro timestamp column, so the
# join still emits 'timestamp_macro' through rsuffix.
macro_by_date = macro_economy_data.set_index('timestamp', drop=False).rename_axis(None)
assert macro_by_date.index.is_unique, "macro data must have one row per timestamp"

df_all = df_all.join(macro_by_date, on='timestamp', rsuffix='_macro')

# A left join on a unique key keeps the row count, which the later
# X_all[:num_train] / X_all[num_train:] split relies on.
assert len(df_all) == num_train + len(testing_data), "join must not add or drop rows"
print(df_all.shape)

## Feature engineering

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, with zero denominators mapped to NaN.

    A plain division yields +/-inf for x / 0, and current XGBoost refuses inf
    in its input unless `missing` is set to inf. NaN is treated as missing.
    """
    return numerator / denominator.astype(float).replace(0, np.nan)


timestamps = df_all['timestamp'].dt

# Add month-year count: number of rows (train + test) sharing the same year and month.
month_year = (timestamps.month + timestamps.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week replaces it
# (pandas >= 1.1). Fall back to the old accessor on older installations.
# The int64 cast assumes there are no missing timestamps.
if hasattr(timestamps, 'isocalendar'):
    week_of_year = timestamps.isocalendar().week.astype('int64')
else:
    week_of_year = timestamps.weekofyear
week_year = (week_of_year + timestamps.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = timestamps.month
df_all['dow'] = timestamps.dayofweek

# Relative floor and relative room areas
df_all['rel_floor'] = _safe_ratio(df_all['floor'], df_all['max_floor'])
df_all['rel_kitch_sq'] = _safe_ratio(df_all['kitch_sq'], df_all['full_sq'])
df_all['avg_room_sq'] = _safe_ratio(df_all['full_sq'] - df_all['kitch_sq'], df_all['num_room'])

# Ratio of living area to full area, limited to [0, 1].
# The denominator is floored at 1, so no division by zero can occur here.
# clip() replaces the removed `.ix` chained assignment and leaves NaN untouched.
df_all["ratio_life_sq_full_sq"] = (
    df_all["life_sq"] / np.maximum(df_all["full_sq"].astype("float"), 1)
).clip(lower=0, upper=1)

# ratio of kitchen area to living area, limited to [0, 1] #
df_all["ratio_kitch_sq_life_sq"] = (
    df_all["kitch_sq"] / np.maximum(df_all["life_sq"].astype("float"), 1)
).clip(lower=0, upper=1)

# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp', 'timestamp_macro'], axis=1, inplace=True)

# People - related ratios
df_all['avg_male_subarea'] = _safe_ratio(df_all['full_all'], df_all['male_f'])

## Encoding

In [None]:
def factorize(t):
    """Integer-encode the Series of a ``(column_name, Series)`` pair."""
    return pd.factorize(t[1])[0]


df_obj = df_all.select_dtypes(include=['object'])

# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column_name, Series) pairs and also exists on older versions.
encoded_object_columns = np.array([factorize(pair) for pair in df_obj.items()]).T

# Numeric columns first, followed by the integer codes of the object columns.
# NOTE(review): X_all / X_train / X_test are assigned again by a later cell.
X_all = np.c_[
    df_all.select_dtypes(exclude=['object']).values,
    encoded_object_columns
]
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]

In [None]:
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

df_values = pd.concat([df_numeric, df_obj], axis=1)

In [None]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]

df_columns = df_values.columns

# Training

In [None]:
# Booster settings handed to xgb.train.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=0,
)

# Wrap the feature matrices for XGBoost; only the training matrix carries labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=df_columns)
dtest = xgb.DMatrix(data=X_test, feature_names=df_columns)

In [None]:
# Number of boosting rounds passed to xgb.train as num_boost_round.
rounds = 750

In [None]:
# Train on a copy of the parameters with 'silent' explicitly set to 0.
train_params = {**xgb_params, 'silent': 0}
model = xgb.train(params=train_params, dtrain=dtrain, num_boost_round=rounds)

In [None]:
# Tall single-axes figure so the 50 most important features stay readable.
fig = plt.figure(figsize=(8, 16))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(model, ax=ax, height=0.5, max_num_features=50)

# Prediction

In [None]:
# Predict prices for the test matrix and pair them with the saved test ids.
y_pred = model.predict(dtest)
df_sub = id_test.to_frame(name='id').assign(price_doc=y_pred)

# Write the submission file without the frame index.
df_sub.to_csv('simple_model.csv', index=False)

done :)