In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the training and test tables. `timestamp` is parsed as datetime so the
# `.dt` accessors used later in the notebook work on it.
data = pd.read_csv('./input/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('./input/test.csv', parse_dates=['timestamp'])

In [None]:
# Preview the first five rows of the training table.
data.head(n=5)

In [None]:
# Summary statistics of the training table.
# NOTE(review): this is not the last expression of the cell, so its result is
# computed but not rendered by the notebook.
data.describe()
# First rows of the `area_m` column (this is what the cell displays).
data.loc[:, "area_m"].head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the target column `price_doc`: 60 bins plus a KDE curve.
plt.figure(figsize=(10, 8))
target_axes = sns.distplot(data['price_doc'].values, kde=True, bins=60)
target_axes.set_xlabel('Price Doc', fontsize=12)
plt.show()

In [None]:
def missing_plot(dataframe, figure_x, figure_y):
    """Draw a horizontal bar chart of missing-value counts per column.

    Only columns with at least one missing value are plotted, ordered from the
    most to the fewest missing entries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose missing values are counted with ``isnull().sum()``.
    figure_x, figure_y : number
        Width and height handed to ``plt.figure(figsize=...)``.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``. When no column has missing
        values a short message is printed and nothing is plotted.
    """
    na_counts = dataframe.isnull().sum().reset_index()
    na_counts.columns = ['column_name', 'na_count']
    na_counts = na_counts[na_counts.na_count > 0]

    # Nothing to draw: avoid handing an empty table to seaborn.
    if na_counts.empty:
        print('No missing values found; nothing to plot.')
        return

    na_counts = na_counts.sort_values(by='na_count', ascending=False)
    plt.figure(figsize=(figure_x, figure_y))
    sns.barplot(x="na_count", y="column_name", data=na_counts, orient="h")
    plt.xlabel('Missing count', fontsize=12)
    plt.show()
#missing_plot(data, 10,45)

In [None]:
# Impute missing values with the per-column mean.
# `numeric_only=True` restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises a TypeError as soon as the frame holds a column it
# cannot average (e.g. string/object columns). Non-numeric columns are left
# untouched, which is also what older pandas versions did implicitly.
data = data.fillna(data.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

In [None]:
# Auxiliary table read from macro.csv; `timestamp` is parsed as datetime
# because it is the key used to merge this table onto the listings.
macro = pd.read_csv(
    './input/macro.csv',
    parse_dates=['timestamp'],
)
def add_dates_to_data(x, macro_df=None):
    """Merge the macro table onto ``x`` and derive date and ratio features.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain a datetime ``timestamp`` column plus ``floor``,
        ``max_floor``, ``kitch_sq``, ``full_sq`` and ``area_m``.
    macro_df : pandas.DataFrame, optional
        Table with a ``timestamp`` column to left-join onto ``x``. Defaults to
        the module-level ``macro`` frame, which keeps existing one-argument
        calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame (``x`` itself is not modified) holding the merged columns,
        the calendar features ``year``, ``month`` and ``dow``, and the ratio
        features ``rel_floor``, ``rel_kitch_sq`` and ``area_rel_neigh``. The
        ``timestamp`` column is dropped.
    """
    if macro_df is None:
        macro_df = macro  # notebook-level table loaded from macro.csv

    # Left join by timestamp: every row of `x` is kept.
    all_info = pd.merge_ordered(x, macro_df, on='timestamp', how='left')

    # Calendar features extracted from the timestamp, which is then removed.
    stamps = all_info['timestamp']
    all_info['year'] = stamps.dt.year
    all_info['month'] = stamps.dt.month
    all_info['dow'] = stamps.dt.dayofweek
    all_info = all_info.drop('timestamp', axis=1)

    # Ratio features. A zero denominator yields +/-inf, which is not a usable
    # feature value (DMatrix construction can reject it), so it is stored as
    # NaN and handled like any other missing value.
    ratio_specs = (
        ('rel_floor', 'floor', 'max_floor'),
        ('rel_kitch_sq', 'kitch_sq', 'full_sq'),
        ('area_rel_neigh', 'full_sq', 'area_m'),
    )
    for feature, numerator, denominator in ratio_specs:
        ratio = all_info[numerator] / all_info[denominator].astype(float)
        all_info[feature] = ratio.replace([np.inf, -np.inf], np.nan)
    return all_info
# Enrich the training and test tables with the same merge and derived
# features. Both names are rebound to the enriched frames, which no longer
# carry a `timestamp` column.
data, test = (add_dates_to_data(frame) for frame in (data, test))

In [None]:
# Feature columns: every column of `data` except the identifier, the target
# and the raw timestamp.
columns = data.columns.difference(
    ["id", "price_doc", "timestamp"]
)

# Training target and feature matrix.
y = data.loc[:, "price_doc"]
x = data.loc[:, columns]

# Keep the test ids before `test` is narrowed to the feature columns.
test_id = test.loc[:, "id"]
test = test.loc[:, columns]

In [None]:
def deal_with_categorical_values(X):
    """Integer-encode the object-dtype columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-object columns of ``X`` followed by its object columns, the
        latter replaced by integer codes from ``pd.factorize``. Missing values
        are encoded as ``-1``.

    Notes
    -----
    Both column groups are selected from the original ``X``. Selecting the
    object columns from the already-filtered frame would always give an empty
    result and silently drop every categorical feature.

    ``sort=True`` makes a label's code depend on its rank among the column's
    distinct values rather than on row order, so two frames holding the same
    set of labels receive identical codes. Frames with different label sets
    can still disagree.
    """
    numeric_part = X.select_dtypes(exclude=['object'])
    encoded_part = X.select_dtypes(include=['object']).copy()

    for column in encoded_part.columns:
        codes, _ = pd.factorize(encoded_part[column], sort=True)
        encoded_part[column] = codes

    return pd.concat([numeric_part, encoded_part], axis=1)
# Run the same categorical handling over the training features and the test
# table, then refresh `columns` from the processed training features.
x, test = (deal_with_categorical_values(frame) for frame in (x, test))
columns = x.columns

In [None]:
# Hold out 15% of the rows for evaluation. The fixed `random_state` makes the
# shuffle reproducible, so a Restart & Run All yields the same split and the
# same downstream metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=42
)

In [None]:
import xgboost as xgb

# Booster settings handed to `xgb.train`.
# NOTE(review): newer XGBoost releases name the objective 'reg:squarederror'
# and use 'verbosity' instead of 'silent' -- confirm against the installed
# version before changing these values.
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0,
}

# The training matrix carries the labels; the hold-out matrix holds features
# only and is meant for prediction.
dtrain = xgb.DMatrix(data=x_train, label=y_train, feature_names=columns)
dtest = xgb.DMatrix(data=x_test, feature_names=columns)

In [None]:
# Fit the booster on the training matrix for a fixed 300 boosting rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=300,
)

In [None]:
# Feature-importance chart of the trained model, limited to 30 features.
fig, ax = plt.subplots(figsize=(8, 16))
xgb.plot_importance(model, ax=ax, height=0.5, max_num_features=30)
plt.show()

In [None]:
from sklearn.metrics import explained_variance_score, mean_absolute_error

# Score the model on the held-out split (`dtest` / `y_test`), i.e. rows the
# booster was not trained on. Scoring `dtrain` against `y_train` would only
# measure how well the model memorised its own training data.
predicted_test_y = model.predict(dtest)
# Predictions are truncated to whole numbers before scoring.
round_predicted_test_y = [int(value) for value in predicted_test_y]
print("mean_absolute_error: %.2f" % mean_absolute_error(y_test.values, round_predicted_test_y))
print("explained_variance_score: %.2f" % explained_variance_score(y_test.values, round_predicted_test_y))

In [None]:
# Wrap the processed test table in a DMatrix and predict its prices.
# `test` is rebound from the DataFrame to the DMatrix here.
test = xgb.DMatrix(data=test, feature_names=columns)
test_y = model.predict(test)

In [None]:
# Two-column result table (test id, predicted price) written to final.csv
# without the index column.
res = pd.DataFrame({
    'id': test_id.values,
    'price_doc': test_y,
})
res.to_csv('final.csv', index=False)