## Load the data ##

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Run `ls ../input` and print its decoded output so the available data files
# show up in the cell output.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the train and test sets; `timestamp` is parsed into datetimes on load.
train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])

# Stack the train rows (with the target `price_doc` removed) on top of the
# test rows under a fresh 0..n-1 index, giving a single combined frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
data = pd.concat(
    [train.drop(labels='price_doc', axis=1), test],
    ignore_index=True,
)

In [None]:
# Uncomment to inspect the (rows, columns) of the combined `data` frame.
#data.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# dtype of every column in the training frame.
train.dtypes

In [None]:
# Histogram (KDE overlay disabled) of the raw `price_doc` values on a 10x5 figure.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm
# against the installed version before upgrading.
plt.figure(figsize=(10, 5))
sns.distplot(train.price_doc, kde = False)

The price distribution is highly skewed, so it is better to apply a log transform.

In [None]:
# Histogram (KDE overlay disabled) of log(1 + price_doc), drawn in orange.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm
# against the installed version before upgrading.
plt.figure(figsize=(10, 5))
sns.distplot(np.log1p(train.price_doc), color = "orange", kde = False)
plt.xlabel("log(1+price)")

In [None]:
# Store log(1 + price_doc) as a new column `price_log` on the training frame.
train['price_log'] = np.log1p(train['price_doc'])

### Missing Values ###

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Print "<column>: <number of missing values>" for every column of `train`
# that contains at least one null, in column order.
# The null counts are computed once for the whole frame instead of
# boolean-filtering the frame twice for every column.
null_counts = train.isnull().sum()
for col, n_missing in null_counts[null_counts > 0].items():
    print("{0}: {1}".format(col, int(n_missing)))

### Seasonality ###

In [None]:
# Derive the calendar year ("YYYY") and zero-padded month ("MM") as strings
# from `timestamp`. The vectorized `.dt` accessor replaces a per-row Python
# lambda and yields missing values for missing timestamps instead of raising.
train['year'] = train['timestamp'].dt.strftime("%Y")
train['month'] = train['timestamp'].dt.strftime("%m")

In [None]:
# Box plot of `price_log` grouped by the `year` column.
sns.boxplot(data=train, x = 'year', y = 'price_log')

In [None]:
# Box plot of `price_log` grouped by the `month` column, on a wide 12x4 figure.
plt.figure(figsize = (12, 4))
sns.boxplot(data=train, x= 'month', y = 'price_log')

The median house price barely depends on the month in which the houses are sold, but increases slightly from year to year.
**From the macro.csv data set, we could look for features that also increase from year to year and select those that may affect the housing price the most.**

In [None]:
# Read macro.csv, parsing `timestamp` into datetimes on load.
macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])

In [None]:
# Derive the calendar year ("YYYY") as a string from `timestamp`.
# The vectorized `.dt` accessor replaces a per-row Python lambda and yields
# missing values for missing timestamps instead of raising.
macro['year'] = macro['timestamp'].dt.strftime("%Y")

In [None]:
# Plot `deposits_value` and `mortgage_value` against `year`, one subplot per series.
macro.plot(x = 'year', y = ['deposits_value', 'mortgage_value'], subplots=True)

In [None]:
# Add the element-wise ratio cpi / ppi as a new column, then plot the two
# source series and the ratio against `year`, one subplot each.
macro['cpi_per_ppi'] = macro['cpi'].div(macro['ppi'])
macro.plot(
    x='year',
    y=['cpi', 'ppi', 'cpi_per_ppi'],
    subplots=True,
)

In [None]:
# Plot the `usdrub` and `eurrub` series against `year` on a single set of axes.
macro.plot(x = 'year', y = ['usdrub', 'eurrub'])

In [None]:
# Box plot of `income_per_cap` grouped by `year` in the macro frame.
sns.boxplot(x = 'year', y = 'income_per_cap', data = macro)

**Explored a few features in macro.csv with seasonality. So far they don't seem to be useful for housing price prediction. Won't use them for now.**

In [None]:
# Scatter plot with a fitted regression line of `price_log` against
# `build_year`, restricted to rows with 1900 < build_year <= 2016.
# Rows with a missing build_year fail both comparisons (NaN compares False),
# so they are excluded by the mask itself.
build_year_in_range = (train['build_year'] > 1900) & (train['build_year'] <= 2016)
sns.regplot(x='build_year', y='price_log', data=train[build_year_in_range])

In [None]:
# Box plot of `price_log` grouped by the `state` column of the training frame.
sns.boxplot(x = 'state', y = 'price_log', data = train)

In [None]:
# Number of training rows per distinct `state` value.
train['state'].value_counts()

## Data Cleaning ##

Drop the columns with too many NA's

In [None]:
# Build the list of column names to drop:
#   * every column whose name contains "build_count",
#   * every column whose name contains "cafe_sum" or "cafe_avg",
#   * three explicitly named columns.
drop_col = train.filter(regex='build_count', axis=1).columns.values.tolist()
# Use an alternation group here. The character class `[sum|avg]` would match
# "cafe_" followed by any single one of the characters s, u, m, |, a, v, g,
# which is not the same as matching "cafe_sum" or "cafe_avg".
drop_col.extend(train.filter(regex='cafe_(?:sum|avg)', axis=1).columns.values.tolist())
drop_col.extend(['hospital_beds_raion', 'build_year', 'max_floor'])
drop_col

In [None]:
# Remove the columns listed in `drop_col` from the combined frame.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this
# cell re-runnable: a second run finds the columns already gone and is a
# no-op rather than raising KeyError.
data = data.drop(labels=drop_col, axis=1, errors='ignore')

In [None]:
# Recode `state` value 33 as 3 in the combined frame; all other values
# (including missing ones) are left untouched.
# NOTE(review): presumably 33 is a data-entry typo for 3 — confirm.
data['state'] = data['state'].replace(33, 3)