# We explore data through plotting graphs.

In [None]:
# Import basic packages
import pandas as pd
import numpy as np
pd.options.display.max_columns = None

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
import plotly.figure_factory as ff
from IPython.display import display

# Output plots in notebook
% matplotlib inline
% config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

### First, we view train.csv and macro.csv to understand the datasets

### train.csv

In [None]:
# Load the training set and preview its first rows.
train_path = "../input/train.csv"
train_data = pd.read_csv(train_path)
train_data.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train_data.shape

In [None]:
# Printing the Index object directly elides the middle of a long column list,
# so convert it to a plain list to show every column name.
print(train_data.columns.tolist())

In [None]:
# Condensed summary of the training frame; verbose=False omits the
# per-column listing.
train_data.info(verbose=False)

In [None]:
# Count missing values per column and show them as a single wide row
# labelled "NaN".
train_nan_counts = train_data.isnull().sum().rename("NaN")
train_nan_counts.to_frame().T

#### The target variable is "price_doc"
* There are 30,421 training rows and 7,662 test rows.
* There are 291 variables.
* Some columns have many NaN values, so we can't simply drop the NaNs.

### macro.csv

In [None]:
# Load the macro-economic table and preview its first rows.
macro_path = "../input/macro.csv"
macro_data = pd.read_csv(macro_path)
macro_data.head()

In [None]:
# (number of rows, number of columns) of the macro frame.
macro_data.shape

In [None]:
# Condensed summary of the macro frame; verbose=False omits the
# per-column listing.
macro_data.info(verbose=False)

In [None]:
# Count missing values per macro column and show them as a single wide row
# labelled "NaN".
macro_nan_counts = macro_data.isnull().sum().rename("NaN")
macro_nan_counts.to_frame().T

### macro data is joined to the train and test sets on the "timestamp".
* There are 100 columns.
* There are many NaN values in some rows.

We can't inspect every column individually because there are so many of them.

# So let's look at the target variable, "price_doc"

### distplot

In [None]:
# Distribution plot of price_doc (the target variable), with the x-axis
# starting at zero.
sns.set(style="whitegrid", font_scale=1.3)
fig, ax = plt.subplots(figsize=(15, 8))
sns.distplot(train_data["price_doc"], ax=ax)
ax.set_xlim(left=0)
ax.set_title("Price_doc distribution")

In [None]:
# Summary statistics of price_doc, laid out as a single row.
price_summary = train_data["price_doc"].describe()
price_summary.to_frame().T

* mean: 7,123,035
* std: 4,780,111

### correlation

In [None]:
# Pairwise correlations between the numeric columns. The string columns are
# excluded explicitly because DataFrame.corr() no longer drops them silently
# in pandas >= 2.0 and raises instead.
corr_data = train_data.select_dtypes(include=["number", "bool"]).corr()

# Correlation of every numeric column with the target, ranked by absolute
# strength. Work on an explicit copy so the added helper columns do not write
# into a slice of corr_data.
corr_target = corr_data[["price_doc"]].copy()
corr_target["sort"] = corr_target["price_doc"].abs()
corr_target["column_name"] = corr_target.index
corr_target = corr_target.sort_values("sort", ascending=False)

In [None]:
# Bar chart of the 15 variables most correlated with price_doc.
# Row 0 of the ranked table is price_doc itself (|corr| == 1), so the top 15
# features are rows 1..15, i.e. the positional slice 1:16.
top_corr = corr_target.iloc[1:16]
data = [go.Bar(
            x=top_corr["column_name"].values,
            y=top_corr["price_doc"].values
        )]
layout = go.Layout(
            title="Top 15 high correlation variables")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="basic_bar")

|Name| Description|
|:-:|:-:|
|num_room|number of living rooms|
|full_sq|total area in square meters, including loggias, balconies and other non-residential areas|
|sport_count_XXX|the number of sport facilities within XXXm of the property|
|trc_count_XXX|the number of shopping malls within XXXm of the property|
|zd_vokzaly_avto_km|Distance (km) to the train station by car|
|sadovoe_km|Distance (km) to the Garden Ring|
|kremlin_km|Distance (km) to the city center|
|bulvar_ring_km|Distance (km) to the Boulevard Ring|
|ttk_km|Distance (km) to the Third Transport Ring|
|office_sqm_XXX|square meters of office within XXXm of the property|
|office_trc_XXX|square meters of shopping malls within XXXm of the property|

In [None]:
# Summary statistics for the 15 features most correlated with price_doc.
# Position 0 of the ranked index is price_doc itself, so the slice is 1:16.
train_data[corr_target.index[1:16]].describe()

In [None]:
# Missing-value counts for the 15 features most correlated with price_doc,
# shown as a single wide row labelled "NaN". Position 0 of the ranked index
# is price_doc itself, so the slice is 1:16.
top_feature_nans = train_data[corr_target.index[1:16]].isnull().sum().rename("NaN")
top_feature_nans.to_frame().T

Looking at the top 15 correlations, we discover the following things.
* num_room and full_sq correlate more strongly with the price than the other size-related variables.
* But the number of rooms is unknown for 9,572 rows.
* The numbers of sport facilities and shopping malls are important variables.
* Distances to the various rings are negatively correlated with price_doc.

### Next merge train and macro data

In [None]:
# Attach the macro indicators to each training row by timestamp. A left join
# keeps every training row even when macro.csv has no entry for its
# timestamp; the default inner join would silently drop such rows.
df = pd.merge(train_data, macro_data, how="left", on="timestamp")
df.head()

In [None]:
# Condensed summary of the merged frame; verbose=False omits the
# per-column listing.
df.info(verbose=False)

### As there are 19 string variables, we convert them into int through dummy variables

In [None]:
# Isolate the non-numeric columns: everything in df that is not part of its
# numeric subset.
numeric_cols = df._get_numeric_data().columns
df_object_cols = df.columns.drop(numeric_cols)
df_object = df.loc[:, df_object_cols]
df_object.head()

We split timestamp into year, month, and day columns.

In [None]:
# Dummy-encode every non-numeric column except the timestamp.
dummies = pd.get_dummies(df_object.drop(labels=["timestamp"], axis=1))

# timestamp: split the "-"-separated string once, then store its first three
# fields as integer Year / Month / Day columns.
date_parts = df_object["timestamp"].str.split("-")
dummies["Year"] = date_parts.str[0].astype("int64")
dummies["Month"] = date_parts.str[1].astype("int64")
dummies["Day"] = date_parts.str[2].astype("int64")

### Concatenate with the original dataframe

In [None]:
# Rebuild df from its numeric columns plus the dummy/date columns, aligned
# side by side. NOTE(review): df is reassigned from itself, so re-running this
# cell would presumably append the dummy columns again.
numeric_part = df._get_numeric_data()
df = pd.concat([numeric_part, dummies], axis=1)
df.head()

In [None]:
# Condensed summary of df after the dummy columns were concatenated;
# verbose=False omits the per-column listing.
df.info(verbose=False)

In [None]:
# Count missing values per column of the final frame and show them as a
# single wide row labelled "NaN".
df_nan_counts = df.isnull().sum().rename("NaN")
df_nan_counts.to_frame().T

# That's it for now