## Goal
I am new to Python 3 and am using this competition as a chance to get familiar with the language and to try out some NLP and image-processing libraries.
 
In this script, I want to understand the distribution of the data and find the correlations between variables.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from subprocess import check_output
# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

### Import Data Set

In [None]:
# Read the training and testing listings, then report the shape of each frame.
train_df, test_df = (
    pd.read_json(json_path)
    for json_path in ("../input/train.json", "../input/test.json")
)
shape_report = (
    ('Training data dimensions:', train_df),
    ('Testing data dimensions:', test_df),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

In [None]:
# Peek at the first five training rows.
train_df.head(n=5)

In [None]:
# Print the column summary (dtypes and non-null counts) of the training frame.
train_df.info()

In [None]:
# Print the column summary (dtypes and non-null counts) of the testing frame.
test_df.info()

The data set has no missing values.

### Check  Target Variables
The interest levels in the training set are not evenly distributed, which may cause the model to overfit on low and underfit on medium and high. We can create two additional data sets and weight them in our final model.

In [None]:
# Count the listings per interest level to expose the class imbalance.
int_level = train_df['interest_level'].value_counts()
plt.figure(figsize=(10,5))
# Pass the data vectors by keyword: newer seaborn releases no longer accept
# x and y positionally, while the keyword form works on old and new versions.
sns.barplot(x=int_level.index, y=int_level.values, alpha=0.8)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Interest level', fontsize=12)
plt.show()

## Simple Features

We want to see the effect of price on interest level. A higher price tends to go with lower interest.

In [None]:

# Bar plot of price per interest level, with the levels in a fixed low -> high order.
plt.figure(figsize=(10,5))
# Use keyword arguments (x / y / data), matching the violin plots in this notebook:
# newer seaborn releases no longer accept x and y positionally.
sns.barplot(x='interest_level', y='price', data=train_df, order=['low','medium','high'])
plt.title('Distribution of Interest Level by Price')
plt.ylabel('Price')
plt.xlabel('Interest level')
plt.show()

The numbers of bathrooms and bedrooms do not appear to have a significant effect on interest level.

In [None]:
# Cap the bathroom count at 3 so a handful of extreme listings do not stretch the plot.
# A single .loc assignment replaces the removed `.ix` indexer and avoids chained
# indexing; train_df is still modified in place, as before.
train_df.loc[train_df['bathrooms'] > 3, 'bathrooms'] = 3
plt.figure(figsize=(10,5))
sns.violinplot(x='interest_level', y='bathrooms', data=train_df, order=['low','medium','high'])
plt.xlabel('Interest level', fontsize=12)
plt.ylabel('Bathrooms', fontsize=12)
plt.show()

In [None]:
# Violin plot of the bedroom count for each interest level (low -> high).
fig, ax = plt.subplots(figsize=(10,5))
sns.violinplot(
    x='interest_level',
    y='bedrooms',
    data=train_df,
    order=['low','medium','high'],
    ax=ax,
)
ax.set_xlabel('Interest level', fontsize=12)
ax.set_ylabel('Bedrooms', fontsize=12)
plt.show()

## Visualize Interest on Map
This will give us more features in our later model
https://www.kaggle.com/xchiron/two-sigma-connect-rental-listing-inquiries/exploratory-analysis-for-rental-listing-inquiries

In [None]:
terrain = sns.color_palette(palette='terrain',n_colors=10)
plasma = sns.color_palette(palette='plasma',n_colors=10)
%matplotlib inline

from bokeh.io import output_notebook
from bokeh.layouts import gridplot,row,column
from bokeh.plotting import figure,show
output_notebook()

p = figure(title="interest level based on geography",y_range=(40.65,40.85),x_range=(-74.05,-73.85))
p.xaxis.axis_label = 'latitude'
p.yaxis.axis_label = 'longitude'
lowLat=train_df['latitude'][train_df['interest_level']=='low']
lowLong=train_df['longitude'][train_df['interest_level']=='low']
medLat=train_df['latitude'][train_df['interest_level']=='medium']
medLong=train_df['longitude'][train_df['interest_level']=='medium']
highLat=train_df['latitude'][train_df['interest_level']=='high']
highLong=train_df['longitude'][train_df['interest_level']=='high']
p.circle(lowLong,lowLat,size=3,color=terrain.as_hex()[1],fill_alpha=0.1,line_alpha=0.1,legend='low')
p.circle(medLong,medLat,size=3,color=plasma.as_hex()[9],fill_alpha=0.1,line_alpha=0.1,legend='med')
p.circle(highLong,highLat,size=3,color=plasma.as_hex()[5],fill_alpha=0.1,line_alpha=0.1,legend='high')
show(p, notebook_handle=True)

## Text Mining Word Cloud
https://www.kaggle.com/cantstopwontstop/two-sigma-connect-rental-listing-inquiries/simple-exploration-notebook-2-connect-e113a9

In [None]:
from wordcloud import WordCloud


def _as_single_token(phrase):
    """Trim `phrase` and join its space-separated words with underscores.

    This keeps a multi-word feature or address together as one token in the
    word cloud.
    """
    return "_".join(phrase.strip().split(" "))


# Build each text with a single join instead of re-concatenating the whole
# string for every row (quadratic). This also avoids a `for ... row in ...`
# loop variable that would overwrite the `row` imported from bokeh.layouts.
text = " ".join(
    _as_single_token(feature)
    for feature_list in train_df['features']
    for feature in feature_list
).strip()
text_da = " ".join(
    _as_single_token(address) for address in train_df['display_address']
).strip()
# The description text is not collected yet; keep the name defined as an empty string.
text_desc = ''

plt.figure(figsize=(12,6))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text)
# Fixed random_state keeps the colours the same on every run.
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for features", fontsize=30)
plt.axis("off")
plt.show()


In [None]:
# Word cloud built from the underscore-joined display addresses.
address_cloud_options = dict(
    background_color='white',
    width=600,
    height=300,
    max_font_size=50,
    max_words=40,
)
wordcloud = WordCloud(**address_cloud_options)
wordcloud.generate(text_da)
# Fixed random_state keeps the colours the same on every run.
wordcloud.recolor(random_state=0)

fig, ax = plt.subplots(figsize=(12,6))
ax.imshow(wordcloud)
ax.set_title("Wordcloud for Display Address", fontsize=30)
ax.axis("off")
plt.show()