In [1]:
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
import warnings
matplotlib.style.use('ggplot')
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")
%matplotlib inline
import seaborn as sns

### Load Data

In [2]:
# Transactions file: one row per sale, with `transactiondate` parsed to datetime
# so the `.dt` accessor can be used on it later.
train = pd.read_csv(
    filepath_or_buffer='../input/train_2016_v2.csv',
    parse_dates=["transactiondate"],
)

# Property attributes file, read with pandas' default type inference.
properties = pd.read_csv(filepath_or_buffer='../input/properties_2016.csv')

### Merging the DataSets

In [3]:
# Left-join the property attributes onto the transactions by `parcelid`, so
# every transaction row is kept even when no matching property row exists.
train_df_merged = train.merge(properties, how='left', on='parcelid')

# Preview the first rows of the joined frame.
train_df_merged.head()

In [4]:
# Histogram of the `logerror` column: 50 bins, counts only (no KDE curve).
plt.figure(figsize=(12, 8))
sns.distplot(
    a=train['logerror'].values,
    bins=50,
    kde=False,
)
plt.xlabel('logerror', fontsize=12)
plt.show()

### Transaction Analysis

In [5]:
# Month number of each transaction, taken from the parsed transaction date.
train['transaction_month'] = train['transactiondate'].dt.month

# Number of transactions observed in each month.
cnt_srs = train['transaction_month'].value_counts()

plt.figure(figsize=(12,6))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional slot is `data`), while the keyword form
# works on old and new versions alike.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8)
plt.xticks(rotation='vertical')
plt.xlabel('Month of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

In [6]:
# Day of the month of each transaction, taken from the parsed transaction date.
train['transaction_day'] = train['transactiondate'].dt.day

# Number of transactions observed on each day of the month.
cnt_srs = train['transaction_day'].value_counts()

plt.figure(figsize=(12,6))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally (the first positional slot is `data`), while the keyword form
# works on old and new versions alike.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8, color = "#9b59b6")
plt.xticks(rotation='vertical')
plt.xlabel('Day of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()

### Geo Map Analysis

The following columns contain geographic information:
1. latitude, longitude
1. propertycountylandusecode, propertylandusetypeid, propertyzoningdesc
1. regionidcity, regionidcounty, regionidneighborhood, regionidzip
1. censustractandblock, rawcensustractandblock

In [7]:
# Names of the property columns that carry geographic information,
# grouped by theme (one group per line).
geocolumns = [
    # coordinates
    'latitude', 'longitude',
    # land use / zoning
    'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
    # region identifiers
    'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip',
    # census identifiers
    'censustractandblock', 'rawcensustractandblock',
]

In [8]:
# NOTE(review): `gc` is not used in this cell; presumably it is needed further
# down to free memory. Confirm, and consider moving it to the imports cell.
import gc

# Subset of `properties` restricted to the columns listed in `geocolumns`.
geoproperties = properties[geocolumns]

##### Let us explore the latitude and longitude variables to begin with

In [9]:
# Joint scatter plot (with marginal histograms) of latitude against longitude
# for every row of `properties`.
#
# No plt.figure() call beforehand: jointplot always builds its own figure, so a
# preceding plt.figure() only leaves a stray empty figure in the cell output.
# The figure size is set with `height` (the current name of the former `size`
# parameter of seaborn's grid-based plots).
grid = sns.jointplot(
    x=properties.latitude.values,
    y=properties.longitude.values,
    height=10,
)
# Label the joint axes through the returned grid rather than relying on
# whichever axes pyplot currently considers active.
grid.set_axis_labels('Latitude', 'Longitude', fontsize=12)
plt.show()

In [10]:
# Per-column count of missing values in the merged frame, reshaped into a
# two-column table (column name, number of missing values).
nan_df = train_df_merged.isnull().sum(axis=0).reset_index()
nan_df.columns = ['column_name', 'missing_count']

# Fraction of rows in which each column is missing.
nan_df['missing_ratio'] = nan_df['missing_count'] / train_df_merged.shape[0]

# Show the columns that are missing in more than 99.9% of rows.
# `.loc` is used for the boolean-mask selection; the `.ix` indexer is
# deprecated and no longer exists in current pandas releases.
nan_df.loc[nan_df['missing_ratio'] > 0.999]

#### Let us take the variables with high correlation values and then do some analysis on them

In [11]:
col = "finishedsquarefeet12"

# Cap the column at its 0.5th and 99.5th percentiles so that extreme values do
# not dominate the plot.
# np.nanpercentile skips missing values. Plain np.percentile returns NaN as soon
# as the column contains a single NaN, which makes both limits NaN and turns the
# capping below into a silent no-op.
ulimit = np.nanpercentile(train_df_merged[col].values, 99.5)
llimit = np.nanpercentile(train_df_merged[col].values, 0.5)

# Write the capped values back with a single column assignment instead of
# chained indexing through the removed `.ix` accessor (chained assignment may
# modify a temporary copy). Missing values stay missing.
train_df_merged[col] = train_df_merged[col].clip(lower=llimit, upper=ulimit)

# Joint scatter plot of the capped square footage against the log error.
# jointplot builds its own figure, so no plt.figure() call is needed (it would
# only emit an empty figure); the figure size is set through `height`, the
# current name of the former `size` parameter.
grid = sns.jointplot(
    x=train_df_merged.finishedsquarefeet12.values,
    y=train_df_merged.logerror.values,
    height=10,
)
grid.set_axis_labels('Finished Square Feet 12', 'Log Error', fontsize=12)
grid.ax_joint.set_title("Finished square feet 12 Vs Log error", fontsize=15)
plt.show()

In [15]:
print('Correlation with Log Error')

# Pearson correlation of every numeric column with `logerror`, showing the 14
# largest values (logerror's correlation with itself is dropped).
# Numeric/bool columns are selected explicitly: older pandas silently ignored
# non-numeric columns in DataFrame.corr, whereas current pandas raises when the
# frame contains columns that cannot be converted to float.
print(
    train_df_merged
    .select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')['logerror']
    .drop('logerror')
    .sort_values(ascending=False)
    .head(14)
)
print('\n')

In [19]:
# Keep the 14 largest Pearson correlations with `logerror` as a Series
# (index: column name, value: correlation); logerror's correlation with itself
# is dropped.
# Numeric/bool columns are selected explicitly: older pandas silently ignored
# non-numeric columns in DataFrame.corr, whereas current pandas raises when the
# frame contains columns that cannot be converted to float.
corr_series = (
    train_df_merged
    .select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')['logerror']
    .drop('logerror')
    .sort_values(ascending=False)
    .head(14)
)

In [26]:
# Turn the correlation Series into a two-column table.
# The frame is built from `corr_series`: the previous code passed `corr_df`
# to the constructor before `corr_df` had ever been assigned, which raises
# NameError when the notebook is run top to bottom on a fresh kernel.
corr_df = pd.DataFrame(corr_series)
corr_df = corr_df.reset_index()
corr_df.columns = ['column_name', 'correlation']

Continued :)