# Zillow Prize | Exploration of all 58 features

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Zillow data dictionary

In [None]:
# Load the data dictionary and strip apostrophes from the feature names so they can be
# compared with plain column names later on. The vectorised `.str` accessor leaves
# missing cells as NaN instead of raising like a per-element `x.replace` call would.
features_dictionary = pd.read_excel('../input/zillow_data_dictionary.xlsx')
features_dictionary['Feature'] = features_dictionary['Feature'].str.replace("'", "", regex=False)
features_dictionary.head()

In [None]:
n_features = len(features_dictionary)

## Property data analysis

- **properties_2016.csv - all the properties with their home features for 2016.** Note: Some 2017 new properties don't have any data yet except for their parcelid's. Those data points should be populated when properties_2017.csv is available.

- properties_2017.csv - all the properties with their home features for 2017 (will be available on 10/2/2017)

- **train_2016.csv - the training set with transactions from 1/1/2016 to 12/31/2016**

- train_2017.csv - the training set with transactions from 1/1/2017 to 9/15/2017 (will be available on 10/2/2017) 

Features :

- 58 property features
- Transaction date

To predict :

- Log error of the Zestimate: $logerror = \log(Zestimate) - \log(SalePrice)$, for October 2016 (201610), November 2016 (201611), December 2016 (201612), October 2017 (201710), November 2017 (201711), and December 2017 (201712)


**properties_2016.csv : 2 985 217 properties**

In [None]:
# Read the 2016 property table into memory and preview its first rows.
properties_df = pd.read_csv('../input/properties_2016.csv')
properties_df.head()

In [None]:
properties_df.shape

In [None]:
# Fraction of missing values per column, most complete features first.
missing_df = (
    properties_df.isnull().mean()
    .to_frame('Missing')
    .sort_values(by='Missing', ascending=True)
)

# Bar positions are derived from missing_df itself (not from the length of the data
# dictionary), so the chart stays valid even if the two tables disagree on the
# number of features.
bar_positions = np.arange(len(missing_df))

fig, ax = plt.subplots(figsize=(10, 15))
ax.barh(bar_positions, missing_df['Missing'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(missing_df.index)
ax.set_xlabel('Fraction of missing values')
ax.set_title('Missing values per feature')
plt.show()

Let's analyze the features one by one, from the most complete to the most sparse (i.e. in order of increasing proportion of missing values).

In [None]:
def summary(f):
    """Print a short profile of one property feature.

    Prints the description found in ``features_dictionary``, the dtype of the
    column in ``properties_df`` and the percentage of missing values recorded
    in ``missing_df``.

    Parameters
    ----------
    f : str
        Name of a column of ``properties_df``.
    """
    # A feature without a dictionary entry would make `.values[0]` raise IndexError;
    # fall back to a placeholder so the dtype and missing rate are still reported.
    descriptions = features_dictionary.loc[features_dictionary['Feature'] == f, 'Description']
    description = descriptions.iloc[0] if len(descriptions) else '(not found in the data dictionary)'
    print('Description: {}'.format(description))
    print('Type: {}'.format(properties_df[f].dtype))
    print('Missing: {}%'.format(100*missing_df.loc[f].values[0]))

### parcelid

In [None]:
summary('parcelid')

In [None]:
properties_df['parcelid'].min(), properties_df['parcelid'].mean(), properties_df['parcelid'].max()

In [None]:
fig,ax = plt.subplots()
sns.distplot(np.log10(properties_df['parcelid']), kde=False, ax=ax)

- 2 976 612 properties have a parcelid between 10 711 725 and 26 600 747
- 1 383 have a parcelid between 74 267 814 and 100 000 000
- 7 222 have a parcelid between 150 000 000 and 169 601 949

### fips

In [None]:
summary('fips')

In [None]:
# Translate each FIPS code to its county label before counting, so every bar's tick
# label is tied to its code instead of relying on the bars coming out in a fixed order.
# Codes outside this mapping become NaN and are not drawn.
county_names = {6037: '6037 (Los Angeles)', 6059: '6059 (Orange)', 6111: '6111 (Ventura)'}
fig, ax = plt.subplots()
sns.countplot(x=properties_df['fips'].map(county_names), order=list(county_names.values()), ax=ax)

### propertylandusetypeid

In [None]:
summary('propertylandusetypeid')

In [None]:
# Count of properties per land-use type id. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
fig, ax = plt.subplots(figsize=(8,3))
sns.countplot(x=properties_df['propertylandusetypeid'], ax=ax)

### rawcensustractandblock

In [None]:
summary('rawcensustractandblock')

In [None]:
len(properties_df['rawcensustractandblock'].unique())

### regionidcounty

In [None]:
summary('regionidcounty')

In [None]:
# Count of properties per county region id. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
fig, ax = plt.subplots(figsize=(8,3))
sns.countplot(x=properties_df['regionidcounty'], ax=ax)

### longitude/latitude

In [None]:
summary('longitude')
summary('latitude')

In [None]:
# Scatter a random subset of the geolocated properties, coloured by county (FIPS code).
# Coordinates and county are sampled from the same rows of one jointly cleaned frame:
# dropping NaNs column by column and then indexing with random integers selects by
# index label, which misaligns the columns or fails once any row has been dropped.
fips_colors = {6037: 'red', 6059: 'blue', 6111: 'green'}
geo_df = properties_df[['longitude', 'latitude', 'fips']].dropna()
# Fixed seed so the figure is reproducible on every run.
geo_sample = geo_df.sample(n=min(10000, len(geo_df)), random_state=0)
# Any county code without an assigned colour is drawn in gray rather than breaking the plot.
point_colors = geo_sample['fips'].map(fips_colors).fillna('gray').tolist()
plt.scatter(geo_sample['longitude'], geo_sample['latitude'], c=point_colors)

### assessmentyear

In [None]:
summary('assessmentyear')

In [None]:
(properties_df['assessmentyear'] == 2015).mean()

99.57% of all homes were assessed in 2015!

### bedroomcnt

In [None]:
summary('bedroomcnt')

In [None]:
# Count of properties per number of bedrooms. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
fig, ax = plt.subplots(figsize=(8,3))
sns.countplot(x=properties_df['bedroomcnt'], ax=ax)

### bathroomcnt

In [None]:
summary('bathroomcnt')

In [None]:
# Count of properties per number of bathrooms. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
fig, ax = plt.subplots(figsize=(12,3))
sns.countplot(x=properties_df['bathroomcnt'], ax=ax)

### roomcnt

In [None]:
summary('roomcnt')

In [None]:
# Count of properties per total room count. The column is passed as `x=` because
# newer seaborn releases treat the first positional argument as `data`.
fig, ax = plt.subplots(figsize=(12,3))
sns.countplot(x=properties_df['roomcnt'], ax=ax)

### propertycountylandusecode

In [None]:
summary('propertycountylandusecode')

In [None]:
len(properties_df['propertycountylandusecode'].unique())

In [None]:
# One panel per county (FIPS code) with the counts of each county land-use code.
fig, ax = plt.subplots(3, 1, figsize=(12,6))
for i, county in enumerate([6037, 6059, 6111]):
    # `.loc` selects rows and column in one step instead of chained indexing.
    county_codes = properties_df.loc[properties_df['fips'] == county, 'propertycountylandusecode']
    # Passed as `x=`: newer seaborn treats the first positional argument as `data`.
    sns.countplot(x=county_codes, ax=ax[i])
    ax[i].set_title('fips = {}'.format(county))
fig.tight_layout()

### regionidzip

In [None]:
summary('regionidzip')

In [None]:
properties_df['regionidzip'].max()

In [None]:
(properties_df['regionidzip'] == properties_df['regionidzip'].max()).sum()

We have 431 "outliers" with a zip code of 399675.

In [None]:
# Histogram of the zip-code ids, leaving out ids of 300000 and above.
zip_ids = properties_df['regionidzip']
sns.distplot(zip_ids[zip_ids < 300000].dropna(), kde=False)

### taxamount

In [None]:
summary('taxamount')

In [None]:
properties_df['taxamount'].mean()

In [None]:
sns.distplot(np.log10(properties_df['taxamount'].dropna()), kde=False)

### taxvaluedollarcnt

In [None]:
summary('taxvaluedollarcnt')

In [None]:
properties_df['taxvaluedollarcnt'].mean()

In [None]:
sns.distplot(np.log10(properties_df['taxvaluedollarcnt'].dropna()), kde=False)

### structuretaxvaluedollarcnt

In [None]:
summary('structuretaxvaluedollarcnt')

In [None]:
properties_df['structuretaxvaluedollarcnt'].mean()

In [None]:
sns.distplot(np.log10(properties_df['structuretaxvaluedollarcnt'].dropna()), kde=False)

### calculatedfinishedsquarefeet

In [None]:
summary('calculatedfinishedsquarefeet')

In [None]:
sns.distplot(np.log10(properties_df['calculatedfinishedsquarefeet'].dropna()), kde=False)

### yearbuilt

In [None]:
summary('yearbuilt')

In [None]:
properties_df['yearbuilt'].min(), properties_df['yearbuilt'].max()

In [None]:
sns.distplot(properties_df['yearbuilt'].dropna(), kde=False)

### regionidcity

In [None]:
summary('regionidcity')

In [None]:
# One panel per county (FIPS code) with the counts of each city region id.
fig, ax = plt.subplots(3, 1, figsize=(12,6))
for i, county in enumerate([6037, 6059, 6111]):
    # `.loc` selects rows and column in one step instead of chained indexing.
    county_cities = properties_df.loc[properties_df['fips'] == county, 'regionidcity']
    # Passed as `x=`: newer seaborn treats the first positional argument as `data`.
    sns.countplot(x=county_cities, ax=ax[i])
    ax[i].set_title('fips = {}'.format(county))
fig.tight_layout()

### landtaxvaluedollarcnt

In [None]:
summary('landtaxvaluedollarcnt')

In [None]:
sns.distplot(np.log10(properties_df['landtaxvaluedollarcnt'].dropna()), kde=False)

### censustractandblock
Same as rawcensustractandblock. We can drop the raw version.

In [None]:
summary('censustractandblock')

In [None]:
properties_df['censustractandblock'][1000], properties_df['rawcensustractandblock'][1000]

### fullbathcnt

In [None]:
summary('fullbathcnt')

In [None]:
# Count of properties per number of full bathrooms (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['fullbathcnt'])

### calculatedbathnbr

In [None]:
summary('calculatedbathnbr')

In [None]:
# Count of properties per calculated bathroom number (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['calculatedbathnbr'])

### finishedsquarefeet12

In [None]:
summary('finishedsquarefeet12')

In [None]:
sns.distplot(np.log10(properties_df['finishedsquarefeet12'].dropna()), kde=False)

### lotsizesquarefeet

In [None]:
summary('lotsizesquarefeet')

In [None]:
sns.distplot(np.log10(properties_df['lotsizesquarefeet'].dropna()), kde=False)

### propertyzoningdesc

In [None]:
summary('propertyzoningdesc')

In [None]:
properties_df['propertyzoningdesc'].unique().size

We won't use this feature for the moment.

### unitcnt

In [None]:
summary('unitcnt')

In [None]:
properties_df['unitcnt'].min(), properties_df['unitcnt'].mean(), properties_df['unitcnt'].max()

In [None]:
# Count of properties per unit count, with values clipped to the range [0, 10] so the
# long tail collapses into the last bar (`x=` keeps the call valid on newer seaborn).
sns.countplot(x=np.clip(properties_df['unitcnt'], 0, 10))

### buildingqualitytypeid

In [None]:
summary('buildingqualitytypeid')

In [None]:
properties_df['buildingqualitytypeid'].mean()

In [None]:
properties_df['buildingqualitytypeid'].unique()

In [None]:
# Count of properties per building quality type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['buildingqualitytypeid'])

### heatingorsystemtypeid

In [None]:
summary('heatingorsystemtypeid')

In [None]:
# Count of properties per heating/system type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['heatingorsystemtypeid'])

### regionidneighborhood

In [None]:
summary('regionidneighborhood')

In [None]:
properties_df['regionidneighborhood'].unique().size

### garagecarcnt

In [None]:
summary('garagecarcnt')

In [None]:
# Count of properties per garage car count, with values clipped to the range [0, 10]
# so the long tail collapses into the last bar (`x=` keeps the call valid on newer seaborn).
sns.countplot(x=np.clip(properties_df['garagecarcnt'], 0, 10))

### garagetotalsqft

In [None]:
summary('garagetotalsqft')

In [None]:
sns.distplot(np.clip(properties_df['garagetotalsqft'].dropna(), 0, 2000), kde=False)

### airconditioningtypeid

In [None]:
summary('airconditioningtypeid')

In [None]:
# Count of properties per air-conditioning type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['airconditioningtypeid'])

### numberofstories

In [None]:
summary('numberofstories')

In [None]:
# Count of properties per number of stories (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['numberofstories'])

### poolcnt

In [None]:
summary('poolcnt')

In [None]:
properties_df['poolcnt'].unique()

### pooltypeid7

In [None]:
summary('pooltypeid7')

In [None]:
properties_df['pooltypeid7'].unique()

### fireplacecnt

In [None]:
summary('fireplacecnt')

In [None]:
# Count of properties per fireplace count, with missing values counted as 0
# (`x=` keeps the call valid on newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['fireplacecnt'].fillna(0.))

### threequarterbathnbr

In [None]:
summary('threequarterbathnbr')

In [None]:
# Count of properties per number of 3/4 bathrooms, with missing values counted as 0
# (`x=` keeps the call valid on newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['threequarterbathnbr'].fillna(0.))

### finishedfloor1squarefeet

In [None]:
summary('finishedfloor1squarefeet')

In [None]:
sns.distplot(np.log10(properties_df['finishedfloor1squarefeet'].dropna()), kde=False)

### finishedsquarefeet50

In [None]:
summary('finishedsquarefeet50')

In [None]:
# Mean absolute difference between the two features. The subtraction aligns on the
# index, so rows missing either value give NaN and are skipped by mean().
sqft50 = properties_df['finishedsquarefeet50'].dropna()
floor1_sqft = properties_df['finishedfloor1squarefeet'].dropna()
(sqft50 - floor1_sqft).abs().mean()

Both features are approximately the same but not exactly... Maybe we can get rid of one of the two.

In [None]:
sns.distplot(np.log10(properties_df['finishedsquarefeet50'].dropna()), kde=False)

### finishedsquarefeet15

In [None]:
summary('finishedsquarefeet15')

In [None]:
sns.distplot(np.log10(properties_df['finishedsquarefeet15'].dropna()), kde=False)

### yardbuildingsqft17

In [None]:
summary('yardbuildingsqft17')

In [None]:
sns.distplot(np.log10(properties_df['yardbuildingsqft17'].dropna()), kde=False)

### hashottuborspa

In [None]:
summary('hashottuborspa')

In [None]:
properties_df['hashottuborspa'].unique()

### taxdelinquencyyear

In [None]:
summary('taxdelinquencyyear')

In [None]:
properties_df['taxdelinquencyyear'].unique()

In [None]:
# Expand the two-digit delinquency year to four digits: values above 80 are read as
# 19xx, everything else as 20xx (NaN stays NaN). Done with vectorised arithmetic
# rather than a Python-level apply over every row.
delinquency_yy = properties_df['taxdelinquencyyear']
(delinquency_yy + np.where(delinquency_yy > 80, 1900, 2000)).unique()

In [None]:
# Count of properties per four-digit delinquency year. Two-digit values above 80 are
# read as 19xx, everything else as 20xx (NaN stays NaN and is not counted).
delinquency_yy = properties_df['taxdelinquencyyear']
delinquency_year = delinquency_yy + np.where(delinquency_yy > 80, 1900, 2000)
fig, ax = plt.subplots(figsize=(10,3))
# Passed as `x=`: newer seaborn treats the first positional argument as `data`.
sns.countplot(x=delinquency_year, ax=ax)

### taxdelinquencyflag

In [None]:
summary('taxdelinquencyflag')

In [None]:
properties_df['taxdelinquencyflag'].unique()

### pooltypeid10

In [None]:
summary('pooltypeid10')

In [None]:
properties_df['pooltypeid10'].unique()

### pooltypeid2

In [None]:
summary('pooltypeid2')

In [None]:
properties_df['pooltypeid2'].unique()

### poolsizesum

In [None]:
summary('poolsizesum')

In [None]:
sns.distplot(np.log10(properties_df['poolsizesum'].dropna()), kde=False)

### finishedsquarefeet6

In [None]:
summary('finishedsquarefeet6')

In [None]:
sns.distplot(np.log10(properties_df['finishedsquarefeet6'].dropna()), kde=False)

### decktypeid

In [None]:
summary('decktypeid')

In [None]:
properties_df['decktypeid'].unique()

### buildingclasstypeid

In [None]:
summary('buildingclasstypeid')

In [None]:
# Count of properties per building class type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['buildingclasstypeid'])

### finishedsquarefeet13

In [None]:
summary('finishedsquarefeet13')

In [None]:
sns.distplot(np.log10(properties_df['finishedsquarefeet13'].dropna()), kde=False)

### typeconstructiontypeid

In [None]:
summary('typeconstructiontypeid')

In [None]:
# Count of properties per construction type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['typeconstructiontypeid'])

### architecturalstyletypeid

In [None]:
summary('architecturalstyletypeid')

In [None]:
properties_df['architecturalstyletypeid'].unique()

In [None]:
# Count of properties per architectural style type id (`x=` keeps the call valid on
# newer seaborn, where the first positional argument is `data`).
sns.countplot(x=properties_df['architecturalstyletypeid'])

### fireplaceflag

In [None]:
summary('fireplaceflag')

This is a tag (flag) feature.

### yardbuildingsqft26

In [None]:
summary('yardbuildingsqft26')

In [None]:
sns.distplot(np.log10(properties_df['yardbuildingsqft26'].dropna()), kde=False)

### basementsqft

In [None]:
summary('basementsqft')

In [None]:
sns.distplot(np.log10(properties_df['basementsqft'].dropna()), kde=False)

### storytypeid

In [None]:
summary('storytypeid')

In [None]:
properties_df['storytypeid'].unique()

**Ok, we now have a good overview of the features present in the properties 2016 data, and we can start filling all those NaNs and pre-processing the data before fitting some models on it!** 