In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Standard library
import datetime

# Wrangling
import numpy as np
import pandas as pd
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Statistical Tests
import scipy.stats as stats
from scipy.stats import norm

# Modeling
from sklearn.cluster import KMeans

# Visualizing
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

# Project-local modules
import viz_kmeans
import env
from wrangle import (
    clean_zillow,
    missing_zero_values_table,
    features_missing,
    handle_missing_values,
    get_zillow_data,
    split,
    seperate_y,
    scale_data,
    split_seperate_scale,
)
import explore

pd.options.display.float_format = '{:20,.2f}'.format

In [None]:
# Read the cached Zillow CSV and pass it straight through the wrangle cleaning step.
df = pd.read_csv("zillowcluster_df.csv").pipe(clean_zillow)

In [None]:
# Preview the first rows of the cleaned frame returned by clean_zillow.
df.head()

In [None]:
# Run the wrangle helper on the cleaned frame; presumably it summarises missing
# and zero values per column (see wrangle.missing_zero_values_table to confirm).
missing_zero_values_table(df)

In [None]:
# Partition the cleaned frame into train / validate / test using wrangle.split.
# The split proportions and any seed live in wrangle and are not visible here.
train, validate, test = split(df)

In [None]:
# Break each split into a feature part (X_*) and a target part (y_*) via
# wrangle.seperate_y. Presumably the target is `logerror` -- confirm in wrangle.
X_train, y_train, X_validate, y_validate, X_test, y_test = seperate_y(train, validate, test)

In [None]:
# Scale the three splits with wrangle.scale_data.
# NOTE(review): the full train/validate/test frames are passed here rather than
# the X_* feature frames -- confirm the target column is not scaled by mistake.
train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test)

In [None]:
# Keep only the columns of train whose dtype is not object, uint8 or datetime64[ns];
# the remaining columns are treated as the quantitative variables below.
num_d = train.select_dtypes(exclude=['object', 'uint8', 'datetime64[ns]'])

In [None]:

# Column names of the numeric subset, as a plain Python list.
quant_vars = num_d.columns.tolist()

In [None]:
# Display the list of quantitative column names built from num_d.
quant_vars

In [None]:
def heat_map(train):
    """Draw an annotated correlation heatmap for the given DataFrame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose pairwise column correlations (``train.corr()``) are plotted.

    Returns
    -------
    The object returned by ``sns.heatmap`` for the drawn heatmap.
    """
    plt.figure(figsize=(22, 14))
    # Correlate the frame that was passed in. The previous version read the
    # notebook-level `df` instead, silently ignoring the `train` argument.
    return sns.heatmap(train.corr(), cmap='RdYlBu', annot=True, center=0)

In [None]:
def pair_plot(df):
    """Draw a seaborn pair plot of ``df`` and return whatever ``sns.pairplot`` returns."""
    return sns.pairplot(df)

In [None]:

# Bar chart of logerror by county code (fips) on the training split.
sns.barplot(x='fips', y='logerror', data=train)
plt.title("Is the logerror different by county?")
plt.show()

In [None]:

# logerror against finished square footage, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='calculatedfinishedsquarefeet',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()



In [None]:
# logerror against the binned acreage feature, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='acres_bin',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()


In [None]:
# logerror against the binned age feature, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='age_bin',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()

In [None]:
# logerror against bathroom count, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='bathroomcnt',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()

In [None]:
# logerror against tax rate, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='taxrate',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()

In [None]:
# logerror against the baths_per_sqft feature, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='baths_per_sqft',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()

In [None]:
# logerror against lot size in square feet, one facet (and colour) per county,
# with the fitted regression line drawn in red.
sns.lmplot(
    data=train,
    x='lotsizesquarefeet',
    y='logerror',
    col='fips',
    hue='fips',
    palette={6037: "cyan", 6059: "m", 6111: "orange"},
    fit_reg=True,
    line_kws=dict(color='red'),
)
plt.show()

In [None]:

# Joint plot of acreage vs. log error with a regression fit (kind='reg').
logroom = sns.jointplot(data=train, x="acres", y="logerror", kind='reg')

# Take the first line artist on the joint axes (presumably the regression fit),
# raise its z-order and colour it red so it stands out.
regline = logroom.ax_joint.get_lines()[0]
regline.set_zorder(5)
regline.set_color('red')

Log error appears to become more positive as acreage increases, but a few outliers may be driving this trend.

In [None]:
# Pairwise plots for the target and four selected columns of the training split.
sns.pairplot(
    data=train[['logerror', 'roomcnt', 'latitude', 'acres', 'taxrate']],
    corner=False,
)

In [None]:
# Box plots to eyeball outliers in the target and two features.
# The panels occupy the first three slots of a 2x2 grid.
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
sns.boxplot(y=train['logerror'])

plt.subplot(2, 2, 2)
sns.boxplot(y=train['taxrate'])

plt.subplot(2, 2, 3)
sns.boxplot(y=train['acres'])

There are a lot of outliers in acreage and tax rate; they probably need to be handled.

In [None]:
# IQR-based outlier bounds for `acres`, computed over the full cleaned frame `df`.
# NOTE: the `*_income` names are misleading (the values describe acreage); they are
# kept unchanged in case other cells read them.
k = 3.0
q1, q3 = df.acres.quantile(0.25), df.acres.quantile(0.75)
iqr = q3 - q1

# Bounds sit k interquartile ranges outside the quartiles.
lower_bound_income = q1 - k * iqr
upper_bound_income = q3 + k * iqr

upper_bound_income

In [None]:
# Median of the acres column across the whole cleaned frame `df`.
df.acres.median()

In [None]:
# IQR-based outlier bounds for `taxrate`, computed over the full cleaned frame `df`.
# NOTE: the `*_income` names are misleading (the values describe tax rate); they are
# kept unchanged in case other cells read them. This cell overwrites the values
# assigned by the acreage cell.
k = 3.0
q1, q3 = df.taxrate.quantile(0.25), df.taxrate.quantile(0.75)
iqr = q3 - q1

# Bounds sit k interquartile ranges outside the quartiles.
lower_bound_income = q1 - k * iqr
upper_bound_income = q3 + k * iqr

upper_bound_income

In [None]:

# Summary statistics for the target (log error), followed by a 40-bin histogram
# restricted to rows where |logerror| < 0.75.
print(train['logerror'].describe())
train.loc[train['logerror'].abs() < 0.75, 'logerror'].hist(bins=40)

The distribution of the log error looks fairly normal. There appear to be some outliers around 0.6.

In [None]:
# Define the clusterer: three clusters, with a fixed seed so the centroid
# initialisation (and therefore the cluster labels) is reproducible across re-runs.
# KMeans comes from sklearn.cluster, imported in the notebook's import cell.
kmeans = KMeans(n_clusters=3, random_state=42)

# NOTE(review): `X_scaled` is not defined in any cell visible above. Presumably it
# should be a scaled feature frame (e.g. derived from scale_data's output) --
# define it explicitly before this cell so Restart & Run All succeeds.
# Fit the cluster centroids on the scaled features.
kmeans.fit(X_scaled)

# Assign each row to its nearest centroid; as the cell's last expression the
# label array is displayed.
kmeans.predict(X_scaled)