In [1]:
# imports

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import statsmodels.api as sm

from scipy.stats import pearsonr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [2]:
full_df1 = pd.read_csv('properties_2016.csv')
full_df2 = pd.read_csv('train_2016_v2.csv')

In [3]:
print(f'Shape: {full_df1.shape}')
print(f'Unique: {full_df1.parcelid.nunique()}')

Shape: (2985217, 58)
Unique: 2985217


In [4]:
full_df1.columns

Index(['parcelid', 'airconditioningtypeid', 'architecturalstyletypeid',
       'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fireplacecnt',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardbuildin

We're looking for things in Zillow's model that are causing problems

* What if the null rows are a factor of why the error is what it is
* Polynomial??
* Zipcodes - bin/group them?

Duplicates
* wtf do we do with them
* Newer homes have lower error - so take the most recent date parcelid

Hypotheses:
* Null values are negativly affecting logerror?
* Does the year built affect the tax assessed value?
* Further distance from mean of taxassessment increases logerror?
* Flipped homes causing variance?


Feature Engineering
* Tax Assessment per sqft
* Actual room count
* Tax Assessment per lotsize

In [5]:
df4 = full_df1.merge(full_df2, on='parcelid',how='inner')
df4.shape

(90275, 60)

In [6]:
df4.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,17073783,,,,2.5,3.0,,,2.5,,...,115087.0,191811.0,2015.0,76724.0,2015.06,,,61110020000000.0,0.0953,2016-01-27
1,17088994,,,,1.0,2.0,,,1.0,,...,143809.0,239679.0,2015.0,95870.0,2581.3,,,61110020000000.0,0.0198,2016-03-30
2,17100444,,,,2.0,3.0,,,2.0,,...,33619.0,47853.0,2015.0,14234.0,591.64,,,61110010000000.0,0.006,2016-05-27
3,17102429,,,,1.5,2.0,,,1.5,,...,45609.0,62914.0,2015.0,17305.0,682.78,,,61110010000000.0,-0.0566,2016-06-07
4,17109604,,,,2.5,4.0,,,2.5,,...,277000.0,554000.0,2015.0,277000.0,5886.92,,,61110010000000.0,0.0573,2016-08-08


In [7]:
df = df4[['logerror', 'bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet',\
          'lotsizesquarefeet', 'yearbuilt', 'taxvaluedollarcnt', 'regionidzip', 'landtaxvaluedollarcnt']]

In [8]:
df.head()

Unnamed: 0,logerror,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,lotsizesquarefeet,yearbuilt,taxvaluedollarcnt,regionidzip,landtaxvaluedollarcnt
0,0.0953,2.5,3.0,1264.0,1735.0,1986.0,191811.0,97081.0,76724.0
1,0.0198,1.0,2.0,777.0,,1990.0,239679.0,97083.0,95870.0
2,0.006,2.0,3.0,1101.0,6569.0,1956.0,47853.0,97113.0,14234.0
3,-0.0566,1.5,2.0,1554.0,7400.0,1965.0,62914.0,97113.0,17305.0
4,0.0573,2.5,4.0,2415.0,6326.0,1984.0,554000.0,97084.0,277000.0


In [9]:
df.isnull().sum()

logerror                            0
bathroomcnt                         0
bedroomcnt                          0
calculatedfinishedsquarefeet      661
lotsizesquarefeet               10150
yearbuilt                         756
taxvaluedollarcnt                   1
regionidzip                        35
landtaxvaluedollarcnt               1
dtype: int64

In [10]:

def summarize_df(df):
    print("Summary of Data\n")
    print("Rows & Columns:\n")
    print(df.shape)
    print("Variables:\n")
    var_names = df.columns.values
    print(var_names)
    print("\nColumn Info:\n")
    print(df.info())
    print("\nNumeric Summary Stats:\n")
    print(df.describe())
    print("\nTop 5 Values:\n")
    for var in var_names:
        print(var+":")
        print(df[var].value_counts().head())
        print('\n')
    print("\nMissing Values:\n")
    null_counts = df.isnull().sum()
    if len(null_counts[null_counts > 0]) == 0:
        print("No missing values")
    else:
        print(null_counts[null_counts > 0])
    print("\nFirst 5 rows:\n")
    print(df.head())
    print("\nEnd of Summary")

In [11]:
def percent_NaNs(df):
    cols = list(df.columns)
    Nans = {}
    for col in cols:
        Nans[col] = 'Missing {:0.2f}%'.format((df[col].isnull().sum()/df.shape[0]*100))
    return Nans

In [12]:
percent_NaNs(df)

{'logerror': 'Missing 0.00%',
 'bathroomcnt': 'Missing 0.00%',
 'bedroomcnt': 'Missing 0.00%',
 'calculatedfinishedsquarefeet': 'Missing 0.73%',
 'lotsizesquarefeet': 'Missing 11.24%',
 'yearbuilt': 'Missing 0.84%',
 'taxvaluedollarcnt': 'Missing 0.00%',
 'regionidzip': 'Missing 0.04%',
 'landtaxvaluedollarcnt': 'Missing 0.00%'}

In [13]:
df.describe()

Unnamed: 0,logerror,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,lotsizesquarefeet,yearbuilt,taxvaluedollarcnt,regionidzip,landtaxvaluedollarcnt
count,90275.0,90275.0,90275.0,89614.0,80125.0,89519.0,90274.0,90240.0,90274.0
mean,0.011457,2.279474,3.031869,1773.185987,29110.16,1968.53287,457672.6,96586.131184,278335.3
std,0.161079,1.004271,1.156436,928.162393,121721.3,23.763475,554884.4,3661.339094,400495.5
min,-4.605,0.0,0.0,2.0,167.0,1885.0,22.0,95982.0,22.0
25%,-0.0253,2.0,2.0,1184.0,5703.0,1953.0,199023.2,96193.0,82228.0
50%,0.006,2.0,3.0,1540.0,7200.0,1970.0,342872.0,96393.0,192970.0
75%,0.0392,3.0,4.0,2095.0,11686.0,1987.0,540589.0,96987.0,345419.5
max,4.737,20.0,16.0,22741.0,6971010.0,2015.0,27750000.0,399675.0,24500000.0


In [14]:
def replace_values(df,entry,orginal_value,new):
   df[entry] = df[entry].replace(orginal_value,new)
   return df
df = replace_values(df,'lotsizesquarefeet',np.nan, df['lotsizesquarefeet'].mean())

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90275 entries, 0 to 90274
Data columns (total 9 columns):
logerror                        90275 non-null float64
bathroomcnt                     90275 non-null float64
bedroomcnt                      90275 non-null float64
calculatedfinishedsquarefeet    89614 non-null float64
lotsizesquarefeet               90275 non-null float64
yearbuilt                       89519 non-null float64
taxvaluedollarcnt               90274 non-null float64
regionidzip                     90240 non-null float64
landtaxvaluedollarcnt           90274 non-null float64
dtypes: float64(9)
memory usage: 6.9 MB


In [16]:
df = df.dropna()

In [17]:
df.isnull().sum()

logerror                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
lotsizesquarefeet               0
yearbuilt                       0
taxvaluedollarcnt               0
regionidzip                     0
landtaxvaluedollarcnt           0
dtype: int64

In [18]:
df.shape

(89489, 9)

In [19]:
new_df = df.copy()

for col in df:
    q1 = df[col].quantile(.01)
    q3 = df[col].quantile(.99)
    new_df = new_df[(new_df[col] >= q1) & (new_df[col] <= q3)]

new_df.shape

(78769, 9)

In [20]:
new_df.describe()

Unnamed: 0,logerror,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,lotsizesquarefeet,yearbuilt,taxvaluedollarcnt,regionidzip,landtaxvaluedollarcnt
count,78769.0,78769.0,78769.0,78769.0,78769.0,78769.0,78769.0,78769.0,78769.0
mean,0.009021,2.263784,3.043266,1716.002577,21752.364094,1969.181696,419582.8,96551.262387,253545.6
std,0.079732,0.819872,0.962351,733.093564,43431.010237,22.280036,316984.6,404.248857,232048.0
min,-0.3383,1.0,1.0,619.0,1460.0,1911.0,38188.0,95986.0,13027.0
25%,-0.0243,2.0,2.0,1198.0,6000.0,1954.0,209018.0,96210.0,87933.0
50%,0.006,2.0,3.0,1539.0,7558.0,1970.0,348604.0,96403.0,197576.0
75%,0.0363,3.0,4.0,2054.0,20256.0,1986.0,534863.0,96987.0,342917.0
max,0.4587,5.5,6.0,5095.0,406305.0,2011.0,2463199.0,97329.0,1684667.0


In [21]:
log_upper = new_df[(new_df['logerror'] >= .0363)]
log_lower = new_df[(new_df['logerror'] <= -.0243)]

In [22]:
outer_log = new_df[(new_df['logerror'] >= .0363) | (new_df['logerror'] <= -.0243)]

In [37]:
#outer_looks = full_log.describe()
#full_looks = new_df.describe()

In [38]:
#full_log.shape

In [39]:
df7745 = pd.DataFrame()

In [46]:
d = {'col1': outer_looks.loc['mean'], 'col2': full_looks.loc['mean']}
df42 = pd.DataFrame(data=d)

NameError: name 'outer_looks' is not defined

In [None]:
df42

30 minutes gone

In [None]:
new_df['tax_sqft'] = new_df.taxvaluedollarcnt / new_df.calculatedfinishedsquarefeet

In [None]:
new_df.describe()

In [None]:
new_df.shape

In [None]:
new_df['room_count'] = new_df.bathroomcnt + new_df.bedroomcnt

In [None]:
new_df['lot_sqft'] = new_df.taxvaluedollarcnt / new_df.lotsizesquarefeet

In [None]:
new_df['abs_log'] = new_df.logerror.abs()

In [None]:
plot_count = 0

In [None]:
plt.figure(figsize=(25,50))

for i, col in enumerate(new_df):
    plot_number = i + 1
    series = new_df[col]
    plt.subplot(7,2,plot_number)
    plt.title(col)
    sns.distplot(series)

In [None]:
sns.pairplot(new_df)

In [47]:
def df_print_r_and_p_values(X, y):
    r_and_p_values = {col: stats.pearsonr(X[col], y) for col in X.columns}
    print("PEARSON'S R")
    for k, v in r_and_p_values.items():
        col = k
        r, p = v
        print(f"{col}:")
        print(
            f"\tPearson's R is {r:.2f} with a significance p-value of {p: .8}\n"
        )


def linreg_fit_and_predict(x_train, y_train, x_test, y_test):
    lm = LinearRegression()
    lm.fit(x_train, y_train)

    y_label = y_train.columns[0]
    y_intercept = lm.intercept_[0]
    m = lm.coef_[0][0]
    x_label = x_train.columns[0]
    print(f"Univariate: {y_label} = {y_intercept:.2f} + {m:.8}*{x_label}")
    print()

    preds_train = lm.predict(x_train)

    # run test data through model
    preds_test = lm.predict(x_test)

    return lm, preds_train, preds_test


def evaluate_model_train(x, y, preds):
    y_label = y.columns[0]
    x_label = x.columns[0]

    print("Model Evaluation on TRAIN Data")
    meanse = mean_squared_error(y, preds)
    print(f"\tMSE: {meanse:.8f}")

    medianae = median_absolute_error(y, preds)
    print(f"\tMAE: {medianae:.8f}")

    r2 = r2_score(y, preds)
    print(
        f"\t{r2:.2%} of the variance in {y_label} can be explained by {x_label}."
    )
    print()

    print("P-VALUE")
    f_vals, p_vals = f_regression(x, y)
    print(f"\tTrain: {p_vals[0]:.8}")
    print()


def evaluate_model_test(x, y, preds):
    y_label = y.columns[0]
    x_label = x.columns[0]

    print("Model Evaluation on TEST Data")
    meanse = mean_squared_error(y, preds)
    print(f"\tMSE: {meanse:.8f}")

    medianae = median_absolute_error(y, preds)
    print(f"\tMAE: {medianae:.8f}")

    r2 = r2_score(y, preds)
    print(
        f"\t{r2:.2%} of the variance in {y_label} can be explained by {x_label}."
    )
    print()

    print("P-VALUE")
    f_vals, p_vals = f_regression(x, y)
    print(f"\tTest: {p_vals[0]:.8}")
    print()


def plot_residuals(y_test, preds_test):
    y_label = y_test.columns[0]
    plt.scatter(preds_test, preds_test - y_test, c="g", s=20)
    plt.hlines(y=0, xmin=preds_test.min(), xmax=preds_test.max())
    plt.title("Residual plot")
    plt.ylabel("Residuals")
    plt.xlabel(y_label)
    plt.show()


def linreg_model(x_train, y_train, x_test, y_test):
    lm, preds_train, preds_test = linreg_fit_and_predict(
        x_train, y_train, x_test, y_test
    )

    evaluate_model_train(x_train, y_train, preds_train)
    evaluate_model_test(x_test, y_test, preds_test)

    plot_residuals(y_test, preds_test)
    

def evaluate_multi_model_train(X, y, preds):
    y_label = y.columns[0]
    X_labels = X.columns

    print("Model Evaluation on TRAIN Data")
    meanse = mean_squared_error(y, preds)
    print(f"\tMSE: {meanse:.8f}")

    medianae = median_absolute_error(y, preds)
    print(f"\tMAE: {medianae:.8f}")

    r2 = r2_score(y, preds)
    print(
        f"\t{r2:.2%} of the variance in {y_label} can be explained by {X_labels}."
    )
    print()

    print("P-VALUE")
    f_vals, p_vals = f_regression(X, y)
    print(f"\tTrain: {p_vals[0]:.8}")
    print()
    
def evaluate_multi_model_test(X, y, preds):
    y_label = y.columns[0]
    X_labels = X.columns

    print("Model Evaluation on TEST Data")
    meanse = mean_squared_error(y, preds)
    print(f"\tMSE: {meanse:.8f}")

    medianae = median_absolute_error(y, preds)
    print(f"\tMAE: {medianae:.8f}")

    r2 = r2_score(y, preds)
    print(
        f"\t{r2:.2%} of the variance in {y_label} can be explained by {X_labels}."
    )
    print()

    print("P-VALUE")
    f_vals, p_vals = f_regression(X, y)
    print(f"\tTest: {p_vals[0]:.8}")
    print()
    
def multi_linreg_fit_and_evaluate(X_train, y_train, X_test, y_test):
    lm = LinearRegression()
    lm.fit(X_train, y_train)

    y_label = y_train.columns[0]
    y_intercept = lm.intercept_[0]
    print("Multivariate:")
    print(f"{y_label} = ")
    print(f"{y_intercept:.8f}")
    for i, col in enumerate(X_train.columns):
        coefficient = lm.coef_[0][i]
        print(f"+ {coefficient:.8}*{col}")

    preds_train = lm.predict(X_train)
    evaluate_multi_model_train(X_train, y_train, preds_train)
    
    preds_test = lm.predict(X_test)
    evaluate_model_test(X_test, y_test, preds_test)
    
    plot_residuals(y_test, preds_test)


def normalize_cols(df_train, df_test, cols):
    df_train_norm = pd.DataFrame()
    for col in cols:
        minimum = df_train[col].min()
        maximum = df_train[col].max()
        df_train_norm[f"{col}_norm"] = (df_train[col] - minimum) / (maximum - minimum)
    
    df_test_norm = pd.DataFrame()
    for col in cols:
        minimum = df_train[col].min()  # use the min and max from the train set
        maximum = df_train[col].max()
        df_test_norm[f"{col}_norm"] = (df_test[col] - minimum) / (maximum - minimum)
    return df_train_norm, df_test_norm

In [48]:
#X = new_df.drop(['logerror', 'abs_log'], axis=1)
print(X.head())

#y = new_df[['abs_log']]
print(y.head())

NameError: name 'X' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.60, random_state=42)

In [None]:
from sklearn.feature_selection import f_regression

In [None]:
multi_linreg_fit_and_evaluate(X_train, y_train, X_test, y_test)

In [None]:
linreg_model(X_train, y_train, X_test, y_test)

In [None]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
# XX, yy = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="sigmoid")
selector = RFE(estimator, 5, step=1)
selector = selector.fit(X_train, y_train)
print(f'support {selector.support_}')
print(f'ranking {selector.ranking_}')
print(f'X_cols {X_train.columns}')
# print(f'yy {}')
