https://towardsdatascience.com/linear-regression-using-python-b136c91bf0a2

https://towardsdatascience.com/linear-regression-on-boston-housing-dataset-f409b7e4a155

# EDA and Linear Regression on Allegheny County Housing Data

## Import packages

In [1]:
# data and math packages
import numpy as np
import pandas as pd
from scipy import stats
import math 
import random
import researchpy as rp

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

# plot style
from jupyterthemes import jtplot
jtplot.style('monokai')

## Data Prep / Pipeline

Developed in the `Statistics in Data Science - A Review` notebook.

In [None]:
# Data prep pipeline: load the raw assessment file and reduce it to valid,
# post-2008 sales of parcels that have a building, with complete feature rows.

# read in the raw assessment data
ach_df = pd.read_csv('data/AlleghenyHousing/assessments.csv')

# convert the ASOFDATE column to datetime
ach_df['ASOFDATE'] = pd.to_datetime(ach_df['ASOFDATE'])

# keep only the columns that are relevant to this analysis
keep_cols = [
            # parcel ID
            'PARID',

            # most recent sale information
            'SALEDATE',
            'SALEPRICE',
            'SALECODE',
            'SALEDESC',

            # appraisal and market values
            'FAIRMARKETLAND',
            'FAIRMARKETBUILDING',
            'FAIRMARKETTOTAL',
            'COUNTYLAND',
            'COUNTYBUILDING',
            'COUNTYTOTAL',
            'LOCALLAND',
            'LOCALBUILDING',
            'LOCALTOTAL',

            # location information and etc.
            'PROPERTYZIP',
            'MUNICODE',
            'SCHOOLCODE',
            'NEIGHCODE',
            'OWNERCODE',
            'CLASS',
            'USECODE',

            # parcel / building features
            'LOTAREA',
            'STYLE',
            'STORIES',
            'YEARBLT',
            'EXTERIORFINISH',
            'ROOF',
            'BASMENT',
            'TOTALROOMS',
            'FULLBATHS',
            'HALFBATHS',
            'HEATING',
            'COOLING',
            'FIREPLACES',
            'BSMNTGARAGE',
            'FINISHEDLIVINGAREA']
ach_dfr = ach_df.filter(keep_cols, axis=1)

# filter out properties that don't have a building on them
ach_dfr = ach_dfr[ach_dfr['FAIRMARKETBUILDING'] > 0]

# keep only valid sales.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same rows in
# the same order (all "VALID SALE" rows first, then all "OTHER VALID" rows).
ach_dfr = pd.concat([
    ach_dfr[ach_dfr['SALEDESC'] == "VALID SALE"],
    ach_dfr[ach_dfr['SALEDESC'] == "OTHER VALID"],
])

# dtype of every remaining column, and the subset stored as object dtype
ach_types = pd.DataFrame(ach_dfr.dtypes)
ach_types_r = ach_types[ach_types[0] == "O"]

# drop columns that are not used as features; this happens BEFORE dropna() so
# that missing values in these columns do not cause rows to be discarded
drop_cols = ['USECODE', 'CLASS', 'OWNERCODE', 'COUNTYLAND',
            'COUNTYTOTAL', 'COUNTYBUILDING', 'LOCALLAND',
            'LOCALBUILDING', 'LOCALTOTAL']
ach_dfr = ach_dfr.drop(drop_cols, axis=1)
ach_dfr = ach_dfr.drop('FIREPLACES', axis=1)

# keep rows with a non-negative finished living area (NaN rows fail the test too)
ach_dfr = ach_dfr.loc[ach_dfr['FINISHEDLIVINGAREA'] >= 0]

# a missing half-bath count is treated as zero half baths
ach_dfr = ach_dfr.assign(HALFBATHS=ach_dfr['HALFBATHS'].fillna(0))

# discard rows that still have a missing value in any remaining column
ach_dfr = ach_dfr.dropna()

# identifier / sale-validity columns are no longer needed once filtering is done
ach_dfr = ach_dfr.drop(['PARID', 'SALECODE', 'SALEDESC'], axis=1)

# drop the assessed market values
ach_dfr = ach_dfr.drop(['FAIRMARKETLAND', 'FAIRMARKETBUILDING', 'FAIRMARKETTOTAL'], axis=1)

# keep only sales made after 1 January 2008
ach_dfr['SALEDATE'] = pd.to_datetime(ach_dfr['SALEDATE'])
ach_dfr = ach_dfr.loc[ach_dfr['SALEDATE'] > pd.to_datetime('2008-01-01')]

# later cells address rows by position through .loc, so use a clean 0..n-1 index
ach_dfr = ach_dfr.reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# convert ZIP codes to numeric, downcasting to the smallest integer dtype where possible
ach_dfr['PROPERTYZIP'] = pd.to_numeric(ach_dfr['PROPERTYZIP'], downcast='integer')

In [None]:
#ach_dfr

## Define Parameters

In [None]:
# convenience aliases: the full frame, the response column, and every column
# name (note that the column list still includes SALEPRICE itself)
data = ach_dfr
target = data['SALEPRICE']
feature_names = data.columns.tolist()

In [None]:
# display the list of column names
feature_names

In [None]:
# summary statistics for the numeric columns; note that some of these are
# meaningless for code-like columns such as PROPERTYZIP

ach_dfr.describe()

## Exploratory Data Analysis

In [None]:
# distribution of sale prices across 50 bins
ach_dfr['SALEPRICE'].hist(bins=50, figsize=(15,15))

### New Value: median value by location

In [None]:
# distinct ZIP codes present in the prepared data
ach_dfr['PROPERTYZIP'].unique()

In [None]:
# distinct MUNICODE values present in the prepared data
ach_dfr['MUNICODE'].unique()

In [None]:
# distinct SCHOOLCODE values present in the prepared data
ach_dfr['SCHOOLCODE'].unique()

In [None]:
def medianizer(df_col_name, new_col_name, df=None):
    """Add a column holding the median SALEPRICE of each row's group.

    Rows are grouped by the values in ``df_col_name``; every row receives the
    median ``SALEPRICE`` of the group it belongs to.

    Parameters
    ----------
    df_col_name : str
        Name of the column to group by (e.g. ``'PROPERTYZIP'``).
    new_col_name : str
        Name of the column to create (or overwrite) with the group medians.
    df : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``ach_dfr``,
        which is what the original two-argument call operated on.

    Returns
    -------
    None
        The frame is modified in place and the per-group medians are printed.

    Raises
    ------
    KeyError
        If ``df_col_name`` or ``'SALEPRICE'`` is not a column of the frame.
    """
    if df is None:
        df = ach_dfr

    # One median per distinct group value, computed in a single vectorized pass.
    # The previous row-by-row version iterated over column *values* and then
    # used them as row labels, so it assigned the wrong medians (or failed and
    # silently left the column at 0); it also relied on the removed `np.int`.
    medians = df.groupby(df_col_name)['SALEPRICE'].median()
    print(medians)

    # Look up each row's group median; the result is numeric, not string.
    df[new_col_name] = df[df_col_name].map(medians)

In [None]:
# build one median-price column per location grouping (ZIP, school, municipality)
for group_col, median_col in [
    ('PROPERTYZIP', 'MEDIANPRICEBYZIP'),
    ('SCHOOLCODE', 'MEDIANPRICEBYSCHOOL'),
    ('MUNICODE', 'MEDIANPRICEBYMUNI'),
]:
    medianizer(group_col, median_col)

In [None]:
# make sure the three median-price columns are numeric, downcast to the
# smallest float dtype where possible
for median_col in ('MEDIANPRICEBYZIP', 'MEDIANPRICEBYSCHOOL', 'MEDIANPRICEBYMUNI'):
    ach_dfr[median_col] = pd.to_numeric(ach_dfr[median_col], downcast='float')

In [None]:
# work on a copy so that changes made below do not alter ach_dfr
ach_df__ = ach_dfr.copy()

In [None]:
# one 30-bin histogram per column
ach_df__.hist(bins=30, figsize=(15,15))

### Visualize Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns.
# The frame still holds the datetime SALEDATE column (and possibly other
# non-numeric columns). Older pandas dropped those silently inside corr(), but
# pandas >= 2.0 does not, so select the numeric columns explicitly.
ach_df__.select_dtypes(include='number').corr()

In [None]:
# remove the school-level median column from the working copy
ach_df__ = ach_df__.drop(columns=['MEDIANPRICEBYSCHOOL'])

In [None]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()

# Compute the correlation matrix once, over the numeric columns only, and take
# the axis labels from the matrix itself. Labelling with every column of the
# frame misaligns the ticks whenever a non-numeric column (e.g. the datetime
# SALEDATE) is left out of the matrix.
corr_matrix = ach_df__.select_dtypes(include='number').corr()
corr_labels = list(corr_matrix.columns)

trace = go.Heatmap(z=corr_matrix.values,
                   x=corr_labels,
                   y=corr_labels,
                   colorscale='Viridis')

# The trace list is passed directly rather than through a variable named
# `data`, which would overwrite the DataFrame alias defined earlier.
fig = go.Figure(data=[trace], layout=go.Layout(width=800, height=800))

py.iplot(fig)

### Observing Linearity

Take the features with high correlation and plot them against `SALEPRICE` and `MEDIANPRICEBY...`.

In [None]:
# column names of the working copy
list(ach_df__)

In [None]:
# columns to plot against sale price; the following cells plot later slices of
# this list
features = ['SALEDATE',
             'PROPERTYZIP',
             'MUNICODE',
             'SCHOOLCODE',
             'LOTAREA',
             'STORIES',
             'YEARBLT',
             'EXTERIORFINISH',
             'ROOF',
             'TOTALROOMS',
             'FULLBATHS',
             'HALFBATHS',
             'FINISHEDLIVINGAREA']

target = ach_dfr['SALEPRICE']

# one scatter panel per feature for the first four features
panel_cols = features[0:4]
fig, axes = plt.subplots(1, len(panel_cols), figsize=(20, 5), squeeze=False)
for ax, col in zip(axes[0], panel_cols):
    try:
        ax.scatter(ach_dfr[col], target, marker='o')
    except Exception as err:
        # keep plotting the remaining panels, but say which one failed and why
        # instead of swallowing the error silently
        print('Could not plot {}: {}'.format(col, err))
        continue
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('PRICE')

In [None]:
# one scatter panel per feature for features 4-7; the panel count is taken from
# the slice that is actually plotted (it used to be taken from features[0:4],
# which only matched by coincidence)
panel_cols = features[4:8]
fig, axes = plt.subplots(1, len(panel_cols), figsize=(20, 5), squeeze=False)
for ax, col in zip(axes[0], panel_cols):
    try:
        ax.scatter(ach_dfr[col], target, marker='o')
    except Exception as err:
        # keep plotting the remaining panels, but say which one failed and why
        # instead of swallowing the error silently
        print('Could not plot {}: {}'.format(col, err))
        continue
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('PRICE')
        

In [None]:
# one scatter panel per feature for the remaining features (8-12)
panel_cols = features[8:13]
fig, axes = plt.subplots(1, len(panel_cols), figsize=(20, 5), squeeze=False)
for ax, col in zip(axes[0], panel_cols):
    try:
        ax.scatter(ach_dfr[col], target, marker='o')
    except Exception as err:
        # keep plotting the remaining panels, but say which one failed and why
        # instead of swallowing the error silently
        print('Could not plot {}: {}'.format(col, err))
        continue
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel('PRICE')

In [None]:
# dtype of the response column; `dtype` is an attribute, not a method, so it
# must not be called (calling `.dtypes()` raises TypeError)
ach_dfr['SALEPRICE'].dtype

### Linear Regression

None of the scatter plots above shows a visually convincing linear relationship with sale price, so there is little reason to expect Linear Regression to be a good model. Let's move forward anyway.

In [None]:
# predictors: location codes plus parcel / building attributes
X = ach_df__.loc[:, [
    'PROPERTYZIP',
    'MUNICODE',
    'SCHOOLCODE',
    'LOTAREA',
    'STORIES',
    'YEARBLT',
    'EXTERIORFINISH',
    'ROOF',
    'TOTALROOMS',
    'FULLBATHS',
    'HALFBATHS',
    'FINISHEDLIVINGAREA',
]]

# response: the sale price
Y = ach_df__.loc[:, 'SALEPRICE']

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# report the shape of each piece of the split
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# fit a linear regression with default settings on the training split
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

In [None]:
import sklearn as sk


def report_performance(header, y_true, y_pred):
    """Print RMSE and R2 for one data split under `header`; return (rmse, r2)."""
    split_rmse = (np.sqrt(mean_squared_error(y_true, y_pred)))
    split_r2 = r2_score(y_true, y_pred)

    print(header)
    print("--------------------------------------")
    print('RMSE is {}'.format(split_rmse))
    print('R2 score is {}'.format(split_r2))
    return split_rmse, split_r2


# model evaluation for training set
y_train_predict = lin_model.predict(X_train)
rmse, r2 = report_performance("The model performance for training set",
                              Y_train, y_train_predict)
print("\n")

# model evaluation for testing set
y_test_predict = lin_model.predict(X_test)
rmse, r2 = report_performance("The model performance for testing set",
                              Y_test, y_test_predict)