In [2]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

from scipy import stats
from scipy.stats import norm
import seaborn as sns

In [3]:
# Import data
df = pd.read_csv(Path('../resources/regressiondata.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,zpid,zipcode,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,...,homeType_Apartment,homeType_Condo,homeType_Mobile / Manufactured,homeType_MultiFamily,homeType_Multiple Occupancy,homeType_Other,homeType_Residential,homeType_Single Family,homeType_Townhouse,homeType_Vacant Land
0,0,111373431,78660,30.430632,-97.663078,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,120900430,78660,30.432673,-97.661697,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,2,2084491383,78660,30.409748,-97.639771,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,3,120901374,78660,30.432112,-97.661659,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,4,60134862,78660,30.437368,-97.65686,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
df.columns

Index(['Unnamed: 0', 'zpid', 'zipcode', 'latitude', 'longitude',
       'propertyTaxRate', 'garageSpaces', 'hasAssociation', 'hasCooling',
       'hasGarage', 'hasHeating', 'hasSpa', 'hasView', 'parkingSpaces',
       'yearBuilt', 'latestPrice', 'numPriceChanges', 'latest_saledate',
       'latest_salemonth', 'latest_saleyear', 'numOfPhotos',
       'numOfAccessibilityFeatures', 'numOfAppliances', 'numOfParkingFeatures',
       'numOfPatioAndPorchFeatures', 'numOfSecurityFeatures',
       'numOfWaterfrontFeatures', 'numOfWindowFeatures',
       'numOfCommunityFeatures', 'lotSizeSqFt', 'livingAreaSqFt',
       'numOfPrimarySchools', 'numOfElementarySchools', 'numOfMiddleSchools',
       'numOfHighSchools', 'avgSchoolDistance', 'avgSchoolRating',
       'avgSchoolSize', 'MedianStudentsPerTeacher', 'numOfBathrooms',
       'numOfBedrooms', 'numOfStories', 'city_austin', 'city_del valle',
       'city_driftwood', 'city_dripping springs', 'city_manchaca',
       'city_manor', 'city_pflugerv

In [5]:
df.dtypes

Unnamed: 0                  int64
zpid                        int64
zipcode                     int64
latitude                  float64
longitude                 float64
                           ...   
homeType_Other              int64
homeType_Residential        int64
homeType_Single Family      int64
homeType_Townhouse          int64
homeType_Vacant Land        int64
Length: 61, dtype: object

In [6]:
df.isnull().sum()

Unnamed: 0                0
zpid                      0
zipcode                   0
latitude                  0
longitude                 0
                         ..
homeType_Other            0
homeType_Residential      0
homeType_Single Family    0
homeType_Townhouse        0
homeType_Vacant Land      0
Length: 61, dtype: int64

In [7]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column
df.drop("zpid", axis = 1, inplace = True)
df.drop("latest_saledate", axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (15171, 59) 


In [8]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [9]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(11378, 58)

In [10]:
# Instantiate linear regression variable
model = LinearRegression()
# Use data to learn patterns (fitting or training)
model.fit(X_train, y_train)

LinearRegression()

In [11]:
# Create prediction
y_pred = model.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,211342.5,439900.0
1,656930.1,560000.0
2,476766.7,450000.0
3,293581.3,375000.0
4,1225553.0,2100000.0
5,598687.9,775000.0
6,461441.2,433000.0
7,324443.7,279500.0
8,836632.4,575000.0
9,1817842.0,2199000.0


In [12]:
for x in y_pred:
    if x < 0:
        print(x)

-154005.39793747663
-73725.76499450207
-50537.90175318718
-6081.769283175468
-2927.9521678090096
-84201.72525948286
-1252.3126467466354
-11727.888878285885
-2307.376609325409
-51153.94835329056
-24457.027043044567
-3785.255644738674
-75050.62338787317


In [13]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.7935680146214116 0.793513575806702


In [15]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

103243465630.90657
321315.2122618949


ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [None]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

163365.72787120758
