In [18]:
# Initial imports
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Import data
df = pd.read_csv(Path('../resources/regressiondata.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,zpid,zipcode,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,...,homeType_Apartment,homeType_Condo,homeType_Mobile / Manufactured,homeType_MultiFamily,homeType_Multiple Occupancy,homeType_Other,homeType_Residential,homeType_Single Family,homeType_Townhouse,homeType_Vacant Land
0,0,111373431,78660,30.430632,-97.663078,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,120900430,78660,30.432673,-97.661697,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2,2,2084491383,78660,30.409748,-97.639771,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,3,120901374,78660,30.432112,-97.661659,1.98,2,1,1,1,...,0,0,0,0,0,0,0,1,0,0
4,4,60134862,78660,30.437368,-97.65686,1.98,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
df.columns

Index(['Unnamed: 0', 'zpid', 'zipcode', 'latitude', 'longitude',
       'propertyTaxRate', 'garageSpaces', 'hasAssociation', 'hasCooling',
       'hasGarage', 'hasHeating', 'hasSpa', 'hasView', 'parkingSpaces',
       'yearBuilt', 'latestPrice', 'numPriceChanges', 'latest_saledate',
       'latest_salemonth', 'latest_saleyear', 'numOfPhotos',
       'numOfAccessibilityFeatures', 'numOfAppliances', 'numOfParkingFeatures',
       'numOfPatioAndPorchFeatures', 'numOfSecurityFeatures',
       'numOfWaterfrontFeatures', 'numOfWindowFeatures',
       'numOfCommunityFeatures', 'lotSizeSqFt', 'livingAreaSqFt',
       'numOfPrimarySchools', 'numOfElementarySchools', 'numOfMiddleSchools',
       'numOfHighSchools', 'avgSchoolDistance', 'avgSchoolRating',
       'avgSchoolSize', 'MedianStudentsPerTeacher', 'numOfBathrooms',
       'numOfBedrooms', 'numOfStories', 'city_austin', 'city_del valle',
       'city_driftwood', 'city_dripping springs', 'city_manchaca',
       'city_manor', 'city_pflugerv

In [5]:
df.dtypes

Unnamed: 0                  int64
zpid                        int64
zipcode                     int64
latitude                  float64
longitude                 float64
                           ...   
homeType_Other              int64
homeType_Residential        int64
homeType_Single Family      int64
homeType_Townhouse          int64
homeType_Vacant Land        int64
Length: 61, dtype: object

In [6]:
df.isnull().sum()

Unnamed: 0                0
zpid                      0
zipcode                   0
latitude                  0
longitude                 0
                         ..
homeType_Other            0
homeType_Residential      0
homeType_Single Family    0
homeType_Townhouse        0
homeType_Vacant Land      0
Length: 61, dtype: int64

In [7]:
# Save 'zpid'
id_df = df['zpid']

# Drop 'zpid' column
df.drop("zpid", axis = 1, inplace = True)
df.drop("latest_saledate", axis = 1, inplace = True)

# Check data size after dropping the 'Id' variable
print("\nData size: {} ".format(df.shape)) 


Data size: (15171, 59) 


In [8]:
# Create features and target
y = df["latestPrice"]
X = df.drop(columns=['latestPrice'])

In [9]:
# Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(11378, 58)

In [10]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [11]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [12]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Create a random forest classifier
rf_model = RandomForestRegressor(n_estimators=50, random_state=72)

In [20]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [25]:
# Making predictions using the testing data
y_pred = rf_model.predict(X_test_scaled)

In [26]:
# Calculate R squared and Adjusted R Square
import statsmodels.api as sm
result = sm.OLS(y_pred, y_test).fit()
print(result.rsquared, result.rsquared_adj)

0.8566906368781894 0.8566528443246235


In [27]:
# Calculate Mean Squared Error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_squared_log_error(y_test, y_pred))

71011397939.01585
266479.63888262806
0.07456459930019997


In [28]:
# Calculate Mean Absolute Error(MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

95794.09488004218
