<a href="https://colab.research.google.com/github/edwardmfho/Domain-API-Data-Exporter/blob/master/Property_Price_Estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# extract, export or import data
import json
import os
import re
from datetime import datetime

import requests

# data processing
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Parameters
export_file = True      # when True, the raw and cleaned frames are also written to CSV
CAR_SPACE_NULL = True


# Search Criteria
# current_datetime = datetime.now().isoformat('T')
postcode = '2000'
minBedrooms = 0
minBathrooms = 0
minCarspaces = 0
maxPrice = 1200

# Credential
# Read the secrets from the environment so real credentials never have to be
# typed into (and committed with) the notebook. The original placeholder
# strings remain the fallback when the variables are not set.
client_id = os.environ.get('DOMAIN_CLIENT_ID', 'CLIENT_ID')
client_secret = os.environ.get('DOMAIN_CLIENT_SECRET', 'CLIENT_SECRET')
scopes = ['api_listings_read']
headers = {
    'content-type': 'application/json',
}
auth_url = 'https://auth.domain.com.au/v1/connect/token'
url_endpoint = 'https://api.domain.com.au/v1/listings/residential/_search'


In [None]:
# Location filter: only the postcode is constrained, the other fields are sent empty.
_search_location = {
    "state": "",
    "region": "",
    "area": "",
    "suburb": "",
    "postCode": postcode,
}

# Search body built from the criteria defined in the parameters cell.
params = {
    "listingType": "Rent",
    "minBedrooms": minBedrooms,
    "minBathrooms": minBathrooms,
    "minCarspaces": minCarspaces,
    "maxPrice": maxPrice,
    "includeSurroundingSuburbs": True,
    "locations": [_search_location],
    "pageSize": 200,
}

In [None]:
# getPropertyData.py

class getPropertyData:
  ''' Extract, clean and transform listing data from Domain.com.au.

      The methods read module-level configuration defined elsewhere in the
      notebook: ``auth_url``, ``url_endpoint``, ``scopes`` and ``export_file``.

      Args:
        client_Id (str): client ID as provided by the Domain Development.
        client_secret (str): client secret as provided by the Domain Development.

      Methods:
        retrieveData: request a token, run the search and return the results.
        clean_data: drop unused columns and normalise price/description/rooms.
        multi_labelbinarizier: one-hot encode the list-valued features column.
        labelbinarizier: one-hot encode single-valued categorical columns.
  '''
  def __init__(self, client_Id, client_secret):
    # Credentials used for the client-credentials token request.
    self.client_Id = client_Id
    self.client_secret = client_secret

  def retrieveData(self, params):
    ''' Fetch listings matching ``params`` from the search endpoint.

        Args:
          params (dict): JSON body posted to ``url_endpoint``.

        Returns:
          tuple: ``(response, df)`` - the search ``requests.Response`` and the
          flattened JSON payload as a DataFrame. When ``export_file`` is true
          the frame is also written to ``data.csv``.

        Raises:
          requests.HTTPError: if the token request returns an error status.
    '''
    # The token endpoint takes one space-delimited scope string, not a list.
    scope = scopes if isinstance(scopes, str) else ' '.join(scopes)
    token_response = requests.post(auth_url, data={
        'client_id': self.client_Id,
        'client_secret': self.client_secret,
        'grant_type': 'client_credentials',
        'scope': scope,
        # NOTE(review): this is sent as a form field, not as an HTTP header;
        # kept unchanged because the token endpoint accepted it - confirm
        # whether it can be removed.
        'Content-Type': 'text/json',
    })
    # Surface the HTTP error rather than an opaque KeyError on 'access_token'.
    token_response.raise_for_status()
    access_token = token_response.json()['access_token']

    auth = {'Authorization': 'Bearer ' + access_token}
    response = requests.post(url_endpoint, headers=auth, json=params)
    print('Status Code: ', response.status_code)
    if response.status_code == 200:
      print('Request Succeeded.')
    else:
      # Previously printed the return value of print(), i.e. "Error:  None".
      print('Error: ', response.status_code)

    # json_normalize already returns a DataFrame with nested keys flattened
    # into dotted column names.
    df = pd.json_normalize(response.json())
    if export_file:
      df.to_csv('data.csv', index=False)
    return response, df

  def clean_data(self, df):
    ''' Clean the raw listings frame IN PLACE and return it.

        Callers may rely on ``df`` itself being modified; the return value is
        the same object.

        Args:
          df (pandas.DataFrame): frame produced by ``retrieveData``.

        Returns:
          pandas.DataFrame: the same, now cleaned, frame. When ``export_file``
          is true it is also written to ``cleaned_data.csv``.
    '''
    # drop un-needed column in price estimator
    rm_cols = ['type',
               'listing.listingType',
               'listing.id',
               'listing.advertiser.type',
               'listing.advertiser.id',
               'listing.advertiser.name',
               'listing.advertiser.logoUrl',
               'listing.advertiser.preferredColourHex',
               'listing.advertiser.bannerUrl',
               'listing.advertiser.contacts',
               'listing.media',
               'listing.propertyDetails.state',
               'listing.propertyDetails.allPropertyTypes',
               'listing.labels',
               'listing.inspectionSchedule.byAppointment',
               'listing.inspectionSchedule.recurring',
               'listing.inspectionSchedule.times',
               'listing.listingSlug',
               'listing.priceDetails.price',
               'listing.priceDetails.priceFrom',
               'listing.priceDetails.priceTo',
               'listing.propertyDetails.landArea',
               'listing.propertyDetails.buildingArea',
               ]
    # errors='ignore': a column missing from this result set must not abort
    # the whole clean-up.
    df.drop(columns=rm_cols, errors='ignore', inplace=True)

    # Remove non-number values from price. Missing values (non-strings) are
    # passed through untouched instead of crashing re.sub.
    # NOTE(review): a range such as "$1100 - $1250" collapses to "11001250";
    # the notebook filters such values out later by a price threshold.
    price_col = 'listing.priceDetails.displayPrice'
    df[price_col] = df[price_col].apply(
        lambda x: re.sub("[^0-9.]", "", x) if isinstance(x, str) else x)

    # Remove HTML tags (again skipping missing descriptions)
    desc_col = 'listing.summaryDescription'
    df[desc_col] = df[desc_col].apply(
        lambda x: BeautifulSoup(x, "lxml").text if isinstance(x, str) else x)

    # If it is a Studio, replace bedroom number to 1
    is_studio = df['listing.propertyDetails.propertyType'] == 'Studio'
    df.loc[is_studio, 'listing.propertyDetails.bedrooms'] = 1

    # If no information about car space, car space is set as 0.
    # Assign the result back: fillna(inplace=True) on a selected column is
    # chained assignment and does not update df under pandas Copy-on-Write.
    carspace_col = 'listing.propertyDetails.carspaces'
    df[carspace_col] = df[carspace_col].fillna(0)

    if export_file:
      df.to_csv('cleaned_data.csv', index=False)

    return df

  def multi_labelbinarizier(self, df):
    ''' One-hot encode the list-valued ``listing.propertyDetails.features``.

        Args:
          df (pandas.DataFrame): frame containing the features column.

        Returns:
          pandas.DataFrame: a new frame in which the features column is
          replaced by one 0/1 column per distinct feature label.
    '''
    # Create MultiLabelBinarizer object
    multi_one_hot = MultiLabelBinarizer()

    encode_col = 'listing.propertyDetails.features'

    def _as_labels(value):
      # The binarizer needs an iterable per row; a missing value becomes
      # "no labels" and a lone scalar becomes a one-label list.
      if isinstance(value, (list, tuple, set)):
        return value
      return [] if pd.isna(value) else [value]

    transformed_results = multi_one_hot.fit_transform(df[encode_col].apply(_as_labels))
    # Reuse df's index so concat pairs rows by label; a default RangeIndex
    # would misalign (and add NaN rows) whenever df's index is not 0..n-1.
    df_multilabel_data = pd.DataFrame(transformed_results,
                                      columns=multi_one_hot.classes_,
                                      index=df.index)
    new_df = pd.concat([df, df_multilabel_data], axis=1, sort=False)
    return new_df.drop(columns=encode_col)

  def labelbinarizier(self, df, encode_col=None):
    ''' One-hot encode single-valued categorical columns.

        Each value is cast to ``str`` and turned into a 0/1 indicator column
        named after the value; the source columns are removed. The input
        frame is not modified.

        Args:
          df (pandas.DataFrame): frame to encode.
          encode_col (list[str], optional): columns to encode. Defaults to the
            features/area/region/suburb/postcode property columns. Columns not
            present in ``df`` are skipped.

        Returns:
          pandas.DataFrame: a new frame with the encoded columns appended.
    '''
    if encode_col is None:
      encode_col = ['listing.propertyDetails.features',
                    'listing.propertyDetails.area',
                    'listing.propertyDetails.region',
                    'listing.propertyDetails.suburb',
                    'listing.propertyDetails.postcode']

    present = [col for col in encode_col if col in df.columns]

    # pandas.get_dummies replaces the never-imported LabelBinarizer: it keeps
    # df's index and yields one column per class for any number of classes.
    # All encodings are collected first so every column ends up in the
    # result, not only the last one processed.
    encoded = [pd.get_dummies(df[col].astype(str), dtype=int) for col in present]

    return pd.concat([df.drop(columns=present)] + encoded, axis=1, sort=False)


In [None]:
# Client object that holds the API credentials for the calls below.
domain = getPropertyData(client_Id=client_id, client_secret=client_secret)

In [None]:
# retrieveData returns (response, frame); only the flattened frame is used afterwards.
df = domain.retrieveData(params=params)[1]

Status Code:  200
Request Succeeded.


In [None]:
# clean_data modifies `df` in place and returns it; the returned frame is the cell output.
domain.clean_data(df=df)

Unnamed: 0,listing.priceDetails.displayPrice,listing.propertyDetails.features,listing.propertyDetails.propertyType,listing.propertyDetails.bathrooms,listing.propertyDetails.bedrooms,listing.propertyDetails.carspaces,listing.propertyDetails.unitNumber,listing.propertyDetails.streetNumber,listing.propertyDetails.street,listing.propertyDetails.area,listing.propertyDetails.region,listing.propertyDetails.suburb,listing.propertyDetails.postcode,listing.propertyDetails.displayableAddress,listing.propertyDetails.latitude,listing.propertyDetails.longitude,listing.headline,listing.summaryDescription,listing.hasFloorplan,listing.hasVideo
0,11001250,"[AirConditioning, BuiltInWardrobes, Ensuite, G...",ApartmentUnitFlat,2.0,2.0,2.0,5,180,Marine Parade,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"5/180 Marine Parade, Maroubra",-33.946030,151.256470,Absolute Beachfront Apartment. \nFurnished or ...,This beachfront apartment offers uninterrupted...,True,False
1,980,"[AirConditioning, BuiltInWardrobes, Ensuite, G...",ApartmentUnitFlat,3.0,3.0,2.0,,19,Shepherd Street,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"19 Shepherd Street, Maroubra",-33.937782,151.237930,Spacious Open Plan Three Bedroom Apartment,Spanning over two levels and with lift access ...,False,False
2,825,"[BuiltInWardrobes, Floorboards, SecureParking,...",House,1.0,3.0,2.0,,82,Alma Road,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"82 Alma Road, Maroubra",-33.940020,151.245911,Charming freestanding house close to beach and...,Well located between the shops and beach is th...,False,False
3,695,"[BuiltInWardrobes, Ensuite, InternalLaundry, S...",ApartmentUnitFlat,2.0,2.0,1.0,701,128,Banks Avenue,Eastern Suburbs,Sydney Region,PAGEWOOD,2035,"701/128 Banks Avenue, Pagewood",-33.943096,151.223724,Enviable new north facing apartment on 7th fl...,"Meticulously finished, this north facing apart...",False,False
4,700,"[InternalLaundry, SecureParking, Dishwasher]",ApartmentUnitFlat,1.0,2.0,1.0,5,363,Malabar Rd,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"5/363 Malabar Rd, Maroubra",-33.940178,151.256300,Please Register For Inspection,Ideally located just minutes from Maroubra Bea...,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,525,"[BuiltInWardrobes, Gas, SwimmingPool, Bath]",ApartmentUnitFlat,2.0,2.0,1.0,32,108,Boyce Road,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"32/108 Boyce Road, Maroubra",-33.939503,151.237762,TWO BEDROOM APARTMENT WITH PARKING IN THE HEAR...,"Situated in the heart of Maroubra, this spacio...",False,False
196,520,"[BuiltInWardrobes, InternalLaundry, SecurePark...",ApartmentUnitFlat,2.0,2.0,1.0,12,112,Boyce Road,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"12/112 Boyce Road, Maroubra",-33.939712,151.238100,IDEALLY LOCATED,Two Bedroom Apartment ideally located in the h...,False,False
197,,"[AirConditioning, BuiltInWardrobes, Ensuite, F...",ApartmentUnitFlat,1.0,1.0,1.0,A208,42,Page Street,Eastern Suburbs,Sydney Region,PAGEWOOD,2035,"A208/42 Page Street, Pagewood",-33.942780,151.212311,"1bedroom, 1bathroom, 1carspaces within the new...","208/42 Page Street, Pagewood\r\n\r\nExplore th...",True,False
198,520,"[Gas, Intercom, Heating, Dishwasher]",ApartmentUnitFlat,1.0,1.0,1.0,8,59-65,Chester Avenue,Eastern Suburbs,Sydney Region,MAROUBRA,2035,"8/59-65 Chester Avenue, Maroubra",-33.951324,151.243256,Serene Garden Apartment,Set in tranquil tropical gardens with an idyll...,True,False


In [None]:
# Expand the list-valued features column into one indicator column per feature.
new_df = domain.multi_labelbinarizier(df=df)


In [None]:
# Single-valued categorical columns to turn into indicator columns.
encode_col = [
    'listing.propertyDetails.propertyType',
    'listing.propertyDetails.area',
    'listing.propertyDetails.region',
    'listing.propertyDetails.suburb',
    'listing.propertyDetails.postcode',
]

# Build every indicator frame first (dropping each column's first level),
# then append them all to the right of new_df in one concat.
_dummy_frames = [pd.get_dummies(new_df[name], drop_first=True) for name in encode_col]
new_df = pd.concat([new_df] + _dummy_frames, axis=1, sort=False)

In [None]:
# Address, free-text and already-encoded columns that are not model inputs.
drop_cols = [
    'listing.propertyDetails.unitNumber',
    'listing.propertyDetails.streetNumber',
    'listing.propertyDetails.street',
    'listing.propertyDetails.displayableAddress',
    'listing.propertyDetails.latitude',
    'listing.propertyDetails.longitude',
    'listing.propertyDetails.propertyType',
    'listing.propertyDetails.area',
    'listing.propertyDetails.region',
    'listing.propertyDetails.suburb',
    'listing.propertyDetails.postcode',
    'listing.headline',
    'listing.summaryDescription',
    'listing.hasFloorplan',
    'listing.hasVideo',
]

# Removed in place, so new_df stays the same object.
new_df.drop(columns=drop_cols, inplace=True)

In [None]:
# Convert the cleaned price strings to numbers. Anything that is not a valid
# number - empty strings, a lone ".", or residue such as "520.." left over from
# text like "$520 p.w." - becomes NaN and is then replaced by the 99999
# sentinel, instead of crashing the later float conversion.
_price_col = 'listing.priceDetails.displayPrice'
new_df[_price_col] = pd.to_numeric(new_df[_price_col], errors='coerce').fillna(99999)

In [None]:
# A price that is exactly "." carries no number: mark it with the 99999 sentinel.
_price_col = 'listing.priceDetails.displayPrice'
_prices = new_df[_price_col]
new_df[_price_col] = _prices.replace(".", np.nan).fillna(99999)

In [None]:
# Cast the price column to float so it can be compared and used as the target.
_price_col = 'listing.priceDetails.displayPrice'
new_df[_price_col] = new_df[_price_col].astype(float)

In [None]:
# Prices above this threshold are treated as invalid and removed. That covers
# the 99999 sentinel and merged price ranges such as 11001250.
MAX_VALID_PRICE = 1150
_price_col = 'listing.priceDetails.displayPrice'

# Select rows with a boolean mask instead of looping over range(len(...)) with
# .at[]: the loop assumed index labels 0..n-1 and raised KeyError as soon as
# the index had gaps, e.g. when this cell was run a second time.
new_df.loc[new_df[_price_col] > MAX_VALID_PRICE, _price_col] = np.nan

new_df.dropna(subset=[_price_col], axis=0, inplace=True)

In [None]:
# Display the encoded frame after the price filtering above.
new_df

Unnamed: 0,listing.priceDetails.displayPrice,listing.propertyDetails.bathrooms,listing.propertyDetails.bedrooms,listing.propertyDetails.carspaces,AirConditioning,AlarmSystem,BalconyDeck,Bath,BroadbandInternetAccess,BuiltInWardrobes,CableOrSatellite,CityViews,Dishwasher,DoubleGlazedWindows,EnergyEfficientAppliances,Ensuite,Floorboards,FullyFenced,Furnished,GardenCourtyard,Gas,GroundFloor,Gym,Heating,IndoorSpa,Intercom,InternalLaundry,NorthFacing,OutdoorSpa,PetsAllowed,SecureParking,SeparateDiningRoom,Shed,Study,SwimmingPool,TennisCourt,WallCeilingInsulation,WaterEfficientAppliances,WaterEfficientFixtures,WaterViews,Duplex,House,Studio,Terrace,Townhouse,MAROUBRA,PAGEWOOD
1,980.0,3.0,3.0,2.0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,825.0,1.0,3.0,2.0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,695.0,2.0,2.0,1.0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,700.0,1.0,2.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
5,650.0,2.0,2.0,1.0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,525.0,1.0,1.0,1.0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
195,525.0,2.0,2.0,1.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
196,520.0,2.0,2.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
198,520.0,1.0,1.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection

In [None]:
# Target is the numeric price; every remaining column is a feature.
_target = 'listing.priceDetails.displayPrice'
y = new_df[_target]
X = new_df.drop(columns=[_target])

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
_split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = _split

In [None]:
from sklearn.linear_model import BayesianRidge, LinearRegression

In [None]:
# Fit both regressors on the same training split (fit returns the estimator itself).
clf = BayesianRidge(compute_score=True).fit(X_train, y_train)

ols = LinearRegression()
# Kept as the last expression so the fitted estimator is echoed as the cell output.
ols.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# 10-fold cross-validation of a plain linear regression on the training split.
# NOTE(review): `seed` is not passed to KFold, so the folds are the default
# contiguous, unshuffled ones - confirm whether shuffling was intended.
seed = 7
kfold = model_selection.KFold(n_splits=10,)
model = LinearRegression()
scoring = 'r2'
results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
# Label the numbers with the metric actually computed; this line used to say
# "MSE" although the scores are R^2 values.
print("{}: {} ({})".format(scoring, results.mean(), results.std()))

MSE: -1.0396379351811442 (1.8748038245076986)


In [None]:
# Show the first remaining row (selected by position, not by label).
new_df.iloc[:1]

Unnamed: 0,listing.priceDetails.displayPrice,listing.propertyDetails.bathrooms,listing.propertyDetails.bedrooms,listing.propertyDetails.carspaces,AirConditioning,AlarmSystem,BalconyDeck,Bath,BroadbandInternetAccess,BuiltInWardrobes,CableOrSatellite,CityViews,Dishwasher,DoubleGlazedWindows,EnergyEfficientAppliances,Ensuite,Floorboards,FullyFenced,Furnished,GardenCourtyard,Gas,GroundFloor,Gym,Heating,IndoorSpa,Intercom,InternalLaundry,NorthFacing,OutdoorSpa,PetsAllowed,SecureParking,SeparateDiningRoom,Shed,Study,SwimmingPool,TennisCourt,WallCeilingInsulation,WaterEfficientAppliances,WaterEfficientFixtures,WaterViews,Duplex,House,Studio,Terrace,Townhouse,MAROUBRA,PAGEWOOD
1,980.0,3.0,3.0,2.0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Predict prices for the first five held-out rows with the fitted linear model.
_sample = X_test.iloc[:5]
result = ols.predict(_sample)

In [None]:
# Predicted prices for the five sampled test rows.
result

array([ 579.916957  ,  663.76172034, 1106.69140296,  580.59828258,
        609.17290692])

In [None]:
# Actual prices of the same five test rows, for comparison with the predictions.
y_test.iloc[:5]

180    560.0
169    580.0
118    700.0
150    600.0
164    600.0
Name: listing.priceDetails.displayPrice, dtype: float64