In [5]:
import re
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from __future__ import print_function
import numpy as np
import pandas as pd
import pickle
import sklearn
import sys
from geopy.distance import great_circle

In [28]:
class PricePredictorModel:
    
    def __init__(self, model_location):
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)
    
    def predict_price(self, X_new, clean=True, augment=True):
        if clean:
            X_new = self.clean_data(X_new)
        
        if augment:
            X_new = self.engineer_features(X_new)
        
        return X_new, self.model.predict(X_new)
    
    # Functions
    
    def clean_data(self, df):
        # Drop duplicates
        df = df.drop_duplicates(subset='listing_id')

        # Drop observations we are not interested in
        df = df[df.property_type != 'Land']
        df = df[df.property_type != 'Parking/garage']
        df = df[df.property_type != 'Barn conversion']
        df = df[df.property_type != 'Office']
        df = df[df.property_type != 'Farm']
        df = df[df.property_type != 'Equestrian property']
        df = df[df.property_type != 'Mobile/park home']
        df = df[df.property_type != 'Retail premises']
        df = df[df.property_type != 'Block of flats']
        df = df[df.property_type != 'Hotel/guest house']

        # Remove properties with more than 6 bedrooms
        df = df[df.num_bedrooms <= 6]
        
        # Remove more than 5 bathrooms
        df = df[df.num_bathrooms <= 5]
        
        # Remove price outliers
        df=df[df.price <=1500000]
        
        # Join bungalows as they are sparse classes into 1
        df.property_type.replace(['Terraced bungalow','Detached bungalow',
                                  'Semi-detached bungalow'], 'Bungalow', inplace=True)
        
        # Combine other sparse classes into Other
        df.property_type.replace(['Cottage','Villa', 'Link-detached house', 
                                  'Lodge', 'Town house', 'Chalet',
                                  'Mews house','Farmhouse', 'Country house','Studio'], 'Other', inplace=True)

# Drop not needed
        df = df.drop(['thumbnail_url', 'street_name', 'status', 
                 'letting_fees', 'last_published_date',
                 'image_caption', 'image_80_60_url', 'image_645_430_url', 'image_50_38_url',
                 'image_354_255_url', 'image_150_113_url', 'floor_plan', 'displayable_address', 
                 'details_url', 'country_code', 'country', 'category', 'agent_phone', 'agent_phone',
                 'agent_logo', 'agent_address', 'Unnamed: 0', 'price_change', 
                 'price_change_summary.last_updated_date', 'listing_status', 'agent_address',
                 'agent_logo'], axis=1)
        
        # Label missing categorical data
        for column in df.select_dtypes(include=['object']):
            df[column] = df[column].fillna('Missing')

        # Return cleaned dataframe
        return df
    
    def engineer_features(self, df):
        
        # Create indicator variable for properties with 2 beds and 2 baths
        df['two_and_two'] = ((df.num_bedrooms == 2) & (df.num_bathrooms == 2)).astype(int)
        
        # Create heating type feature
        heating_type = []
        gas = ['gas central heating', 'GCH', 'Gas heating', 'gas heating', 'Gas central heating', 
               'Gas Central Heating', 'gas combi boiler']
        oil = ['oil central heating', 'Oil heating', 'oil heating', 'Oil central heating']
        electric = ['electric central heating', 'electric wet system central heating', 
                    'electric storage heating', 'Electric heating', 'ECH', 'Electric central heating', 
                    'electric heating', 'electric night storage', 'Electric Heating', 'Economy 7',
                    'economy 7']
        for row in df['description']:
            if any( word in row for word in gas):
                heating_type.append('gas')
            elif any( word in row for word in oil):
                heating_type.append('oil')
            elif any( word in row for word in electric):
                heating_type.append('electric')
            else:
                heating_type.append('Missing')
        df['heating_type'] = heating_type
        
        # Double glazing feature
        double_glazing = []
        keyword = ['double glazing', 'Double Glazing', 'Double glazing', 'D\G', 'D/G', 'DG']
        for row in df['description']:
            if any( word in row for word in keyword):
                double_glazing.append('1')
            else:
                double_glazing.append('0')
        df['double_glazing'] = double_glazing
        
        # Renovation feature        
        renovated = []
        keyword = ['Refurbished', 'refurbished', 'renovated', 'Renovated', 'Redecorated', 'redecorated']
        for row in df['description']:
            if any( word in row for word in keyword):
                renovated.append('1')
            else:
                renovated.append('0')
        df['renovated'] = renovated

        # Age of the ad
        df['today'] = pd.to_datetime(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        df['date_published'] = pd.to_datetime(df['first_published_date'])
        df['ad_age_days'] = (df['today'] - df['date_published']).dt.days
        
        # Remove % sign from price change
        df['price_change_summary.percent'] = df['price_change_summary.percent'].str.replace('%', '')
        # Remove redundant features
        df = df.drop(['first_published_date', 'price_change_summary.direction', 'today','date_published'], axis=1)
        
        # Rename price change percent column so python doesn't complain about dot in the name
        df = df.rename(columns={'price_change_summary.percent': 'price_change_percent'})

        # Replace missing percent data with 0 as missing data in this column = no change in price
        df.price_change_percent.replace('Missing', 0, inplace=True)
        
        # New home feature cleanup
        df.new_home.replace('Missing', 0, inplace=True)
        df.new_home.replace('True', 1, inplace=True)
        
        # Distance to Edinburgh instead of county and town
        edinburgh_coord = (55.957070, -3.236022)
        distance = []
        for lat, lon in zip(df['latitude'], df['longitude']):
            loc_coord = (lat, lon)
            distance.append(great_circle(edinburgh_coord, loc_coord).miles)
        df['distance_edi'] = distance
        
        # Remove images and descriptions
        df = df.drop(['agent_name', 'description', 'outcode','short_description', 'latitude',
                      'longitude', 'post_town', 'county', 'image_url'], axis=1)
        
        # Create new dataframe with dummy features
        df = pd.get_dummies(df, columns=['price_modifier', 'property_type','heating_type'])
        # Return augmented DataFrame
        return df

In [29]:
price_model = PricePredictorModel('data/final_model_with_features.pkl')

In [30]:
with open('data/final_model_with_features.pkl', 'rb') as f:
    model = pickle.load(f)
model

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gradientboostingregressor', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, mi...        presort='auto', random_state=123, subsample=1.0, verbose=0,
             warm_start=False))])

In [31]:
raw_data = pd.read_csv('data/unseen_raw_data.csv', low_memory=False)

In [34]:
images = raw_data['image_url']
cleaned_data = price_model.clean_data(raw_data)
feature = price_model.engineer_features(cleaned_data)
pd.set_option('display.max_columns', 100)
feature.head()



Unnamed: 0,listing_id,new_home,num_bathrooms,num_bedrooms,num_floors,num_recepts,price,price_change_percent,two_and_two,double_glazing,renovated,ad_age_days,distance_edi,price_modifier_Missing,price_modifier_equity_loan,price_modifier_fixed_price,price_modifier_from,price_modifier_guide_price,price_modifier_offers_in_region_of,price_modifier_offers_over,price_modifier_price_on_request,property_type_Bungalow,property_type_Detached house,property_type_End terrace house,property_type_Flat,property_type_Maisonette,property_type_Missing,property_type_Other,property_type_Semi-detached house,property_type_Terraced house,heating_type_Missing,heating_type_electric,heating_type_gas,heating_type_oil
0,44232074,0,0,3,0,0,225000,-4.2,0,1,0,75,66.947867,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,43711120,0,1,3,0,2,146000,0.0,0,0,0,125,66.767178,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,44647755,0,2,2,0,1,499500,0.0,1,0,0,32,1.867951,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0
3,41841980,True,0,4,0,0,575000,0.0,0,0,0,334,35.052825,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,20416803,0,2,4,2,3,185000,-35.0,0,1,0,2184,66.464615,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [35]:
y = feature['price']
X = feature.drop('price', axis=1)
X.head()
df, pred = price_model.predict_price(X, clean=False, augment=False)


In [36]:
df['predicted'] = pred
df['actual'] = y
df['image_url'] = images

In [37]:
df.head()

Unnamed: 0,listing_id,new_home,num_bathrooms,num_bedrooms,num_floors,num_recepts,price_change_percent,two_and_two,double_glazing,renovated,ad_age_days,distance_edi,price_modifier_Missing,price_modifier_equity_loan,price_modifier_fixed_price,price_modifier_from,price_modifier_guide_price,price_modifier_offers_in_region_of,price_modifier_offers_over,price_modifier_price_on_request,property_type_Bungalow,property_type_Detached house,property_type_End terrace house,property_type_Flat,property_type_Maisonette,property_type_Missing,property_type_Other,property_type_Semi-detached house,property_type_Terraced house,heating_type_Missing,heating_type_electric,heating_type_gas,heating_type_oil,predicted,actual,image_url
0,44232074,0,0,3,0,0,-4.2,0,1,0,75,66.947867,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,154669.982448,225000,https://li.zoocdn.com/55ecae8cfc3330cb0e069566...
1,43711120,0,1,3,0,2,0.0,0,0,0,125,66.767178,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,164484.486845,146000,https://li.zoocdn.com/7443de02fb197a5775d433e4...
2,44647755,0,2,2,0,1,0.0,1,0,0,32,1.867951,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,282803.566327,499500,https://lid.zoocdn.com/354/255/7f3fb9a7115689d...
3,41841980,True,0,4,0,0,0.0,0,0,0,334,35.052825,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,331839.958578,575000,https://lid.zoocdn.com/354/255/db9055219ca8078...
4,20416803,0,2,4,2,3,-35.0,0,1,0,2184,66.464615,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,460676.566887,185000,https://li.zoocdn.com/820669315b936f6f28ef3dec...


In [39]:
df.head(100).to_csv('data/properties.csv', index=None)