In [30]:
import json
from pymongo import MongoClient
from pprint import pprint
from PIL import Image
import requests
import random
import time, os
from tqdm import tqdm
import io
import multiprocessing
from joblib import Parallel, delayed
from pyzipcode import ZipCodeDatabase
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import pickle
import numpy
import shutil


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances


from keras import callbacks
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from keras.layers import Input, Flatten, Dense, GlobalAveragePooling2D
from keras.models import Model
from keras.applications.xception import Xception, preprocess_input, decode_predictions
from keras.models import load_model
# Load the saved Keras model stored at 'room_classifier'; it is later handed to
# predict() as the model that assigns a room class to each image.
m = load_model('room_classifier')

In [74]:
#need to repickle when classing is done
def _load_pickle(path, **load_kwargs):
    """Unpickle the file at ``path`` and close the file handle afterwards.

    Args:
        path: location of the pickle file.
        **load_kwargs: forwarded to ``pickle.load`` (e.g. ``encoding='latin1'``).

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code. Only load files that
    # were produced by this project.
    with open(path, "rb") as pickle_in:
        return pickle.load(pickle_in, **load_kwargs)


# For every room type two objects are loaded: the feature array (unpickled with
# encoding='latin1') and the DataFrame that is merged with it by row index later.
arry_features_exterior = _load_pickle("images_classed_arry/arry_features_exterior.pickle", encoding='latin1')
df_features_exterior = _load_pickle("images_classed_df/df_features_exterior.pickle")

arry_features_bedroom = _load_pickle("images_classed_arry/arry_features_bedroom.pickle", encoding='latin1')
df_features_bedroom = _load_pickle("images_classed_df/df_features_bedroom.pickle")

arry_features_bathroom = _load_pickle("images_classed_arry/arry_features_bathroom.pickle", encoding='latin1')
df_features_bathroom = _load_pickle("images_classed_df/df_features_bathroom.pickle")

arry_features_kitchen = _load_pickle("images_classed_arry/arry_features_kitchen.pickle", encoding='latin1')
df_features_kitchen = _load_pickle("images_classed_df/df_features_kitchen.pickle")

arry_features_living = _load_pickle("images_classed_arry/arry_features_living.pickle", encoding='latin1')
df_features_living = _load_pickle("images_classed_df/df_features_living.pickle")

In [95]:
# Connect with MongoClient's default arguments and select the trulia_db database.
client = MongoClient()
db = client['trulia_db']
# Fetch a single listing document, leaving out the _id and image_urls fields.
listings = db['listings'].find_one({}, {'_id': 0, 'image_urls': 0})

In [10]:
# Root folder scanned by flow_from_directory inside extract_features.
base_dir = 'input'

# Maps the classifier's arg-max output index to its room label (see predict()).
to_class = dict(enumerate(['bathroom', 'bedroom', 'exterior', 'kitchen', 'living', 'plan']))

# Image size used both for loading images and for the base model's input shape.
IMG_WIDTH = IMG_HEIGHT = 299

# Generator that rescales pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1. / 255)

# Number of images per batch requested from the generator.
batch_size = 200

# Xception with ImageNet weights and include_top=False, used as feature extractor.
base_model = Xception(weights='imagenet',
                      include_top=False,
                      input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))


def predict(file, model, to_class):
    """Return the class label that ``model`` predicts for the image at ``file``.

    Args:
        file: path of the image to classify.
        model: object whose ``predict`` method is called on a one-image batch.
        to_class: mapping from the arg-max index of the prediction to a label.

    Returns:
        The entry of ``to_class`` for the highest-scoring index.
    """
    image = load_img(file, target_size=(IMG_WIDTH, IMG_HEIGHT))
    # Add a leading batch axis, then apply the Xception preprocessing.
    batch = preprocess_input(np.expand_dims(img_to_array(image), axis=0))
    scores = model.predict(batch)
    return to_class[scores.argmax()]

In [11]:
def class_input_df(input_fld):
    """Classify every file in ``input_fld`` and tabulate the results.

    Each entry returned by ``os.listdir(input_fld)`` is passed to ``predict``
    together with the module-level model ``m`` and label map ``to_class``.

    Args:
        input_fld: folder containing the images to classify.

    Returns:
        DataFrame with the columns ``img_name`` (file name) and ``img_class``
        (predicted label), one row per file.
    """
    records = [
        (file_name, predict(input_fld + '/' + file_name, m, to_class))
        for file_name in os.listdir(input_fld)
    ]
    return pd.DataFrame(records, columns=['img_name', 'img_class'])


In [44]:
# apply this to all extract scripts
def extract_features(classes, sample_count):
    """Run the images of one class folder through ``base_model`` and flatten the result.

    Reads the module-level ``datagen``, ``base_dir``, ``batch_size``,
    ``base_model``, ``IMG_WIDTH`` and ``IMG_HEIGHT`` at call time, so
    ``base_dir`` must point at the folder to scan before this is called.

    Args:
        classes: name of a single sub-folder of ``base_dir``; it is wrapped in a
            list and passed as ``classes`` to ``flow_from_directory``.
        sample_count: number of images the caller expects in that folder.

    Returns:
        A 2-D float array with one row per image and one column per element of
        the base model's output for a single image. The row count is
        ``min(sample_count, images found by the generator)``: callers derive
        ``sample_count`` from ``os.listdir``, which also counts files the
        generator does not pick up, and filling rows for those would duplicate
        real features.
    """
    generator = datagen.flow_from_directory(
        base_dir,
        target_size=(IMG_WIDTH, IMG_HEIGHT),  # same size predict() and base_model use
        shuffle=False,
        batch_size=batch_size,
        class_mode=None,
        classes=[classes])

    # Per-image output shape of the base model (batch axis stripped).
    feature_shape = tuple(base_model.output_shape[1:])
    flat_dim = int(np.prod(feature_shape))

    n_rows = min(sample_count, generator.samples)
    features = np.zeros(shape=(n_rows,) + feature_shape)
    if n_rows == 0:
        # Nothing to extract; do not start iterating the generator.
        return features.reshape(0, flat_dim)

    filled = 0
    for inputs_batch in generator:
        features_batch = base_model.predict(inputs_batch)
        # Never write past the allocated rows, even if a batch is larger
        # than what is still missing.
        take = min(len(features_batch), n_rows - filled)
        features[filled:filled + take] = features_batch[:take]
        filled += take
        if filled >= n_rows:
            # Note that since generators yield data indefinitely in a loop,
            # we must `break` after every image has been seen once.
            break

    return features.reshape(n_rows, flat_dim)


In [37]:
#moves inputs to right classes
def process_input_images(input_img_fld, output_img_fld):
    """Copy each image of ``input_img_fld`` into ``output_img_fld/<predicted class>``.

    Every entry of ``os.listdir(input_img_fld)`` is classified with ``predict``
    (using the module-level model ``m`` and label map ``to_class``) and copied
    into the sub-folder named after the predicted class. Files that cannot be
    classified or copied are reported on stdout and skipped.

    Args:
        input_img_fld: folder containing the uploaded images.
        output_img_fld: folder that receives one sub-folder per class.

    Returns:
        None.
    """
    # Make sure every class folder exists before copying: shutil.copy treats a
    # non-existent destination as a file name, so without the folder the image
    # would be written to a file called e.g. "kitchen" instead of into a folder.
    for room_class in to_class.values():
        os.makedirs(output_img_fld + "/" + room_class, exist_ok=True)

    for img in os.listdir(input_img_fld):
        source_dir = input_img_fld + "/" + str(img)
        try:
            room_class = predict(source_dir, m, to_class)
            dest_source = output_img_fld + "/" + room_class
            shutil.copy(src=source_dir, dst=dest_source)
        except Exception as exc:
            # Best effort: keep going with the remaining files, but say why this
            # one failed. KeyboardInterrupt/SystemExit are no longer swallowed.
            print('Failed: ' + source_dir + ' (' + repr(exc) + ')')
In [128]:
#Text query
def get_close_zipcodes(zip_code,distance):
    """Return the zip codes located within ``distance`` of ``zip_code``.

    Args:
        zip_code: the zip code at the centre of the search.
        distance: search radius in miles.

    Returns:
        List with the ``zip`` attribute of every entry that
        ``ZipCodeDatabase.get_zipcodes_around_radius`` returns.
    """
    nearby = ZipCodeDatabase().get_zipcodes_around_radius(zip_code, distance)
    return [entry.zip for entry in nearby]

def get_listings(zip_code, radius, min_price=0, max_price=999999999,property_type_in=None):
    """Query ``db.listings`` for listings near a zip code within a price range.

    Args:
        zip_code: zip code at the centre of the search.
        radius: search radius passed to ``get_close_zipcodes``.
        min_price: lowest accepted price (inclusive).
        max_price: highest accepted price (inclusive).
        property_type_in: ``None`` to accept every known property type, a single
            property type string, or an iterable of property type strings.

    Returns:
        The cursor returned by ``db.listings.find``; the ``_id`` and
        ``image_urls`` fields are excluded from the documents.
    """
    if property_type_in is None:
        property_type = ['apartment','condo','coop','lot/land','mobile/manufactured','multi-family','single-family home','townhouse','unknown']
    elif isinstance(property_type_in, str):
        property_type = [property_type_in]
    else:
        # A list/tuple/set of types must not be wrapped again, otherwise "$in"
        # would compare against a nested list and match nothing.
        property_type = list(property_type_in)

    list_zip_codes = get_close_zipcodes(zip_code, radius)

    # The stored price is converted with $toDouble before each comparison
    # (presumably because it is stored as a string -- TODO confirm).
    query = {"$and": [
        {"zip_code": {"$in": list_zip_codes}},
        {"$expr": {"$gte": [{"$toDouble": "$price"}, min_price]}},
        {"$expr": {"$lte": [{"$toDouble": "$price"}, max_price]}},
        {"propertytype": {"$in": property_type}},
    ]}
    projection = {'_id': 0, 'image_urls': 0}
    return db.listings.find(query, projection)


In [205]:
# INSERT TEXT QUERY: get_listings(zip_code, radius, min_price, max_price, property_type_in)
#   radius is handed to get_close_zipcodes (radius in miles).
#   property_type_in is optional; known values are 'apartment', 'condo', 'coop',
#   'lot/land', 'mobile/manufactured', 'multi-family', 'single-family home',
#   'townhouse', 'unknown'.
results_json = get_listings(10314,200, 1.0,500000.0)
# Turn the returned cursor into a DataFrame (presumably one row per listing document).
query_results = pd.DataFrame(results_json)
# Listing ids matched by the text query; used later to restrict the image-similarity results.
query_result_listingid = query_results['listingid']

In [210]:
#step 1, upload pics and break out into groups
process_input_images('input/input/upload','input/input/classed_upload')
input_df = class_input_df('input/input/upload')
# extract_features reads the module-level base_dir, so point it at the classed uploads.
base_dir = 'input/input/classed_upload'
df_main = pd.DataFrame(data=query_result_listingid)


def _room_similarity_scores(room, stored_features, stored_index, listing_ids, input_cnt):
    """Score the queried listings against the uploaded images of one room type.

    Args:
        room: room type; names the sub-folder of ``base_dir`` and the result column.
        stored_features: feature array of the stored listing images for this room.
        stored_index: DataFrame aligned by row index with ``stored_features``;
            must contain a ``listingid`` column.
        listing_ids: listing ids returned by the text query.
        input_cnt: number of uploaded images found for this room type.

    Returns:
        DataFrame with the columns ``listingid`` and ``<room>_avg``.
    """
    input_features = extract_features(room, input_cnt)

    # One row per stored image, one column per uploaded image.
    dist = pd.DataFrame(data=pairwise_distances(stored_features, input_features, metric='cosine'))
    dist['avg'] = dist.mean(axis=1)

    with_ids = dist.merge(stored_index, how='inner', left_index=True, right_index=True)
    in_query = with_ids['listingid'].isin(listing_ids)

    # NOTE(review): sorting descending keeps the two LARGEST average distances
    # per listing (its two least similar images), while the final ranking sorts
    # ascending. Behaviour kept as in the original -- confirm this is intended.
    ranked = with_ids[in_query].sort_values('avg', ascending=False)
    top_two = ranked.groupby('listingid').head(2)

    scores = top_two.groupby(['listingid'])['avg'].mean().reset_index()
    scores.columns = ['listingid', room + '_avg']
    return scores


# Stored features per room type, in the order the rooms are processed.
stored_by_room = {
    'living': (arry_features_living, df_features_living),
    'kitchen': (arry_features_kitchen, df_features_kitchen),
    'exterior': (arry_features_exterior, df_features_exterior),
    'bedroom': (arry_features_bedroom, df_features_bedroom),
    'bathroom': (arry_features_bathroom, df_features_bathroom),
    # No plan features are loaded by the pickle-loading cell; use them only if
    # some other cell has defined them.
    'plan': (globals().get('arry_features_plan'), globals().get('df_features_plan')),
}

# Per-room score frames, kept for inspection.
room_scores = {}
for room, (stored_features, stored_index) in stored_by_room.items():
    room_dir = base_dir + "/" + room
    if not os.path.isdir(room_dir):
        # No uploaded image was classified as this room type.
        continue
    input_cnt = len(os.listdir(room_dir))
    if input_cnt == 0:
        continue
    if stored_features is None or stored_index is None:
        print('Skipping ' + room + ': no stored features loaded for this room type')
        continue

    room_scores[room] = _room_similarity_scores(
        room, stored_features, stored_index, query_results["listingid"], input_cnt)
    df_main = df_main.merge(room_scores[room], how='outer', left_on='listingid', right_on='listingid')




Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.
Found 1 images belonging to 1 classes.


In [216]:
# Average only the per-room score columns. A plain df_main.mean(axis=1) would
# also try to average listingid and, when this cell is re-run, the previous
# 'avg' column itself.
score_columns = [col for col in df_main.columns if str(col).endswith('_avg')]
df_main['avg'] = df_main[score_columns].mean(axis=1)
# Smallest average distance first; rooms without a score (NaN) are skipped by mean().
df_main.sort_values('avg').head(50)

Unnamed: 0,listingid,kitchen_avg,exterior_avg,bedroom_avg,bathroom_avg,avg
3344,1032399192,0.506129,,,,0.506129
4523,1041886435,0.516442,,,,0.516442
531,1040039687,0.546675,,,,0.546675
2546,1029809449,0.548864,,,,0.548864
3458,5068785559,,,0.564916,,0.564916
4628,1070352788,,,0.625496,,0.625496
2359,1085364671,,,0.632388,,0.632388
6690,5069067839,,,,0.638692,0.638692
2777,5068930206,0.648014,,,,0.648014
3747,1084201444,,,0.65342,,0.65342
