In [2]:
import json
from pymongo import MongoClient
from pprint import pprint
from PIL import Image
import requests
import random
import time, os
from tqdm import tqdm
import io
import multiprocessing
from joblib import Parallel, delayed
from pyzipcode import ZipCodeDatabase
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances


from keras import callbacks
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from keras.layers import Input, Flatten, Dense, GlobalAveragePooling2D
from keras.models import Model
from keras.applications.xception import Xception, preprocess_input, decode_predictions
from keras.models import load_model
m = load_model('room_classifier')

Using TensorFlow backend.


In [3]:
#!pip install pyzipcode3 

In [4]:
client = MongoClient()
db = client.trulia_db
listings = db.listings.find_one({},{'_id':0, 'image_urls':0})
listings

{'address': '564 Golfview Ct, Staten Island, NY 10314',
 'bath': '2ba',
 'bed': '2bd',
 'city': 'Staten Island',
 'description': ['joann dellarocca the listing agent is available to show this property at your convenience my cell is 718-614-2676 a must see! well maintained 2 bedroom duplex townhouse, walking distance to mall, x buses, excellent school district. enter into a large living room/dining room, eat in kitchen, bath, laundry room, 2nd level maste bedroom with sliders to terrace with park views, 2nd bedroom, full bath, attic with storage. great location. enjoy your summers at the pool and tennis courts.'],
 'floorspace': '1,600 sqft',
 'listingid': '5068903154',
 'locationlat': '40.5807',
 'price': '399000',
 'propertytype': 'townhouse',
 'state': 'NY',
 'url': 'https://www.trulia.com/p/ny/staten-island/564-golfview-ct-staten-island-ny-10314--2008862537',
 'zip_code': '10314'}

In [5]:
to_class = { 0: 'bathroom',
             1: 'bedroom',
             2: 'exterior',
             3: 'kitchen',
             4: 'living',
             5: 'plan'}

In [6]:
def get_close_zipcodes(zip_code,distance = 2):
    zcdb = ZipCodeDatabase()
    in_radius = [z.zip for z in zcdb.get_zipcodes_around_radius(zip_code, distance)] # ('ZIP', radius in miles)
    return in_radius

In [7]:
def get_listings(zip_code, min_price=0, max_price=999999999,property_type_in=None):
    if property_type_in is None:
        property_type = ['apartment','condo','coop','lot/land','mobile/manufactured','multi-family','single-family home','townhouse','unknown']
    else:
        property_type = [property_type_in]
    list_zip_codes = get_close_zipcodes(zip_code)
    return db.listings.find({"$and":[
                                 {"zip_code" : {"$in" : list_zip_codes}},
                                 {"$expr": {"$gte": [ { "$toDouble": "$price" }, min_price ]}},
                                 {"$expr": {"$lte": [ { "$toDouble": "$price" }, max_price ]}},
                                 {"propertytype": {"$in" : property_type}}
                                ]}
                                 , {'_id':0,'image_urls':0})


In [8]:
results_json = get_listings(10314,400000.0,500000.0, 'townhouse')
data = pd.DataFrame(results_json)
data

Unnamed: 0,address,bath,bed,city,description,floorspace,listingid,locationlat,price,propertytype,state,url,zip_code
0,"53 Shirra Ave, Staten Island, NY 10314",2ba,3bd,Staten Island,"[20135H-Look no further! Move-in-Ready, End Un...","1,123 sqft",5068834143,40.591385,499000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/53-s...,10314
1,"7 Stratford Ct, Staten Island, NY 10314",3ba,3bd,Staten Island,[WELCOME TO 7 STRATFORD COURT! This home has a...,"1,986 sqft",5066932021,40.588326,459000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/7-st...,10314
2,"20 Peggy Ln, Staten Island, NY 10306",2ba,2bd,Staten Island,[CALL LISTING AGENT MARIYA 917-592-3790 .Gorge...,"1,480 sqft",3125415303,40.572086,439000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/20-p...,10306
3,"181 Lamped Loop #A, Staten Island, NY 10314",2ba,3bd,Staten Island,[Beautiful 3 Bedroom Condo . Updated baths & K...,"1,318 sqft",5063367859,40.584667,424900,townhouse,NY,https://www.trulia.com/p/ny/staten-island/181-...,10314
4,"65 Watchogue Rd, Staten Island, NY 10314",2ba,3bd,Staten Island,[Well maintained and flooded with natural ligh...,750 sqft,5068137289,40.613914,488000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/65-w...,10314
5,"33 Timothy Ct, Staten Island, NY 10314",2ba,3bd,Staten Island,[recently renovated single family attached. fe...,"1,920 sqft",3136502415,40.60615,479900,townhouse,NY,https://www.trulia.com/p/ny/staten-island/33-t...,10314
6,"38 Lewiston St, Staten Island, NY 10314",3ba,3bd,Staten Island,[&#x1f3e1; Location! Move right in to this bea...,"1,760 sqft",5057119693,40.587097,499000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/38-l...,10314
7,"607 Lincoln Ave, Staten Island, NY 10306",2ba,2bd,Staten Island,[custom work throughout. two new oak staircase...,"1,000 sqft",3084346133,40.571133,449000,townhouse,NY,https://www.trulia.com/p/ny/staten-island/607-...,10306
8,"96 Belmar Dr W, Staten Island, NY 10314",2ba,3bd,Staten Island,"[Spacious 2 level, 3 bedrooms, 2 bath home wit...","1,242 sqft",5068127460,40.579613,438888,townhouse,NY,https://www.trulia.com/p/ny/staten-island/96-b...,10314
9,"4004 Victory Blvd, Staten Island, NY 10314",5ba,5bd,Staten Island,[20114h-2 family attached townhouse. 6 over 3 ...,"2,050 sqft",5068465726,40.593323,479900,townhouse,NY,https://www.trulia.com/p/ny/staten-island/4004...,10314


In [9]:
IMG_WIDTH = 299
IMG_HEIGHT = 299
def vectorize_img(file, to_class, model = m):
    im = load_img(file, target_size=(IMG_WIDTH, IMG_HEIGHT))
    x = img_to_array(im)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    #x = x.flatten
    x = np.array(x) #reshape 2d
    return x

In [11]:
print(vectorize_img('input/bathroom/bath_0.jpg',to_class).shape)

(1, 299, 299, 3)


In [72]:
# img_list = [img for img in os.listdir('input')]
# img_list.remove('.ipynb_checkpoints')
# img_df = pd.DataFrame(img_list, columns = ['image_name'])
# img_df['image_vect'] = img_df['image_name'].apply(lambda x: vectorize_img('input/'+x,to_class))
        
#df.apply(lambda row: row.a + row.b, axis=1)
# input_image = predict('input/bath_0.jpg',to_class)
# d = pairwise_distances(img_df['image_vect'],input_image,metric='cosine')


In [10]:
#m.layers.pop()
m.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 149, 149, 32) 0           block1_conv1_bn[0][0]            
____________________________________________________________________________________________

In [None]:
# m.trainable = False
# set_trainable = False
# for layer in m.layers:
#     if layer.name == 'block14_sepconv2':
#         set_trainable = True
#     if set_trainable:
#         layer.trainable = True
#     else:
#         layer.trainable = False

In [15]:
classes = 'bathroom'
sample_count = 55
features = np.zeros(shape=(sample_count,10,10, 2048))
generator = datagen.flow_from_directory(
    base_dir,
    target_size=(299, 299),
    shuffle=False,
    batch_size=batch_size,
    class_mode=None,
    classes=[classes])
list_img = [img.split('/') for img in generator.filenames]
bathroom_img_df = pd.DataFrame(list_img, columns = ['class','img_name'])
bathroom_img_df['listingid'] = bathroom_img_df['img_name'].apply(lambda x: x.split('_')[0])
i = 0
for inputs_batch in generator:
    features_batch = base_model.predict(inputs_batch)
    #pdb.set_trace()
    features[i * batch_size : (i + 1) * batch_size] = features_batch
    i += 1
    if i * batch_size >= sample_count:
            # Note that since generators yield data indefinitely in a loop,
            # we must `break` after every image has been seen once.
            break

Found 55 images belonging to 1 classes.


In [16]:
bathroom_img_df

Unnamed: 0,class,img_name,listingid
0,bathroom,000_0.jpg,0
1,bathroom,111_1.jpg,111
2,bathroom,222_2.jpg,222
3,bathroom,5045779034_13.jpg,5045779034
4,bathroom,5045779034_14.jpg,5045779034
5,bathroom,5067941346_18.jpg,5067941346
6,bathroom,5067962609_22.jpg,5067962609
7,bathroom,5067970149_13.jpg,5067970149
8,bathroom,5068012887_2.jpg,5068012887
9,bathroom,5068018549_13.jpg,5068018549


In [26]:
list_img = [img.split('/') for img in generator.filenames]
img_df = pd.DataFrame(list_img, columns = ['class','img_name'])
img_df['listingid'] = img_df['img_name'].apply(lambda x: x.split('_')[0])
img_df

Unnamed: 0,class,img_name,listingid
0,bathroom,000_0.jpg,0
1,bathroom,111_1.jpg,111
2,bathroom,222_2.jpg,222


In [14]:
import pdb
base_dir = 'input'
#bathroom_dir = os.path.join(base_dir, 'bathroom')
datagen = ImageDataGenerator(rescale=1./255)
batch_size = 200

base_model = Xception(input_shape=(IMG_WIDTH, IMG_HEIGHT, 3), weights='imagenet', include_top=False)


def extract_features(classes, sample_count):
    features = np.zeros(shape=(sample_count,10,10, 2048))
    generator = datagen.flow_from_directory(
        base_dir,
        target_size=(299, 299),
        shuffle=False,
        batch_size=batch_size,
        class_mode=None,
        classes=[classes])
    i = 0
    #pdb.set_trace()
    for inputs_batch in generator:
        features_batch = base_model.predict(inputs_batch)
        #pdb.set_trace()
        features[i * batch_size : (i + 1) * batch_size] = features_batch
        i += 1
        if i * batch_size >= sample_count:
            # Note that since generators yield data indefinitely in a loop,
            # we must `break` after every image has been seen once.
            break
    return features

bathroom_features = extract_features('bathroom', 55)

# import pickle
# pickle_out = open("bathroom_features.pickle","wb")
# pickle.dump(bathroom_img_df, pickle_out)
# pickle_out.close()


#reshape into 2 
#generator.filenames
# with open("doc_topic_nmf.pkl", "rb") as f:
#     doc_topic_nmf = pickle.load(f)
# with open("nmf_model.pkl", "wb") as f:
#     pickle.dump(nmf_model, f)

Found 55 images belonging to 1 classes.


In [17]:
import pickle
pickle_out = open("bathroom_features.pickle","wb")
pickle.dump(bathroom_img_df, pickle_out)
pickle_out.close()

In [20]:
pickle_in = open("features_bathroom.pickle","rb")
example_dict = pickle.load(pickle_in)
example_dict

Unnamed: 0,class,img_name,listingid
0,bathroom,000_0.jpg,0
1,bathroom,111_1.jpg,111
2,bathroom,222_2.jpg,222
3,bathroom,5045779034_13.jpg,5045779034
4,bathroom,5045779034_14.jpg,5045779034
5,bathroom,5067941346_18.jpg,5067941346
6,bathroom,5067962609_22.jpg,5067962609
7,bathroom,5067970149_13.jpg,5067970149
8,bathroom,5068012887_2.jpg,5068012887
9,bathroom,5068018549_13.jpg,5068018549


In [21]:
bathroom_features.shape

(55, 10, 10, 2048)

In [22]:
input_features = extract_features('input',5)

Found 5 images belonging to 1 classes.


In [23]:
input_features.shape

(5, 10, 10, 2048)

In [24]:
bathroom_features = np.reshape(bathroom_features, (55, 10 * 10 * 2048))
input_features = np.reshape(input_features, (5, 10 * 10 * 2048))

In [30]:
bathroom_features

array([[-0., -0., -0., ..., -0., -0., -0.],
       [-0., -0., -0., ..., -0., -0., -0.],
       [-0., -0., -0., ..., -0., -0., -0.],
       ...,
       [-0., -0., -0., ..., -0., -0., -0.],
       [-0., -0., -0., ..., -0., -0., -0.],
       [-0., -0., -0., ..., -0., -0., -0.]])

In [25]:
d = pairwise_distances(bathroom_features,input_features,metric='cosine')

In [26]:
d

array([[8.47072726e-01, 7.53792941e-01, 9.13749316e-01, 7.91699657e-01,
        7.54951657e-15],
       [8.62885108e-01, 7.80699133e-01, 9.19911273e-01, 7.97435125e-01,
        4.16703077e-01],
       [8.59791442e-01, 7.05252431e-01, 8.99519422e-01, 7.67709343e-01,
        3.59195520e-01],
       [8.78383148e-01, 7.54848726e-01, 8.93655333e-01, 7.48982467e-01,
        3.96880871e-01],
       [8.19070215e-01, 7.90854954e-01, 9.08779306e-01, 8.41894211e-01,
        5.91148273e-01],
       [8.82639436e-01, 8.53532681e-01, 8.49601460e-01, 8.78696850e-01,
        8.76196822e-01],
       [8.26800500e-01, 7.87087046e-01, 8.96413520e-01, 7.82986051e-01,
        7.49534279e-01],
       [8.45918905e-01, 8.23765638e-01, 8.02280907e-01, 8.34150541e-01,
        8.88606146e-01],
       [8.35604089e-01, 7.72433414e-01, 8.92920590e-01, 8.18573777e-01,
        7.50939827e-01],
       [8.23539956e-01, 7.48550042e-01, 9.31935469e-01, 7.41453027e-01,
        4.11615920e-01],
       [8.35229241e-01, 6.5998

In [27]:
# df = pd.DataFrame(data=d, columns=["signal"])
# df.sort_values('signal')
df = pd.DataFrame(data=d)
df

Unnamed: 0,0,1,2,3,4
0,0.847073,0.753793,0.913749,0.7917,7.549517e-15
1,0.862885,0.780699,0.919911,0.797435,0.4167031
2,0.859791,0.705252,0.899519,0.767709,0.3591955
3,0.878383,0.754849,0.893655,0.748982,0.3968809
4,0.81907,0.790855,0.908779,0.841894,0.5911483
5,0.882639,0.853533,0.849601,0.878697,0.8761968
6,0.826801,0.787087,0.896414,0.782986,0.7495343
7,0.845919,0.823766,0.802281,0.834151,0.8886061
8,0.835604,0.772433,0.892921,0.818574,0.7509398
9,0.82354,0.74855,0.931935,0.741453,0.4116159


In [29]:
df_merged = df.merge(example_dict, how='inner', left_index=True, right_index=True, left_on=None, right_on=None)
df_merged

Unnamed: 0,0,1,2,3,4,class,img_name,listingid
0,0.847073,0.753793,0.913749,0.7917,7.549517e-15,bathroom,000_0.jpg,0
1,0.862885,0.780699,0.919911,0.797435,0.4167031,bathroom,111_1.jpg,111
2,0.859791,0.705252,0.899519,0.767709,0.3591955,bathroom,222_2.jpg,222
3,0.878383,0.754849,0.893655,0.748982,0.3968809,bathroom,5045779034_13.jpg,5045779034
4,0.81907,0.790855,0.908779,0.841894,0.5911483,bathroom,5045779034_14.jpg,5045779034
5,0.882639,0.853533,0.849601,0.878697,0.8761968,bathroom,5067941346_18.jpg,5067941346
6,0.826801,0.787087,0.896414,0.782986,0.7495343,bathroom,5067962609_22.jpg,5067962609
7,0.845919,0.823766,0.802281,0.834151,0.8886061,bathroom,5067970149_13.jpg,5067970149
8,0.835604,0.772433,0.892921,0.818574,0.7509398,bathroom,5068012887_2.jpg,5068012887
9,0.82354,0.74855,0.931935,0.741453,0.4116159,bathroom,5068018549_13.jpg,5068018549


In [None]:
def 