In [1]:
import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

In [2]:
def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [3]:
ids = ['75007517', '2464774082', '1041254421', '2476424816', '2443225876', '2476740635', '2454100925', '2460138077', '2476904013', '1038740426', '2472568298']
#df = read_mongo(db="propertiesdb", collection="apartment", host='localhost', query={"id_vivareal" : {"$in" : ids}})
df = read_mongo(db="propertiesdb", collection="apartment", host='localhost')

In [4]:
df['point'] = df['address'].apply(lambda v: v.get('point') if isinstance(v, dict) else '')
df['link_url'] = df['link'].apply(lambda v: v.get('href') if isinstance(v, dict) else '')
df['link_url'] = 'https://www.vivareal.com.br' + df['link_url']

In [5]:
df['lon'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lon') if isinstance(v, dict) else ''), errors='coerce')
df['lat'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lat') if isinstance(v, dict) else ''), errors='coerce')

In [6]:
import numpy as np
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [7]:
item = df[(df.id_vivareal == '2454100925')]
lat_default, log_default = item['lat'].values[0], item['lon'].values[0]
haversine_np(lat_default, log_default, -22.9388, -47.050991)

0.0

In [8]:
df['dist'] = haversine_np(lat_default, log_default, df['lat'], df['lon'])

df[['dist', "id_vivareal", "title", "lon", "lat"]].sort_values('dist')

Unnamed: 0,dist,id_vivareal,title,lon,lat
98,0.000000,2454308977,apartamento - Parque Prado - Campinas,-47.050991,-22.938800
49,0.000000,2029235481,apartamento - Parque Prado - Campinas,-47.050991,-22.938800
167,0.000000,2433721919,Lindo apartamento no bairro Vista Prado,-47.050991,-22.938800
696,0.000000,2469921496,apartamento - Parque Prado - Campinas,-47.050991,-22.938800
363,0.000000,2476944607,Apartamento para venda Eco Life/Parque Parado/...,-47.050991,-22.938800
...,...,...,...,...,...
974,7.706035,2471465375,Apartamento para venda com 76 metros quadrados...,-47.083569,-22.848926
243,3434.018232,2441260297,Excelente apartamento térreo disponível para v...,-22.944882,-47.051056
201,3434.018232,2439522014,Excelente Apartamento Terreo a venda tem 80 m2...,-22.944882,-47.051056
284,3435.257424,2440708289,Otimo apartamento térreo com Garden varanda Go...,-22.937527,-47.061014


In [9]:
df_filtered = df[(df.dist <= 1)].copy()

len(df_filtered)

884

In [12]:
import imagehash
import PIL

IMGFOLDER = '/home/dzanardo/github/apartamento/images/'

def hash_image(file):
    print('-------------')
    print(file)
    file_name = IMGFOLDER + file
    img = PIL.Image.open(file_name)
    return imagehash.average_hash(img)

In [14]:
import PIL
import imagehash
import timeit

IMGFOLDER = '/home/dzanardo/github/apartamento/images/'
part1 = 0
part2 = 0
count = 0

images = {}

for index, row in df_filtered.iterrows():
    for m in row['images']:
        try:
            start = timeit.default_timer()

            file_name = IMGFOLDER + m
            img = PIL.Image.open(file_name)

            stop = timeit.default_timer()
            part1 += stop - start


            start = timeit.default_timer()

            img_hash = imagehash.average_hash(img)
            images[m] = img_hash

            stop = timeit.default_timer()
            part2 += stop - start

            count += 1
        except:
            pass

print(count, part1, part2)

21499 2.6070275623351336 48.917305582610425


In [16]:
CUTOFF = 5

for index, row in df_filtered.iterrows():
    count_s = 0
    count_medias = 0
    for m in row['images']:
        if m in images:
            count_medias += 1
            for im in item['images'].values[0]:
                 if im in images:           
                    if (images[m] - images[im]) < CUTOFF:
                        count_s += 1
                        break
                
    df_filtered.loc[index,'count_similar'] = count_s
    df_filtered.loc[index,'count_medias'] = count_medias
    

In [17]:
df_filtered2 = df_filtered[(df_filtered.count_similar >1)]
df_filtered2 = df_filtered2.sort_values(by=['count_similar'], ascending=False)

In [19]:
cols = ['id_vivareal', 'link_url', 'count_similar', 'count_medias', 'createdAt', 'portal', 
        'updatedAt', 'address', 'totalAreas', 'status', 'price',
         'updated', 'lon', 'lat', 'dist']

import json

print (json.dumps(df_filtered2[(cols)].to_dict(orient='list')))
# df_filtered2[(cols)].to_csv('similares.csv', sep=',', encoding='utf-8')
# df_filtered2[(cols)].to_json()

{"id_vivareal": ["2454100925", "2449863607", "2473269978", "1810755482", "2460138077", "2457273721", "2475950575", "2475918222", "1042656352", "84521826", "1041448052", "2439114591", "2462507006", "1038662147", "2454308977", "2467957146", "2479060387", "1473825520", "84580675", "2472027085"], "link_url": ["https://www.vivareal.com.br/imovel/apartamento-3-quartos-parque-prado-bairros-campinas-com-garagem-92m2-venda-RS545000-id-2454100925/", "https://www.vivareal.com.br/imovel/apartamento-3-quartos-parque-prado-bairros-campinas-com-garagem-92m2-venda-RS580000-id-2449863607/", "https://www.vivareal.com.br/imovel/apartamento-3-quartos-parque-prado-bairros-campinas-com-garagem-92m2-venda-RS590000-id-2473269978/", "https://www.vivareal.com.br/imovel/apartamento-3-quartos-jardim-nova-europa-bairros-campinas-com-garagem-92m2-venda-RS585000-id-1810755482/", "https://www.vivareal.com.br/imovel/apartamento-3-quartos-parque-prado-bairros-campinas-com-garagem-92m2-venda-RS545000-id-2460138077/", "h