In [None]:
import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

In [80]:
def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [81]:
ids = ['75007517', '2464774082', '1041254421', '2476424816', '2443225876', '2476740635', '2454100925', '2460138077', '2476904013', '1038740426', '2472568298']
#df = read_mongo(db="propertiesdb", collection="apartment", host='localhost', query={"id_vivareal" : {"$in" : ids}})
df = read_mongo(db="propertiesdb", collection="apartment", host='localhost')

In [82]:
df['point'] = df['address'].apply(lambda v: v.get('point') if isinstance(v, dict) else '')
df['link_url'] = df['link'].apply(lambda v: v.get('href') if isinstance(v, dict) else '')
df['link_url'] = 'https://www.vivareal.com.br' + df['link_url']

In [83]:
df['lon'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lon') if isinstance(v, dict) else ''), errors='coerce')
df['lat'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lat') if isinstance(v, dict) else ''), errors='coerce')

In [84]:
import numpy as np
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [85]:
item = df[(df.id_vivareal == '2454100925')]
lat_default, log_default = item['lat'].values[0], item['lon'].values[0]
haversine_np(lat_default, log_default, -22.9388, -47.050991)

0.0

In [86]:
df['dist'] = haversine_np(lat_default, log_default, df['lat'], df['lon'])

df[['dist', "id_vivareal", "title", "lon", "lat"]].sort_values('dist')

Unnamed: 0,dist,id_vivareal,title,lon,lat
98,0.000000,2454308977,apartamento - Parque Prado - Campinas,-47.050991,-22.938800
841,0.000000,2467958369,"Apartamento com 3 dormitórios à venda, 87 m² p...",-47.050991,-22.938800
170,0.000000,2462027022,Excelente Apartamento Parque Prado,-47.050991,-22.938800
833,0.000000,2475423197,apartamento - Parque Prado - Campinas,-47.050991,-22.938800
167,0.000000,2433721919,Lindo apartamento no bairro Vista Prado,-47.050991,-22.938800
...,...,...,...,...,...
457,7.291257,2467960707,Apartamento Resort Brisa da Mata - 3 dormitóri...,-47.002649,-23.003882
243,3434.018232,2441260297,Excelente apartamento térreo disponível para v...,-22.944882,-47.051056
201,3434.018232,2439522014,Excelente Apartamento Terreo a venda tem 80 m2...,-22.944882,-47.051056
289,3435.257424,2440710174,Otimo apartamento térreo com Garden 3 dormitór...,-22.937527,-47.061014


In [89]:
df_filtered = df[(df.dist <= 1)].copy()

len(df_filtered)

884

In [93]:
import imagehash
import PIL

IMGFOLDER = '/home/dzanardo/github/apartamento/images/'

def hash_image(file):
    print('-------------')
    print(file)
    file_name = IMGFOLDER + file
    img = PIL.Image.open(file_name)
    return imagehash.average_hash(img)

In [None]:
df_filtered['images_hash'] = [print(type(img)) for img in (imgs for imgs in df['images'])]

In [45]:
import PIL
import imagehash
import timeit

IMGFOLDER = '/home/dzanardo/github/apartamento/images/'
part1 = 0
part2 = 0
count = 0

images = {}

for index, row in df_filtered.iterrows():
    for m in row['medias']:
        try:
            start = timeit.default_timer()

            file_name = IMGFOLDER + m['new_url']
            img = PIL.Image.open(file_name)

            stop = timeit.default_timer()
            part1 += stop - start


            start = timeit.default_timer()

            img_hash = imagehash.average_hash(img)
            images[m['new_url']] = img_hash

            stop = timeit.default_timer()
            part2 += stop - start

            count += 1
        except:
            pass

print(count, part1, part2)

27113 2.962491199024953 62.29410781832121


In [42]:
CUTOFF = 5

for index, row in df_filtered.iterrows():
    count_s = 0
    count_medias = 0
    for m in row['medias']:
        if m['new_url'] in images:
            count_medias += 1
            for im in item['medias'].values[0]:
                 if im['new_url'] in images:           
                    if (images[m['new_url']] - images[im['new_url']]) < CUTOFF:
                        count_s += 1
                        break
                
    df_filtered.loc[index,'count_similar'] = count_s
    df_filtered.loc[index,'count_medias'] = count_medias
    

In [43]:
df_filtered2 = df_filtered[(df_filtered.count_similar >1)]
df_filtered2 = df_filtered2.sort_values(by=['count_similar'], ascending=False)

In [44]:
cols = ['id_vivareal', 'link_url', 'count_similar', 'count_medias', 'createdAt', 'portal', 
        'updatedAt', 'address', 'totalAreas', 'status', 'price',
         'updated', 'lon', 'lat', 'dist']

df_filtered2[(cols)].to_csv('similares.csv', sep=',', encoding='utf-8')