In [1]:
import pandas as pd
from pymongo import MongoClient


def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

In [2]:
def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [5]:
df = read_mongo(db="propertiesdb", collection="apartment", host='192.168.0.17')

In [6]:
df['point'] = df['address'].apply(lambda v: v.get('point') if isinstance(v, dict) else '')
df['link_url'] = df['link'].apply(lambda v: v.get('href') if isinstance(v, dict) else '')
df['point']

0       {'lon': -47.049274, 'source': 'GOOGLE', 'lat':...
1       {'lon': -47.049899, 'source': 'GOOGLE', 'lat':...
2       {'lon': -47.049899, 'source': 'GOOGLE', 'lat':...
3       {'lon': -47.04951, 'source': 'GOOGLE', 'lat': ...
4       {'lon': -47.049274, 'source': 'GOOGLE', 'lat':...
                              ...                        
5052    {'lon': -47.053892, 'source': 'GOOGLE', 'lat':...
5053    {'lon': -47.050091, 'source': 'GOOGLE', 'lat':...
5054    {'lon': -47.079115, 'source': 'GOOGLE', 'lat':...
5055    {'lon': -47.059443, 'source': 'GOOGLE', 'lat':...
5056    {'lon': -47.049693, 'source': 'GOOGLE', 'lat':...
Name: point, Length: 5057, dtype: object

In [7]:
df['lon'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lon') if isinstance(v, dict) else ''), errors='coerce')
df['lat'] = pd.to_numeric(df['point'].apply(lambda v: v.get('lat') if isinstance(v, dict) else ''), errors='coerce')

df['lat']

0      -22.942955
1      -22.944549
2      -22.944549
3      -22.942379
4      -22.942955
          ...    
5052   -22.882355
5053   -22.939591
5054   -22.893305
5055   -22.893871
5056   -22.852896
Name: lat, Length: 5057, dtype: float64

In [8]:
import numpy as np
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km

In [9]:
lat_default, log_default = (-22.9388, -47.050991)
haversine_np(lat_default, log_default, -22.9388, -47.050991)

0.0

In [10]:
df['dist'] = \
    haversine_np(lat_default, log_default, df['lat'], df['lon'])

df[['dist', "title", "lon", "lat"]].sort_values('dist')

Unnamed: 0,dist,title,lon,lat
525,0.000000,"Apartamento residencial à venda, Parque Prado,...",-47.050991,-22.938800
460,0.000000,"Apartamento residencial à venda, Parque Prado,...",-47.050991,-22.938800
3860,0.000000,"APARTAMENTO RESIDENCIAL em CAMPINAS - SP, PARQ...",-47.050991,-22.938800
581,0.000000,Lindo Apartamento Cond Caapuã Parque Prado,-47.050991,-22.938800
571,0.000000,"Apartamento com 3 dormitórios à venda, 92 m² p...",-47.050991,-22.938800
...,...,...,...,...
3164,3435.257424,Otimo apartamento térreo com Garden 3 dormitór...,-22.937527,-47.061014
1351,3436.673589,"Apartamento com 3 dorms, 1 Suíte, 2 vagas, 103...",-22.938844,-47.088752
1497,3442.204807,Apartamento no Mansões Santo Antonio com 73 me...,-22.853068,-47.043980
3449,,Lindo apartamento oportunidade única,,


In [73]:
df2 = df[(df.dist == 0 )]
df1 = df[(df.id_vivareal == '2462027022')]
df2
#link__href
#df2[['dist', "title", "link_url", "price", "totalAreas"]].to_csv('output.csv', encoding='utf-8', index=False)

Unnamed: 0,displayAddressType,amenities,feedsId,contractType,usableAreas,constructionStatus,videoTourLink,listingType,description,title,...,similares_updated,deliveredAt,updated,logoUrl,price_version,point,link_url,lon,lat,dist
106,NEIGHBORHOOD,"[BARBECUE_GRILL, GATED_COMMUNITY, KITCHEN, GOU...",AP5058,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"Excelente apartamento com 3 suítes, muito bem ...","Apartamento residencial à venda, Parque Prado,...",...,2020-04-04 13:41:09.377763,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
107,NEIGHBORHOOD,"[BARBECUE_GRILL, ELEVATOR, GOURMET_BALCONY, POOL]",AP3852,REAL_ESTATE,[85],ConstructionStatus_NONE,,USED,"Lindo Apartamento no Parque Prado, rico em arm...","Apartamento com 3 dormitórios à venda, 85 m² p...",...,2020-04-04 13:42:22.409985,,2020-04-07 18:04:16.112380,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
197,NEIGHBORHOOD,"[AIR_CONDITIONING, BARBECUE_GRILL, GATED_COMMU...",AP3444,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,Lindo apartamento com excelente acabamento com...,Parque Prado l 119m³ l 3 suítes l 2 vagas,...,2020-04-03 20:28:16.058294,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
261,NEIGHBORHOOD,"[BARBECUE_GRILL, POOL, GOURMET_SPACE, FURNISHE...",AP00427,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"APARTAMENTO para Venda<br>PARQUE PRADO, CAMPIN...","APARTAMENTO RESIDENCIAL em CAMPINAS - SP, PARQ...",...,2020-04-03 21:24:38.006575,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-4-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
303,NEIGHBORHOOD,"[AIR_CONDITIONING, BARBECUE_GRILL, GATED_COMMU...",AP0728,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"Edifício Caapuã | apartamento andar alto, lind...","Apartamento residencial à venda, Parque Prado,...",...,2020-04-03 21:56:39.716394,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
309,NEIGHBORHOOD,"[BARBECUE_GRILL, POOL, GOURMET_SPACE, FURNISHE...",AP00765,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"APARTAMENTO para Venda<br>PARQUE PRADO, CAMPIN...","Caapua, lazer clube ao lado do Shopping Prado",...,2020-04-03 22:01:20.180746,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
324,NEIGHBORHOOD,"[BARBECUE_GRILL, GATED_COMMUNITY, KITCHEN, ELE...",AP7126,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"Lindo Apartamento no Parque Prado, 3 suítes, l...","Apartamento com 3 dormitórios à venda, 111 m² ...",...,2020-04-03 22:11:37.148539,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
330,NEIGHBORHOOD,"[BARBECUE_GRILL, GATED_COMMUNITY, KITCHEN, ELE...",AP8758,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"Belíssimo apartamento , vista para área de pre...","Apartamento com 3 dormitórios à venda, 111 m² ...",...,2020-04-03 22:16:10.394059,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
333,NEIGHBORHOOD,"[BARBECUE_GRILL, GATED_COMMUNITY, KITCHEN, ELE...",AP8987,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"Lindo apartamento 3 suítes, lavabo, sala de es...","Apartamento com 3 dormitórios à venda, 111 m² ...",...,2020-04-03 22:17:56.290702,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0
334,NEIGHBORHOOD,"[AIR_CONDITIONING, BARBECUE_GRILL, GATED_COMMU...",AP4752,REAL_ESTATE,[111],ConstructionStatus_NONE,,USED,"AMPLO APARTAMENTO COM 111 M² DE ÁREA ÚTIL, LOC...","Apartamento residencial à venda, Parque Prado,...",...,2020-04-03 22:19:00.303461,,,,,"{'lon': -47.050991, 'source': 'GOOGLE', 'lat':...",/imovel/apartamento-3-quartos-parque-prado-bai...,-47.050991,-22.9388,0.0


In [103]:
df1i = pd.DataFrame()

for index, row in df1.iterrows():
    arr1 = []
    for m in row['medias']:
        file_name = IMGFOLDER + m['new_url']
        img = PIL.Image.open(file_name)
        img_hash = imagehash.average_hash(img)
        arr1.append(img_hash)
        
    df1i = df1i.append(pd.DataFrame({'id_vivareal': row['id_vivareal'], 'img': arr1}))

df1i



Unnamed: 0,id_vivareal,img
0,2462027022,0303c387f7f3470f
1,2462027022,5f9f5b0303901014
2,2462027022,007070e0fc7e7c20
3,2462027022,1f1f1f1301406070
4,2462027022,0c0ee7321a08707d
5,2462027022,ffef02000028f06c
6,2462027022,607c7c6c60e0e0c0
7,2462027022,8bc3838383838383
8,2462027022,d8c1c058fcf8f8f8
9,2462027022,313171d19f7f3f3f


In [106]:
df2i = pd.DataFrame()

for index, row in df2.iterrows():
    arr1 = []
    for m in row['medias']:
        file_name = IMGFOLDER + m['new_url']
        img = PIL.Image.open(file_name)
        img_hash = imagehash.average_hash(img)
        arr1.append(img_hash)
        
    df2i = df1i.append(pd.DataFrame({'id_vivareal': row['id_vivareal'], 'img': arr1}))

df2i

Unnamed: 0,id_vivareal,img
0,2462027022,0303c387f7f3470f
1,2462027022,5f9f5b0303901014
2,2462027022,007070e0fc7e7c20
3,2462027022,1f1f1f1301406070
4,2462027022,0c0ee7321a08707d
5,2462027022,ffef02000028f06c
6,2462027022,607c7c6c60e0e0c0
7,2462027022,8bc3838383838383
8,2462027022,d8c1c058fcf8f8f8
9,2462027022,313171d19f7f3f3f


51

In [81]:
import requests
import shutil
import os
import datetime
import PIL
import imagehash
from os import listdir
from os.path import isfile, join

In [77]:
CUTOFF = 5

IMGFOLDER = '/home/dzanardo/github/apartamento/images_remote/'
arr1 = []
arr2 = []

for index, row in df1.iterrows():
    count_similar = 0
    for m in row['medias']:
        file_name = IMGFOLDER + m['new_url']
        img = PIL.Image.open(file_name)
        img_hash = imagehash.average_hash(img)
        arr1.append(img_hash)
        
for index2, row2 in df2.iterrows():
    for m2 in row2['medias']:
        file_name2 = IMGFOLDER + m2['new_url']
        img2 = PIL.Image.open(file_name2)
        img_hash2 = imagehash.average_hash(img2)
        arr2.append(img_hash2)
            


In [79]:
for ar1 in arr1:
    for ar2 in arr2:
        a = ar1 - ar2 < CUTOFF
