In [1]:
import pandas as pd
import numpy as np
import os
import pygeohash as pgh
from math import *
pjoin = os.path.join

In [2]:
#data load
datapath = '/Users/yefeichen/Database/location_recommender_system/'
cfile = ['dnb_pa.csv','dnb_sf.csv','dnb_sj.csv']
lfile = 'location_scorecard_190912.csv'

pdc1 = pd.read_csv(pjoin(datapath,cfile[0]))
pdc2 = pd.read_csv(pjoin(datapath,cfile[1]))
pdc3 = pd.read_csv(pjoin(datapath,cfile[2]))

# pdc = pd.concat([pdc1,pdc2,pdc3],axis=0)
pdl = pd.read_csv(pjoin(datapath,lfile))

In [3]:
#city filter
def cityfilter(datComp,datLoc):
    city = datComp.groupby(['physical_city'],as_index=False)['physical_city'].agg({'cnt':'count'})
    print(len(city))
    pdatLoc = pd.merge(datLoc,city,how='inner',left_on = ['city'],right_on=['physical_city'],suffixes=['_loc','_comp'])
    return pdatLoc

def geohash(data,precision=6):
    data['geohash'] = data.apply(lambda row:pgh.encode(row['longitude'],row['latitude'],precision=precision),axis=1)


def geo_distance(lng1,lat1,lng2,lat2):
    lng1, lat1, lng2, lat2 = map(radians, [lng1, lat1, lng2, lat2])
    dlon=lng2-lng1
    dlat=lat2-lat1
    a=sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 
    dis=2*asin(sqrt(a))*6371*1000
    return dis

def duplicateCheck(data,colname:str)->bool:
    R = data.groupby([colname],as_index=False)[colname].agg({'cnt':'count'})
    R = R[R['cnt']>1]
    if len(R)>0:
        print('duplicate detected')
        return False
    else:
        print('not duplicate')
        return True
    
def calcLinkTable(datComp,datLoc,verbose=True):
    if not verbose:
        print('merging...')
    df_cartesian = pd.merge(datComp, datLoc,on = 'geohash', how='outer',suffixes=['_comp','_loc'])
    if not verbose:
        print(list(df_cartesian.columns))
        print(len(df_cartesian))
        print('calc geo dist...')
    df_cartesian['geo_distance']=df_cartesian.apply(lambda row:geo_distance(row['longitude_comp'],row['latitude_comp'],row['longitude_loc'],row['latitude_loc']),axis=1)
    if not verbose:
        print('sort geo dist')
    df_cartesian_min_distance=df_cartesian.sort_values(by="geo_distance").groupby(["duns_number"],as_index=False).first()

    result = df_cartesian_min_distance[['duns_number','atlas_location_uuid','geo_distance']]
    if not verbose:
        duplicateCheck(result , 'atlas_location_uuid')
    return result

def fuzzy_geosearch(datComp,datLoc,precision=[8,7,6,5],thresh=[500,1000,1000,1000]):
    print('Initial company num:',len(datComp))
    datLoc_city = cityfilter(datComp,datLoc)
    print(len(datComp),len(datLoc_city))
    datComp_city = datComp[['duns_number','longitude','latitude']]
    datLoc_city = datLoc_city[['atlas_location_uuid','longitude','latitude']]
    datlist = []
    
    for i,p in enumerate(precision):
        print('level:',p)
        geohash(datComp_city,p)
        geohash(datLoc_city,p)
        linkCL = calcLinkTable(datComp_city,datLoc_city)
        datlist.append(linkCL[linkCL['geo_distance'] <= thresh[i]])
        unmatched = linkCL[linkCL['geo_distance'] > thresh[i]].groupby('duns_number',as_index=False).first()
        datComp_city = pd.merge(datComp_city,unmatched['duns_number'],on='duns_number',how='inner')
        print('datComp_city:',len(datComp_city))
        
    res = pd.concat(datlist,axis=0,ignore_index=True)
    print('Initial company num:',len(datComp), 'vs. Remain company num:', len(res) , 'rate:=', float(len(res))/len(datComp) )
    return res
        

In [4]:
linkCL1 = fuzzy_geosearch(pdc1,pdl)
linkCL1.to_csv(pjoin(datapath,'PA.csv'),index = None, header=True)

linkCL2 = fuzzy_geosearch(pdc2,pdl)
linkCL2.to_csv(pjoin(datapath,'SF.csv'),index = None, header=True)

linkCL3 = fuzzy_geosearch(pdc3,pdl)
linkCL3.to_csv(pjoin(datapath,'SJ.csv'),index = None, header=True)

Initial company num: 7538
1
7538 442
level: 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


datComp_city: 2672
level: 7
datComp_city: 1370
level: 6
datComp_city: 292
level: 5
datComp_city: 102
Initial company num: 7538 vs. Remain company num: 6219 rate:= 0.8250198991775006
Initial company num: 67849
1
67849 1845
level: 8
datComp_city: 22981
level: 7
datComp_city: 8036
level: 6
datComp_city: 2331
level: 5
datComp_city: 2014
Initial company num: 67849 vs. Remain company num: 56490 rate:= 0.8325841206207902
Initial company num: 48377
1
48377 670
level: 8
datComp_city: 20940
level: 7
datComp_city: 13443
level: 6
datComp_city: 5192
level: 5
datComp_city: 4199
Initial company num: 48377 vs. Remain company num: 25040 rate:= 0.5176013394795047
