In [1]:
# Generate affinity score between a company and location by unsupervised learning.
# Each building is represented by the features of companies inside it.
# It just like the task of face recognition, we take several photoes for each person and use these photoes to represent that person.


In [1]:
#describe each location with companies in side
import pandas as pd
import numpy as np
import os
import pygeohash as pgh
from math import *
from sklearn.metrics.pairwise import euclidean_distances
import random
import matplotlib.pyplot as plt
pjoin = os.path.join

In [2]:
#function_base
def splitdat(dat,key_column=['duns_number'],right_colunm='atlas_location_uuid_tr',rate_tr=0.8):
    """
    split the <company,location> pair into training/testing dat
    """
    tr = dat.sample(frac=rate_tr)
    tt = pd.merge(dat,tr,on=key_column,how='left',suffixes=['','_tr'])
    tt = tt[tt[right_colunm].isnull()]
    tt = tt[list(tr.columns)]
    print('Train dat:', len(tr), 'Test dat:', len(tt))
    return tr,tt

#data process
def onehotdat(dat,key_column:list):
    dat[key_column] = dat[key_column].astype(str)
    dum_dat = pd.get_dummies(dat[key_column],dummy_na=True)
    return dum_dat

def split2num(emp_range:str):
    max_emp_val = emp_range.replace(' ','').split('-')
    if len(max_emp_val)<2:
        return 10
    else:
        return float(max_emp_val[1])
    
def max_col(dat,col,minval=1):
    dat[col] = dat[col].apply(lambda r:max(r,minval))

def comp_dat_process(dat):
    """
    pd -> company key,cont_feature,spec_feature,dum_feature
    """
    one_hot_col_name = ['major_industry_category','location_type','primary_sic_2_digit']
    spec_col_name = 'emp_here_range'
    cont_col_name = ['emp_here','emp_total','sales_volume_us','square_footage']

    print('doing one-hot...')
    dum_dat = onehotdat(dat,one_hot_col_name)
    
    print('extract continuous...')
    cont_dat = dat[cont_col_name].fillna(value=0).astype(float)
    
    print('specific feature')
    spec_dat = dat[spec_col_name].fillna(value='1-10').astype(str)
    spec_dat = spec_dat.apply(lambda row: split2num(row))
    
    max_col(cont_dat,'emp_here',1)
    
    res_dat = dat[['duns_number']].join([cont_dat,spec_dat,dum_dat],how='left')
    assert(len(res_dat)==len(dum_dat))
    assert(len(res_dat)==len(cont_dat))
    assert(len(res_dat)==len(spec_dat))
    return res_dat

def comp_transpd2np(featdat,trdat,ttdat,not_col_name):
    tr_feat = pd.merge(trdat,featdat,on='duns_number',how='inner')
#     print(col_list)
    col_list = [ n for n in list(tr_feat.columns) if n not in not_col_name ] 
    trainX = tr_feat.loc[:,col_list].to_numpy()
    trainY = tr_feat[['atlas_location_uuid','longitude_loc','latitude_loc']].to_numpy()
    
    tt_feat = pd.merge(ttdat,featdat,on='duns_number',how='inner')
    col_list = [ n for n in list(tt_feat.columns) if n not in not_col_name ] 
#     print(col_list)
    testX = tt_feat.loc[:,col_list].to_numpy()
    testY = tt_feat[['atlas_location_uuid','longitude_loc','latitude_loc']].to_numpy()
    return trainX,trainY,testX,testY

def normalize_dat(trX,ttX,cols=5,axis=0):
    D = trX[:,:cols]
    center = D.mean(axis=axis)
    center = np.expand_dims(center,axis)
    scale = D.std(axis=axis)
    scale = np.expand_dims(scale,axis)
    
    trX[:,:cols] = (D-center)/scale
    ttX[:,:cols] = (ttX[:,:cols]-center)/scale
    
def normalize_dat_single(X,cols=5,axis=0):
    D = X[:,:cols]
    center = D.mean(axis=axis)
    center = np.expand_dims(center,axis)
    scale = D.std(axis=axis)
    scale = np.expand_dims(scale,axis)
    
    X[:,:cols] = (D-center)/scale
    return X,center,scale
    
def calc_topk_acc_v2(QRscore,y_truth_cat,R_cat,k=3):
    """
    QRscore: similarity score matrix shape [Q,R]
    y_truth: index(related with R) of truth label of Query
    """
    y_truth_cat = y_truth_cat.reshape(-1,1)
    max_k_preds = QRscore.argsort(axis=1)[:, -k:][:, ::-1] #得到top-k max label
    max_k_cat = R_cat[max_k_preds]
    match_array = np.logical_or.reduce(max_k_cat==y_truth_cat, axis=1) #得到匹配结果
    topk_acc_score = match_array.sum() / match_array.shape[0]
    return topk_acc_score

def calc_topk_acc_cat_all(QRscore,y_truth_cat,R_cat,k=3):
    """
    QRscore: similarity score matrix shape [Q,R]
    y_truth: index(related with R) of truth label of Query
    return top1-topk acc
    """
    res = []
    y_truth_cat = y_truth_cat.reshape(-1,1)
    max_k_preds = QRscore.argsort(axis=1)[:, -k:][:, ::-1] #得到top-k max label
    max_k_cat = R_cat[max_k_preds]
    M = max_k_cat==y_truth_cat
    for k in range(M.shape[1]):
        match_array = np.logical_or.reduce(M[:,:k+1], axis=1) #得到匹配结果
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res.append(topk_acc_score)
    return res

In [3]:
#data load
datapath = '/Users/yefeichen/Database/location_recommender_system/'
cfile = ['dnb_pa.csv','dnb_sf.csv','dnb_sj.csv']
lfile = 'location_scorecard_190912.csv'
clfile = ['PA.csv','SF.csv','SJ.csv']
savefile = ['PA_comp_loc_score.csv','SF_comp_loc_score.csv','SJ_comp_loc_score.csv']

ind_city = 0

pdc = pd.read_csv(pjoin(datapath,cfile[ind_city]))
pdl = pd.read_csv(pjoin(datapath,lfile))
pdcl = pd.read_csv(pjoin(datapath,clfile[ind_city]))

In [39]:
#generate cross feature of company and location for building
def comp_transpd2np_single(featdat,trdat,not_col_name):
    tr_feat = pd.merge(trdat,featdat,on='duns_number',how='inner')
    col_list = [ n for n in list(tr_feat.columns) if n not in not_col_name ] 
    X = tr_feat.loc[:,col_list].to_numpy()
    Y_comp = tr_feat[['duns_number']].to_numpy()
    Y_loc = tr_feat[['atlas_location_uuid']].to_numpy()
    
    return X,Y_comp,Y_loc

not_col_name = ['duns_number','atlas_location_uuid','geo_distance','longitude_loc','latitude_loc']

location_cnt = pdcl.groupby('atlas_location_uuid').first()
comp_num = pdcl.shape[0]
loc_num = len(location_cnt)
print('company number=%d, location number=%d'%(comp_num,loc_num))

print('dummy and get data feat')
proc_pdc = comp_dat_process(pdc)
print('transfer 2 numpy')
X,Y_comp,Y_loc = comp_transpd2np_single(proc_pdc,pdcl,not_col_name=not_col_name)
X,_,_ = normalize_dat_single(X,cols=5)
print('Y_comp_number =', Y_comp.shape)
print(X.shape)

print('data reformat')
# Y_comp_Y = np.tile(Y_comp,(1,Y_comp.shape[0])).reshape(-1,1)
# Y_loc_Y = np.tile(Y_loc,(1,Y_loc.shape[0])).transpose().reshape(-1,1)
# Y_comp = Y_comp.reshape(-1)


# assert(comp_num*comp_num==len(M))
resM = None
step = comp_num
n = 100
N = int(comp_num/n) + 1

for k in range(N):
    if k%10==1:
        print('percentage:%0.2f'%(k/N))
    inds = k*n
    inde = min((k+1)*n,comp_num)
    
    if inds >= comp_num:
        break
        
#     print('cal cross distance')
    distQR = euclidean_distances(X[inds:inde,:],X)
    distQR = distQR.reshape(-1,1)
#     print('data merge')
    Y_comp_tmp = Y_comp[inds:inde,:]
    #those things are very confusing. after all we struggled out.
    Y_comp_Y = np.tile(Y_comp_tmp,(1,Y_comp.shape[0])).reshape(-1,1)
    Y_loc_Y = np.tile(Y_loc,(1,Y_comp_tmp.shape[0])).transpose().reshape(-1,1)
    assert(Y_loc_Y.shape[0]==distQR.shape[0])
    assert(Y_comp_Y.shape[0]==distQR.shape[0])
    M = np.concatenate([Y_comp_Y,Y_loc_Y,distQR],axis=1)
    
#     print('create data')
    pdM = pd.DataFrame(data=M,columns=['duns_number','atlas_location_uuid','sim_score'])
    pdM['sim_score_new'] = pdM.sim_score.apply(lambda r: 1e6 if r<0.05 else r)

    
    if resM is None:
        resM = pdM.groupby(['duns_number','atlas_location_uuid'])[['sim_score_new']].min()
    else:
        crossFeatM = pdM.groupby(['duns_number','atlas_location_uuid'])[['sim_score_new']].min()
        resM = pd.concat([resM,crossFeatM],axis=0)
        
assert(comp_num*loc_num==resM.shape[0])

# resM.to_csv(savefile[ind_city])

company number=6219, location number=352
dummy and get data feat
doing one-hot...
extract continuous...
specific feature
transfer 2 numpy
Y_comp_number = (6219, 1)
(6219, 92)
data reformat


AttributeError: 'NoneType' object has no attribute 'shape'

In [5]:
resK = resM.groupby(['duns_number','atlas_location_uuid']).first()

In [11]:
len(resK),len(resM),comp_num*loc_num

(1029.0170454545455, 362214, 2189088)

In [17]:
k=1
inds = k*n
inde = min((k+1)*n,comp_num)
Y_comp_tmp = Y_comp[inds:inde,:]
Y_comp_Y = np.tile(Y_comp_tmp,(1,Y_comp.shape[0])).reshape(-1,1)

In [13]:
Y_comp.shape[0]

6219

In [17]:
Y_loc.shape

(6219, 1)

In [15]:
Y_comp_tmp.shape

(19, 1)

In [22]:
pdM

Unnamed: 0,duns_number,atlas_location_uuid,sim_score,sim_score_new
0,2.0359e+06,77c493ec-1424-6d74-8db3-ee8fce0092db,0,1000000.000000
1,2.0359e+06,77c493ec-1424-6d74-8db3-ee8fce0092db,1.41475,1.414752
2,2.0359e+06,7a687b49-1055-24f6-ae84-e390438fe3c4,1.41584,1.415839
3,2.0359e+06,d880cdef-04ae-7698-0499-5123172f6033,2.00029,2.000286
4,2.0359e+06,a8c3d177-3ca7-ab2a-f8cb-44b118ae8df4,1.41468,1.414679
5,2.0359e+06,5c80985b-d40b-8ce7-92f8-c786bb23fe54,2.47413,2.474129
6,2.0359e+06,d04d42cd-f145-79cd-294e-5079e0745411,1.42395,1.423948
7,2.0359e+06,7982ff34-35e3-cc76-e15a-9416f4306f63,1.41587,1.415867
8,2.0359e+06,7982ff34-35e3-cc76-e15a-9416f4306f63,1.41447,1.414474
9,2.0359e+06,b89c3096-5ebc-7491-2de8-79f22da62dbf,2.00251,2.002506


In [23]:
621900/6219

100.0

In [40]:
v = pdM.groupby(['duns_number','atlas_location_uuid'])[['sim_score_new']].min()

In [41]:
len(v)

35200

In [34]:
Y_loc_Y = np.tile(Y_loc,(1,Y_comp_tmp.shape[0])).transpose().reshape(-1,1)

In [37]:
z = np.tile(Y_loc,(1,Y_comp_tmp.shape[0]))
z.shape

(6219, 100)

In [38]:
Y_loc_Y

array([['77c493ec-1424-6d74-8db3-ee8fce0092db'],
       ['77c493ec-1424-6d74-8db3-ee8fce0092db'],
       ['7a687b49-1055-24f6-ae84-e390438fe3c4'],
       ...,
       ['c5b270a7-6368-6e77-6039-618cc709f3a6'],
       ['a0e26081-e098-a151-67d3-9b7194c5e467'],
       ['a0e26081-e098-a151-67d3-9b7194c5e467']], dtype=object)