In [1]:
#describe each location with companies in side
import pandas as pd
import numpy as np
import os
import pygeohash as pgh
from math import *
from sklearn.metrics.pairwise import euclidean_distances
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from matplotlib import pylab
from sklearn.preprocessing import normalize
pjoin = os.path.join

In [3]:
#function_base
def getPosNegdat(dat):
    """
    dat: pos pair of data (location,company,geo,distance)
    return pos/neg pair of data, same structure of dat except one more column for label
    """
    shuffle_dat = dat.sample(frac=1).reset_index(drop=True)

    # shuffle_dat.head()

    twin_dat = dat.join(shuffle_dat,how='left',lsuffix='_left',rsuffix='_right')
    twin_dat = twin_dat[twin_dat['atlas_location_uuid_left'] != twin_dat['atlas_location_uuid_right']]
    print(len(twin_dat))
    twin_dat.head()

    neg_datA = twin_dat[['duns_number_left','atlas_location_uuid_right','longitude_loc_right','latitude_loc_right']]
    neg_datA = neg_datA.rename(columns={'duns_number_left':'duns_number','atlas_location_uuid_right':'atlas_location_uuid','longitude_loc_right':'longitude_loc','latitude_loc_right':'latitude_loc'})

    neg_datB = twin_dat[['duns_number_right','atlas_location_uuid_left','longitude_loc_left','latitude_loc_left']]
    neg_datB = neg_datB.rename(columns={'duns_number_right':'duns_number','atlas_location_uuid_left':'atlas_location_uuid','longitude_loc_left':'longitude_loc','latitude_loc_left':'latitude_loc'})

    neg_dat = pd.concat([neg_datA,neg_datB],axis=0)
    neg_dat['label'] = 0
    dat['label'] = 1
    res_dat = pd.concat([dat[['duns_number','atlas_location_uuid','longitude_loc','latitude_loc','label']],neg_dat],axis=0)
    print('Neg dat num:',len(neg_dat),';Pos dat num:',len(dat))
    return res_dat

def splitdat(dat,key_column=['duns_number'],right_colunm='atlas_location_uuid_tr',rate_tr=0.8):
    """
    split the <company,location> pair into training/testing dat
    """
    tr = dat.sample(frac=rate_tr)
    tt = pd.merge(dat,tr,on=key_column,how='left',suffixes=['','_tr'])
    tt = tt[tt[right_colunm].isnull()]
    tt = tt[list(tr.columns)]
    print('Train dat:', len(tr), 'Test dat:', len(tt))
    return tr,tt

#data process
def onehotdat(dat,key_column:list,dummy_na=True):
    dat[key_column] = dat[key_column].astype(str)
    dum_dat = pd.get_dummies(dat[key_column],dummy_na=dummy_na)#it has nan itself
    return dum_dat

def split2num(emp_range:str):
    max_emp_val = emp_range.replace(' ','').split('-')
    if len(max_emp_val)<2:
        return 10
    else:
        return float(max_emp_val[1])
    
def max_col(dat,col,minval=1):
    dat[col] = dat[col].apply(lambda r:max(r,minval))

def comp_dat_process(dat):
    """
    pd -> company key,cont_feature,spec_feature,dum_feature
    """
    one_hot_col_name = ['major_industry_category','location_type','primary_sic_2_digit']
    spec_col_name = 'emp_here_range'
    cont_col_name = ['emp_here','emp_total','sales_volume_us','square_footage']

    print('doing one-hot...')
    dum_dat = onehotdat(dat,one_hot_col_name)
    
    print('extract continuous...')
    cont_dat = dat[cont_col_name].fillna(value=0).astype(float)
    
    print('specific feature')
    spec_dat = dat[spec_col_name].fillna(value='1-10').astype(str)
    spec_dat = spec_dat.apply(lambda row: split2num(row))
    
    max_col(cont_dat,'emp_here',1)
    
    res_dat = dat[['duns_number']].join([cont_dat,spec_dat,dum_dat],how='left')
    assert(len(res_dat)==len(dum_dat))
    assert(len(res_dat)==len(cont_dat))
    assert(len(res_dat)==len(spec_dat))
    return res_dat

def location_dat_process(dat):
    """
    pd -> location key,cont_feature,dum_feature
    """
    one_hot_col_name = ['building_class']
    cont_col_name = ['score_predicted_eo','score_employer','num_emp_weworkcore','num_poi_weworkcore',
                     'pct_wwcore_employee','pct_wwcore_business','num_retail_stores','num_doctor_offices',
                     'num_eating_places','num_drinking_places','num_hotels','num_fitness_gyms',
                     'population_density','pct_female_population','median_age','income_per_capita',
                     'pct_masters_degree','walk_score','bike_score']

    print('doing one-hot...')
    dum_dat = onehotdat(dat,one_hot_col_name,False)
    print(len(dum_dat))
    
    print('extract continuous...')
    cont_dat = dat[cont_col_name].fillna(value=0).astype(float)
    print(len(cont_dat))
    
    res_dat = dat[['atlas_location_uuid']].join([cont_dat,dum_dat],how='left')
    print(len(res_dat))
    assert(len(res_dat)==len(dum_dat))
    assert(len(res_dat)==len(cont_dat))
    return {'data':res_dat,
            'cont_feat_num':len(list(cont_dat.columns)),
            'dum_feat_num':len(list(dum_dat.columns))}

def comp_transpd2np(featdat,trdat,ttdat,not_col_name):
    tr_feat = pd.merge(trdat,featdat,on='duns_number',how='inner')
#     print(col_list)
    col_list = [ n for n in list(tr_feat.columns) if n not in not_col_name ] 
    trainX = tr_feat.loc[:,col_list].to_numpy()
    trainY = tr_feat[['atlas_location_uuid','longitude_loc','latitude_loc']].to_numpy()
    
    tt_feat = pd.merge(ttdat,featdat,on='duns_number',how='inner')
    col_list = [ n for n in list(tt_feat.columns) if n not in not_col_name ] 
#     print(col_list)
    testX = tt_feat.loc[:,col_list].to_numpy()
    testY = tt_feat[['atlas_location_uuid','longitude_loc','latitude_loc']].to_numpy()
    return trainX,trainY,testX,testY

def transpd2np(featdatC,featdatL,pairdat,cont_col_nameC,cont_col_nameL,not_feat_col):
    tr_feat = pd.merge(pairdat,featdatC,on='duns_number',how='inner')
    XCC = tr_feat.loc[:,cont_col_nameC].to_numpy()
    out_col = []
    out_col.extend(not_feat_col)
    out_col.extend(cont_col_nameC)
    dum_col_nameC = [col for col in list(tr_feat.columns) if col not in out_col]
    XDC = tr_feat.loc[:,dum_col_nameC].to_numpy()

    tr_feat = pd.merge(pairdat,featdatL,on='atlas_location_uuid',how='inner')
    XCL = tr_feat.loc[:,cont_col_nameL].to_numpy()
    out_col = []
    out_col.extend(not_feat_col)
    out_col.extend(cont_col_nameL)
    dum_col_nameL = [col for col in list(tr_feat.columns) if col not in out_col]
    XDL = tr_feat.loc[:,dum_col_nameL].to_numpy()

    Y = pairdat[['label']].to_numpy()
    return XCC,XDC,XCL,XDL,Y

def transpd2np_train_test(featdatC,featdatL,trdat,ttdat):
    not_feat_col = ['duns_number',
                     'atlas_location_uuid',
                     'longitude_loc',
                     'latitude_loc',
                     'label']
    cont_col_nameC = ['emp_here','emp_total','sales_volume_us','square_footage','emp_here_range']
    cont_col_nameL = ['score_predicted_eo','score_employer','num_emp_weworkcore','num_poi_weworkcore',
                     'pct_wwcore_employee','pct_wwcore_business','num_retail_stores','num_doctor_offices',
                     'num_eating_places','num_drinking_places','num_hotels','num_fitness_gyms',
                     'population_density','pct_female_population','median_age','income_per_capita',
                     'pct_masters_degree','walk_score','bike_score']
    trXCC,trXDC,trXCL,trXDL,trY = transpd2np(featdatC,featdatL,trdat,cont_col_nameC,cont_col_nameL,not_feat_col)
    ttXCC,ttXDC,ttXCL,ttXDL,ttY = transpd2np(featdatC,featdatL,ttdat,cont_col_nameC,cont_col_nameL,not_feat_col)
    
    trXC = np.concatenate([trXCC,trXCL],axis=1)
    trXD = np.concatenate([trXDC,trXDL],axis=1)
    ttXC = np.concatenate([ttXCC,ttXCL],axis=1)
    ttXD = np.concatenate([ttXDC,ttXDL],axis=1)
#     trXC = 1.0*trXCC
#     trXD = 1.0*trXDC
#     ttXC = 1.0*ttXCC
#     ttXD = 1.0*ttXDC
    del trXCC,trXDC,trXCL,trXDL,ttXCC,ttXDC,ttXCL,ttXDL
    return trXC,trXD,ttXC,ttXD,trY,ttY

def transpdfeat_w_pair(featdat,pairdat,key_col,not_col_name):
    tr_feat = pd.merge(pairdat,featdat,on=key_col,how='inner').fillna(0)
    feat_col_name = [col for col in list(tr_feat.columns) if col not in not_col_name]
    X = tr_feat.loc[:,feat_col_name].to_numpy()
    return X

def normalize_dat_v2(trX,ttX,axis=0):
    center = trX.mean(axis=axis)
    center = np.expand_dims(center,axis)
    scale = trX.std(axis=axis)
    scale = np.expand_dims(scale,axis)
    
    trX = (trX-center)/scale
    ttX = (ttX-center)/scale
    return trX,ttX

def get_para_normalize_dat(trX,axis=0):
    center = trX.mean(axis=axis)
    scale = trX.std(axis=axis)
    scale += 1e-4
    return center,scale

def apply_para_normalize_dat(X,center,scale,axis=0):
    center = np.expand_dims(center,axis)
    scale = np.expand_dims(scale,axis)
    X = (X-center)/scale
    return X

def normalize_dat(trX,ttX,cols=5,axis=0):
    D = trX[:,:cols]
    center = D.mean(axis=axis)
    center = np.expand_dims(center,axis)
    scale = D.std(axis=axis)
    scale = np.expand_dims(scale,axis)
    
    trX[:,:cols] = (D-center)/scale
    ttX[:,:cols] = (ttX[:,:cols]-center)/scale
    
def calc_topk_acc_v2(QRscore,y_truth_cat,R_cat,k=3):
    """
    QRscore: similarity score matrix shape [Q,R]
    y_truth: index(related with R) of truth label of Query
    """
    y_truth_cat = y_truth_cat.reshape(-1,1)
    max_k_preds = QRscore.argsort(axis=1)[:, -k:][:, ::-1] #得到top-k max label
    max_k_cat = R_cat[max_k_preds]
    match_array = np.logical_or.reduce(max_k_cat==y_truth_cat, axis=1) #得到匹配结果
    topk_acc_score = match_array.sum() / match_array.shape[0]
    return topk_acc_score

def calc_topk_acc_cat_all(QRscore,y_truth_cat,R_cat,k=3):
    """
    QRscore: similarity score matrix shape [Q,R]
    y_truth: index(related with R) of truth label of Query
    return top1-topk acc
    """
    res = []
    y_truth_cat = y_truth_cat.reshape(-1,1)
    max_k_preds = QRscore.argsort(axis=1)[:, -k:][:, ::-1] #得到top-k max label
    max_k_cat = R_cat[max_k_preds]
    M = max_k_cat==y_truth_cat
    for k in range(M.shape[1]):
        match_array = np.logical_or.reduce(M[:,:k+1], axis=1) #得到匹配结果
        topk_acc_score = match_array.sum() / match_array.shape[0]
        res.append(topk_acc_score)
    return res

def transpd2np_single(featdatC,featdatL,trdat):
    not_feat_col = ['duns_number',
                     'atlas_location_uuid',
                     'longitude_loc',
                     'latitude_loc',
                     'label']
    cont_col_nameC = ['emp_here','emp_total','sales_volume_us','square_footage','emp_here_range']
    cont_col_nameL = ['score_predicted_eo','score_employer','num_emp_weworkcore','num_poi_weworkcore',
                     'pct_wwcore_employee','pct_wwcore_business','num_retail_stores','num_doctor_offices',
                     'num_eating_places','num_drinking_places','num_hotels','num_fitness_gyms',
                     'population_density','pct_female_population','median_age','income_per_capita',
                     'pct_masters_degree','walk_score','bike_score']
    trXCC,trXDC,trXCL,trXDL,trY = transpd2np(featdatC,featdatL,trdat,cont_col_nameC,cont_col_nameL,not_feat_col)
    
    trXC = np.concatenate([trXCC,trXCL],axis=1)
    trXD = np.concatenate([trXDC,trXDL],axis=1)
    del trXCC,trXDC,trXCL,trXDL
    return trXC,trXD,trY

def onehot2cat(x):
    """
    x: each row is a sample
    """
    return [np.where(r==1)[0][0] for r in x]

def get_loc_feat_by_comp(proc_comp_dat,pair_dat):
    tr_feat = pd.merge(pair_dat[['atlas_location_uuid','duns_number']],proc_comp_dat,on='duns_number',how='inner')
#     tr_feat = tr_feat.fillna(0)
    tr_feat = tr_feat.groupby(['atlas_location_uuid']).mean().drop(columns=['duns_number'])
    return tr_feat

import pickle
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [4]:
##Multi training data generator(multi city)
#如果不合并所有数据在进行dummy 会出现一些category在某些城市不出现的情况，从而导致问题
#8-2分训练测试集

def transpd2np_single(featdat,cont_col_name:list,not_feat_col:list,id_col_name:list):
    XC = featdat.loc[:,cont_col_name].to_numpy()
    out_col = not_feat_col+cont_col_name
    dum_col_name = [col for col in list(featdat.columns) if col not in out_col]
    XD = featdat.loc[:,dum_col_name].to_numpy()
    Y = featdat[id_col_name].to_numpy()
    return XC,XD,Y,cont_col_name,dum_col_name,id_col_name

datapath = '/Users/yefeichen/Database/location_recommender_system/'
cfile = ['dnb_pa.csv','dnb_sf.csv','dnb_sj.csv','dnb_Los_Angeles.csv','dnb_New_York.csv']
lfile = 'location_scorecard_191113.csv'
clfile = ['PA_191113.csv','SF_191113.csv','SJ_191113.csv','LA_191113.csv','NY_191113.csv']

not_feat_col = ['duns_number',
                 'atlas_location_uuid',
                 'longitude_loc',
                 'latitude_loc',
                 'label']
cont_col_nameC = ['emp_here','emp_total','sales_volume_us','square_footage','emp_here_range']
cont_col_nameL = ['score_predicted_eo','score_employer','num_emp_weworkcore','num_poi_weworkcore',
                 'pct_wwcore_employee','pct_wwcore_business','num_retail_stores','num_doctor_offices',
                 'num_eating_places','num_drinking_places','num_hotels','num_fitness_gyms',
                 'population_density','pct_female_population','median_age','income_per_capita',
                 'pct_masters_degree','walk_score','bike_score']
key_col_comp = ['duns_number']
key_col_loc = ['atlas_location_uuid']

ind_city = 0

train_test_val_pairs = []
dat_comp_pds = []
dat_loc_pds = []

pdlls = [] #all location feat pd list
pdccs = []
for ind_city in range(5):
    pdc = pd.read_csv(pjoin(datapath,cfile[ind_city]))
    pdl = pd.read_csv(pjoin(datapath,lfile))
    pdcl = pd.read_csv(pjoin(datapath,clfile[ind_city]))
    
    print('generating train_val_test csv')
    #train_test_val_pairs :[ duns_number, atlas_location_uuid, label, city, fold ]
    pair_dat = getPosNegdat(pdcl)
    tr,tt = splitdat(pair_dat,key_column=['duns_number','atlas_location_uuid'],right_colunm='label_tr',rate_tr=0.8)
    #training pair ==> pair format with positive only
    train_pos_pair = tr[tr['label']==1].groupby(['duns_number','atlas_location_uuid','label']).first().reset_index()[['duns_number','atlas_location_uuid','label']]
    #testing pair ==> pair format with positive and negative both
    testing_pair = tt.reset_index()[['duns_number','atlas_location_uuid','label']]
    
    train_pos_pair['fold'] = 0
    testing_pair['fold'] = 2

    train_test_val_pair = pd.concat([train_pos_pair,testing_pair])
    train_test_val_pair['city'] = ind_city
    train_test_val_pairs.append(train_test_val_pair)
    print(len(train_test_val_pair))
    print('train_val_test_location_company Done')
    
    #building features
    col_list = list(pdl.columns)
    pdll = pdl.merge(pdcl,how='inner',on=['atlas_location_uuid'],suffixes=['','_right'])
    pdll = pdll[pdll['duns_number'].isnull()==False]
    pdll = pdll.groupby(['atlas_location_uuid']).first().reset_index()
    pdll = pdll[col_list]
    pdlls.append(pdll)
    
    #company feature
    pdccs.append(pdc)
    
#for loop end
pdlls = pd.concat(pdlls,axis=0)
pdccs = pd.concat(pdccs,axis=0)

    
#building feature
pdlls = pdlls.reset_index()
proc_pdl = location_dat_process(pdlls)

#company feature
pdccs = pdccs.reset_index()
proc_pdc = comp_dat_process(pdccs)
print(len(proc_pdc))
    

print('start saving company and location feature...')

XC_comp,XD_comp,Y_comp,c_comp_name,d_comp_name,y_comp_name = transpd2np_single(proc_pdc,cont_col_nameC,not_feat_col,id_col_name=key_col_comp)
XC_loc,XD_loc,Y_loc,c_loc_name,d_loc_name,y_loc_name = transpd2np_single(proc_pdl['data'],cont_col_nameL,not_feat_col,id_col_name=key_col_loc)

C_comp,S_comp = get_para_normalize_dat(XC_comp)
C_loc,S_loc = get_para_normalize_dat(XC_loc)
XC_comp = apply_para_normalize_dat(XC_comp,C_comp,S_comp)
XC_loc = apply_para_normalize_dat(XC_loc,C_loc,S_loc)
    
X_comp = np.concatenate([Y_comp,XC_comp,XD_comp],axis=1)
X_loc = np.concatenate([Y_loc,XC_loc,XD_loc],axis=1)

comp_norm_param = {
    'C_comp':C_comp,
    'S_comp':S_comp,
    'columns':c_comp_name
}

loc_norm_param = {
    'C_loc':C_loc,
    'S_loc':S_loc,
    'columns':c_loc_name
}

save_obj(comp_norm_param,'comp_feat_norm_param3')
save_obj(loc_norm_param,'loc_feat_norm_param3')

dat_comp_pd = pd.DataFrame(data=X_comp,columns=y_comp_name+c_comp_name+d_comp_name)
dat_loc_pd = pd.DataFrame(data=X_loc,columns=y_loc_name+c_loc_name+d_loc_name)

    
print(dat_comp_pd.to_numpy().mean())
print(dat_loc_pd.to_numpy()[:,1:].mean())
print(dat_comp_pd.shape)

print('Done')

# print('Final merge...')
train_test_val_pair = pd.concat(train_test_val_pairs)

train_test_val_pair.to_csv('train_val_test_location_company_82split_191113.csv')
dat_comp_pd.to_csv('company_feat3.csv')
dat_loc_pd.to_csv('location_feat3.csv')
print('All Done')

  interactivity=interactivity, compiler=compiler, result=result)


generating train_val_test csv
6181
Neg dat num: 12362 ;Pos dat num: 6219
Train dat: 14865 Test dat: 3702
8704
train_val_test_location_company Done
generating train_val_test csv
56367
Neg dat num: 112734 ;Pos dat num: 56490
Train dat: 135379 Test dat: 33797
79030
train_val_test_location_company Done
generating train_val_test csv
24947
Neg dat num: 49894 ;Pos dat num: 25040
Train dat: 59947 Test dat: 14960
35016
train_val_test_location_company Done
generating train_val_test csv
90584
Neg dat num: 181168 ;Pos dat num: 90786
Train dat: 217563 Test dat: 54308
127091
train_val_test_location_company Done
generating train_val_test csv
106667
Neg dat num: 213334 ;Pos dat num: 106780
Train dat: 256091 Test dat: 63970
149228
train_val_test_location_company Done
doing one-hot...
5863
extract continuous...
5863
5863
doing one-hot...
extract continuous...
specific feature
418148
start saving company and location feature...
2229211.9267450343
0.04347826086956536
(418148, 103)
Done
All Done


In [7]:
dat_comp_pd.shape,dat_loc_pd.shape

((418148, 103), (5863, 24))