-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_salesforce.py
108 lines (82 loc) · 3.38 KB
/
script_salesforce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os,sys
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)
import pandas as pd
import argparse
from dnb.header import *
sfx = ['','_right']
pj = os.path.join
cid = 'duns_number'
bid = 'atlas_location_uuid'
fid = 'account_id'
fname = 'account_orig_name'
fname_formal = 'account_name'
city = 'physical_city'
def load_salesforce_dnb_match(db='' ,table='relation_dnb_account_0120.csv'):
db_path = pj(datapath, db)
sfdnb = pd.read_csv(pj(db_path, table), index_col=0)
bid = 'atlas_location_uuid'
cid = 'duns_number'
fid = 'account_id'
city = 'physical_city'
dnb_city = sfdnb.groupby([cid ,city]).first().reset_index()[[cid ,city]].rename(columns={city :'city'})
print( '%d dnb_city generated' %len(dnb_city))
dnb_city.to_csv(pj(datapath_mid ,salesforce_dnb_file))#[cid,city]
return sfdnb
if __name__ == '__main__':
parser = argparse.ArgumentParser()
arg = parser.add_argument
arg('--dnb_acc', default='')
arg('--dbname',default='reason_table')
args = parser.parse_args()
if args.dnb_acc:
dnb_acc = args.dnb_acc
else:
dnb_acc = salesforce_dnb_match_file
print('Using dnb_acc:%s'%dnb_acc)
sfdnb = load_salesforce_dnb_match(db='',table=dnb_acc)
dedup_sfdnb = sfdnb.drop_duplicates([fid,cid,city], keep='first').reset_index()[[fid,fname,cid,city]]
# assert(len(dedup_sfdnb.loc[dedup_sfdnb[cid]==74157331]) > 0)
dedup_sfdnb = dedup_sfdnb.rename(columns={fname:fname_formal})
print('Duplicate Shrinkage: %1.2f'% (len(dedup_sfdnb)/len(sfdnb)))
total = len(dedup_sfdnb)
sfdnb_lst = []
nosfdnb_lst = []
for ind_city,cur_city_name in enumerate(citylongname):
"""
Here filter is a must. Because not all the duns_number is valid. Need to check why?
"""
print('## %s ##'%cur_city_name )
comp_file = cfile[ind_city]
comp_dat = pd.read_csv( pj(datapath,comp_file))[[cid,city,'physical_zip_all','msa','latitude','longitude','business_name']]
comp_dat['state'] = comp_dat['msa'].apply(lambda x: str(x).replace(' ','').split(',')[-1] )
tmp = dedup_sfdnb.merge(comp_dat, on=[cid, city], suffixes=sfx)
sfdnb_lst.append( tmp )
print('%d'%len(tmp))
tmp2 = comp_dat.merge(dedup_sfdnb,on=[cid,city],suffixes=sfx,how='left')
tmp2 = tmp2.loc[lambda df: df[fid].isnull()]
nosfdnb_lst.append(tmp2)
sfdnb_lst = pd.concat(sfdnb_lst,axis=0)
nosfdnb_lst = pd.concat(nosfdnb_lst, axis=0)
print( '%d of %d covered'% (len(sfdnb_lst),total))
print('%d of %d not covered' % (len(nosfdnb_lst), total))
sfdnb_lst = sfdnb_lst.rename(columns={
'physical_zip_all':'zip_code',
city:'city',
'business_name':'company_name',
fid:'sfdc_account_id',
})
sfdnb_lst = sfdnb_lst.drop_duplicates(['sfdc_account_id', cid, 'city'], keep='first').reset_index()
print('Second Shrinkage: %d'%len(sfdnb_lst))
sfdnb_lst.to_csv(pj(datapath_mid,salesforce_dnb_info_file))
nosfdnb_lst = nosfdnb_lst.rename(columns={
'physical_zip_all': 'zip_code',
city: 'city',
'business_name': 'company_name',
fid: 'sfdc_account_id',
})
nosfdnb_lst[city] = nosfdnb_lst['city']
print(' %d saving'%len(nosfdnb_lst))
nosfdnb_lst.to_csv(pj(datapath_mid,no_salesforce_dnb_info_file))
print('Done')