Following section reads monthly foot traffic data based on the locations saved in poi_sod.csv.gz, and saves read data into filtered_ft_data.csv

In [5]:
import requests
import pandas as pd
import sys
from io import BytesIO
import gzip
import io
import os
from global_variables import *

In [6]:
poi_sod_file = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_sod.csv.gz" 
poi_sod = pd.read_csv(poi_sod_file, compression='gzip')

In [3]:
results = requests.get(url=MONTHLY_FT_PATTERNS_PRODUCT_API_PATH,
                       params={
                               'partition_key_after': '2019-11-01',   # optionally set date value here
                               'partition_key_before': '2022-12-01'}, # optionally set date value here
                       headers={'X-API-KEY': API_KEY,
                                'accept': 'application/json'
                               })
response_json = results.json()

https://community.deweydata.io/t/bulk-downloading-data-using-v3-api-using-python/26533
https://github.com/amplifydata/amplifydata-public/blob/main/README.md

In [7]:
ft_data_path = 'C:/Users/dratnadiwakara2/Downloads/temp_dewey/filtered_ft_data.csv'
page = 1
download_count = 0
while True:
    print(page)
    results = requests.get(url=MONTHLY_FT_PATTERNS_PRODUCT_API_PATH,
                       params={'page': page,
                               'partition_key_after': '2019-11-01',   # optionally set date value here
                               'partition_key_before': '2021-12-01'}, # optionally set date value here
                       headers={'X-API-KEY': API_KEY,
                                'accept': 'application/json'
                               })
    response_json = results.json()

    for link_data in response_json['download_links']:
        print(f"Downloading file {link_data['file_name']}...")
    
        data = requests.get(link_data['link'])
        gzip_stream = io.BytesIO(data.content)
    
        df = pd.read_csv(gzip_stream, compression='gzip', low_memory=False)
        filtered_df = df[df['PLACEKEY'].isin(poi_sod['PLACEKEY'])]
    
        if os.path.isfile(ft_data_path):
            header_option = False  
        else:
            header_option = True  
    
        filtered_df.to_csv(ft_data_path, mode='a', header=header_option, index=False)
        
        download_count += 1
        
    total_pages = response_json['total_pages']
    if page >= total_pages:
        break
    page += 1   

1
Downloading file Monthly_Patterns_Foot_Traffic-0-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-1-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-2-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-3-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-4-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-5-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-6-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-7-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-8-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-9-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_Traffic-10-DATE_RANGE_START-2019-11-01.csv.gz...
Downloading file Monthly_Patterns_Foot_T

In [17]:
ft_data = pd.read_csv(ft_data_path)
ft_data = ft_data.dropna(subset=['VISITOR_HOME_CBGS'])
ft_data = ft_data.reset_index()
print(ft_data['PLACEKEY'].nunique())

119641


In [None]:
output_file_path = 'C:/Users/dratnadiwakara2/Downloads/temp_dewey/filtered_ft_data.csv'+'.gz'
with gzip.open(output_file_path, 'wt', encoding='utf-8') as gzipped_file:
    ft_data.to_csv(gzipped_file, index=False)

Following section reads cleaned poi and sod data and matches the two datasets

In [1]:
import time
from tqdm import tqdm
import requests
import pandas as pd
import os
from io import BytesIO
import gzip
from global_variables import *

In [2]:
# next section has the code to create these files
poi_file = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_with_usps_address.csv.gz" 
sod_file = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/sod_data_branches_with_usps_address.csv.gz" 

In [3]:
poi = pd.read_csv(poi_file, compression='gzip')
sod = pd.read_csv(sod_file, compression='gzip')

In [7]:
poi = poi.drop_duplicates()

In [12]:
sod = sod.drop_duplicates(subset='usps_address', keep='first')


In [15]:
poi_sod = poi.merge(sod[['usps_address', 'NAMEBR', 'NAMEFULL', 'SIMS_LATITUDE', 'SIMS_LONGITUDE', 'UNINUMBR', 'CERT']], on='usps_address', how='inner')

In [16]:
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_sod.csv.gz" 
with gzip.open(output_file_path, 'wt', encoding='utf-8') as gzipped_file:
    poi_sod.to_csv(gzipped_file, index=False)




The section below cleans poi and sod data and convert the address in both the files to USPS format

In [None]:
import time
from tqdm import tqdm
import requests
import pandas as pd
import os
from io import BytesIO
import gzip
from global_variables import *
import urllib.request
import xml.etree.ElementTree as ET

In [2]:
# this function uses USPS api to get the verified USPS address
def get_usps_address(street,city,state,zip):
    xml_string = (
        '<?xml version="1.0"?>\n'
        '<AddressValidateRequest USERID="33LOUIS8M0561">\n'
        '    <Revision>1</Revision>\n'
        '    <Address ID="0">\n'
        f'        <Address1>{street}</Address1>\n'
        '        <Address2></Address2>\n'
        f'        <City>{city}</City>\n'
        f'        <State>{state}</State>\n'
        f'        <Zip5>{zip}</Zip5>\n'
        '        <Zip4/>\n'
        '    </Address>\n'
        '</AddressValidateRequest>'
    )
    xml_string = xml_string.replace('\n','').replace('\t','')
    xml_string = urllib.parse.quote_plus(xml_string)
    xml_string = "http://production.shippingapis.com/ShippingAPI.dll?API=Verify&XML=" + xml_string
    response = urllib.request.urlopen(xml_string)
    if response.getcode() == 200:
        contents = response.read()
        return contents
    else:
        return 'error'

In [3]:
results = requests.get(url=POI_Data_PRODUCT_API_PATH,
                       params={}, # optionally set date value here
                       headers={'X-API-KEY': API_KEY,
                                'accept': 'application/json'
                               })
response_json = results.json()

## Read Data

The following script reads data, filters US POI, and select important columns, and saves output as a csv file.

In [7]:
poi_csv_path = 'C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_us_banks.csv'

In [None]:
# for link_data in response_json['download_links']:
# 
#     print(f"Downloading file {link_data['file_name']}...")
# 
#     df = pd.read_csv(BytesIO(requests.get(link_data['link']).content), compression="gzip")
#     df = df[df['ISO_COUNTRY_CODE'] == 'US'][['CATEGORY_TAGS', 'CITY', 'LATITUDE','LONGITUDE','NAICS_CODE','PLACEKEY','POSTAL_CODE','REGION','STREET_ADDRESS','SUB_CATEGORY','TOP_CATEGORY','LOCATION_NAME']]
#     df = df[(df['TOP_CATEGORY'] == 'Depository Credit Intermediation') | (df['NAICS_CODE'].astype(str).str[:3] == '522')]
#     df = df.reset_index()
# 
#     if os.path.isfile(poi_csv_path):
#         header_option = False  
#     else:
#         header_option = True  
# 
#     df.to_csv(poi_csv_path, mode='a', header=header_option, index=False)
# 
# df = pd.read_csv(poi_csv_path)    
# 
# with gzip.open(poi_csv_path+".gz", 'wt', encoding='utf-8') as gzipped_file:
#     df.to_csv(gzipped_file, index=False)

In [9]:
with gzip.open(poi_csv_path+".gz", 'rt', encoding='utf-8') as gzipped_file:
    poi = pd.read_csv(gzipped_file)

poi = poi.reset_index()
poi['POSTAL_CODE'] = poi['POSTAL_CODE'].astype(int)
poi['full_address_poi'] = poi['STREET_ADDRESS'].str.lower()+", "+poi['CITY'].str.lower()+", "+poi['POSTAL_CODE'].astype(str)+" "+ poi['REGION'].str.lower()

In [14]:
poi['usps_contents'] = ''
poi  = poi.reset_index(drop=True)

output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_full_address.csv"  

In [19]:
with tqdm(total=(len(poi)-28871) ) as pbar:
    with open(output_file_path, "a") as output_file:
        for index in range(28871, len(poi)):
            row = poi.iloc[index]
            num_rows = len(poi)
            break_interval = 400
            
            if (index + 1) % break_interval == 0 and index != (num_rows - 1):
                time.sleep(1)  # Sleep for one second
                
            try:
                op = get_usps_address(row['STREET_ADDRESS'],row['CITY'],row['REGION'],row['POSTAL_CODE'])
                poi.loc[index,'usps_contents'] = op
                output_file.write(f"{row['full_address_poi']}|{index}|{op}\n")
            except Exception as e:
                print(f"An exception occurred: {e}")
                break
            pbar.update(1)

100%|██████████| 532164/532164 [28:10:35<00:00,  5.25it/s]    


In [22]:
poi['usps_address'] = ''
with tqdm(total=len(poi) ) as pbar:
    for index, row in poi.iterrows():
        root = ET.fromstring(poi.loc[index,'usps_contents'])
        if len(root.findall("Address"))==1:
            try:
                add_xml  =  root.findall("Address")[0]
                add_text = add_xml.find("Address2").text+" "+add_xml.find("City").text+" "+add_xml.find("State").text+" "+add_xml.find("Zip5").text
                if add_xml.find("Address1") is not None:
                    add_text = add_text+" "+add_xml.find("Address1").text
                poi.loc[index,'usps_address'] = add_text
            except:
                pass
        pbar.update(1)      

100%|██████████| 561035/561035 [02:25<00:00, 3851.05it/s]


In [33]:
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_with_usps_address.csv.gz" 
with gzip.open(output_file_path, 'wt', encoding='utf-8') as gzipped_file:
    poi.to_csv(gzipped_file, index=False)



## SOD Data

In [67]:
sod_files_path = 'C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/SOD/data/'

sod_data = ['ALL_2019.csv','ALL_2020.csv','ALL_2021.csv','ALL_2022.csv','ALL_2023.csv']

sod_data_files = [sod_files_path + s for s in sod_data]

selected_columns = ['YEAR','CERT','BRNUM','UNINUMBR','NAMEFULL','ADDRESBR','CITYBR',	'CNTYNAMB','STALPBR','ZIPBR','NAMEBR','SIMS_LATITUDE','SIMS_LONGITUDE']  

combined_df = pd.DataFrame(columns=selected_columns)

for csv_file in sod_data_files:
    df = pd.read_csv(csv_file, encoding='latin-1')
    selected_df = df[selected_columns]
    combined_df = pd.concat([combined_df, selected_df], ignore_index=True)

combined_df = combined_df.reset_index()

  combined_df = pd.concat([combined_df, selected_df], ignore_index=True)


In [154]:
sod_data_branches = combined_df.drop_duplicates(subset=['UNINUMBR'])
sod_data_branches = sod_data_branches.reset_index()

In [155]:
sod_data_branches['full_address_sod'] = sod_data_branches['ADDRESBR'].str.lower()+", "+sod_data_branches['CITYBR'].str.lower()+", "+sod_data_branches['ZIPBR'].astype(str)+" "+sod_data_branches['STALPBR'].str.lower()

In [156]:
sod_data_branches['full_address_sod'][0]

'1 lincoln st. fl 1, boston, 2111 ma'

In [169]:
sod_data_branches['usps_contents'] = ''
sod_data_branches  = sod_data_branches.reset_index(drop=True)

output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/sod_full_address.csv"  

with tqdm(total=len(sod_data_branches) ) as pbar:
    with open(output_file_path, "a") as output_file:
        for index, row in sod_data_branches.iterrows():
            num_rows = len(sod_data_branches)
            break_interval = 400
            
            if (index + 1) % break_interval == 0 and index != (num_rows - 1):
                time.sleep(1)  # Sleep for one second
                
            try:
                op = get_usps_address(row['ADDRESBR'],row['CITYBR'],row['STALPBR'],row['ZIPBR'])
                sod_data_branches.loc[index,'usps_contents'] = op
                output_file.write(f"{row['full_address_sod']}|{index}|{op}\n")
            except:
                print('error')
                break
            pbar.update(1)

100%|██████████| 90924/90924 [4:57:49<00:00,  5.09it/s]   


In [188]:
sod_data_branches['usps_address'] = ''
with tqdm(total=len(sod_data_branches) ) as pbar:
    for index, row in sod_data_branches.iterrows():
        root = ET.fromstring(sod_data_branches.loc[index,'usps_contents'])
        if len(root.findall("Address"))==1:
            try:
                add_xml  =  root.findall("Address")[0]
                add_text = add_xml.find("Address2").text+" "+add_xml.find("City").text+" "+add_xml.find("State").text+" "+add_xml.find("Zip5").text
                if add_xml.find("Address1") is not None:
                    add_text = add_text+" "+add_xml.find("Address1").text
                sod_data_branches.loc[index,'usps_address'] = add_text
            except:
                pass
        pbar.update(1)        

100%|██████████| 90924/90924 [00:20<00:00, 4428.05it/s]


In [192]:
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/sod_data_branches_with_usps_address.csv.gz" 
with gzip.open(output_file_path, 'wt', encoding='utf-8') as gzipped_file:
    sod_data_branches.to_csv(gzipped_file, index=False)







# Ignore below this
## Match by longitude and latitude

In [35]:
df1 = sod_data_branches[['full_address_sod', 'match_string_2']]
df2 = poi[['full_address_poi', 'match_string_2']]

# Merge df1 and df2 on 'match_string_2' to create matched_data_2
matched_data_2 = pd.merge(df1, df2, on='match_string_2', how='inner')



## Fuzzy Match

In [24]:
from fuzzywuzzy import fuzz
from tqdm import tqdm
from fuzzywuzzy import process



In [25]:
def address_similarity(address1, address2):
    return fuzz.ratio(address1.lower(), address2.lower())

In [27]:
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/bank_branch_matches.csv"  

with tqdm(total=len(sod_data_branches) ) as pbar:
    with open(output_file_path, "a") as output_file:
        for index1, row1 in sod_data_branches.iterrows():
            mask = (poi['match_string'].str.lower() == row1['match_string'].lower())
            temp = poi[mask]
            for index2, row2 in temp.iterrows():
                try:
                        similarity_score = address_similarity(row1['full_address_sod'], row2['full_address_poi'])
                        if similarity_score >= 50: 
                            output_file.write(f"{row1['full_address_sod']}|{row2['full_address_poi']}|{similarity_score}\n")
                except:
                    pass
            pbar.update(1)

100%|██████████| 90924/90924 [16:49:50<00:00,  1.50it/s]  


In [36]:
matched_data = pd.read_csv(output_file_path,sep="|",header=None)
matched_data.rename(columns={0:'full_address_sod',1:'full_address_poi',2:'similarity_score'}, inplace=True)
matched_data = matched_data.drop_duplicates()
matched_data = matched_data.sort_values(by='similarity_score', ascending=False)
matched_data = matched_data[(matched_data['similarity_score'] >= 90) & (matched_data['full_address_sod'].str[:2] == matched_data['full_address_poi'].str[:2])]
matched_data = matched_data.drop_duplicates(subset='full_address_sod')


In [54]:
poi_matched = poi[poi['full_address_poi'].isin(matched_data['full_address_poi'])]

In [56]:
poi_category_count = poi_matched.groupby('NAICS_CODE').size().reset_index(name='Frequency')
# TOP_CATEGORY: Depository Credit Intermediation

In [50]:
requestXML = """
<?xml version="1.0"?>
<AddressValidateRequest USERID="33LOUIS8M0561">
	<Revision>1</Revision>
	<Address ID="0">
		<Address1>2335 S State, ste 300</Address1>
		<Address2></Address2>
		<City>Provo</City>
		<State>UT</State>
		<Zip5>84604</Zip5>
		<Zip4/>
	</Address>
</AddressValidateRequest>
"""

In [53]:
docString = requestXML
docString = docString.replace('\n','').replace('\t','')
docString = urllib.parse.quote_plus(docString)

url = "http://production.shippingapis.com/ShippingAPI.dll?API=Verify&XML=" + docString
#print(url + "\n\n")

response = urllib.request.urlopen(url)
if response.getcode() != 200:
	print("Error making HTTP call:")
	print(response.info())
	exit()

contents = response.read()

root = ET.fromstring(contents)
for address in root.findall('Address'):
	print()
	print("Address1: " + address.find("Address1").text)
	print("Address2: " + address.find("Address2").text)
	print("City:	 " + address.find("City").text)
	print("State:	" + address.find("State").text)
	print("Zip5:	 " + address.find("Zip5").text)


Address1: STE 300
Address2: 2335 S STATE ST
City:	 PROVO
State:	UT
Zip5:	 84606


In [None]:
def create_xml_string(row):
    xml_string = (
        '<?xml version="1.0"?>\n'
        '<AddressValidateRequest USERID="33LOUIS8M0561">\n'
        '    <Revision>1</Revision>\n'
        '    <Address ID="0">\n'
        f'        <Address1>{row["address"]}</Address1>\n'
        '        <Address2></Address2>\n'
        f'        <City>{row["city"]}</City>\n'
        f'        <State>{row["state"]}</State>\n'
        f'        <Zip5>{row["zip"]}</Zip5>\n'
        '        <Zip4/>\n'
        '    </Address>\n'
        '</AddressValidateRequest>'
    )
    return xml_string

# Create a new column 'xml_string' using the function
df['xml_string'] = df.apply(create_xml_string, axis=1)