In [None]:
import time
from tqdm import tqdm
import requests
import pandas as pd
import os
from io import BytesIO
import gzip
from global_variables import *
import urllib.request
import xml.etree.ElementTree as ET


def get_usps_address(street, city, state, zip, user_id=None):
    """Ask the USPS Web Tools ``Verify`` API for the standardized form of an address.

    Parameters
    ----------
    street, city, state : any
        Address parts; each is converted with ``str()`` and XML-escaped before
        being placed in the request.
    zip : any
        5-digit ZIP code. Purely numeric values shorter than five digits are
        left-padded with zeros (an integer ZIP such as 2134 can only have come
        from "02134"), and an integral float rendering such as ``70803.0`` is
        reduced to ``70803``. Anything else is sent unchanged.
    user_id : str, optional
        USPS Web Tools user id. When omitted, the ``USPS_USER_ID`` environment
        variable is used, falling back to the id this notebook has always used.

    Returns
    -------
    bytes or str
        The raw response body (bytes) when the HTTP status is 200, otherwise
        the string ``'error'``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` (network failures and HTTP
        error statuses).
    """
    # Function-scope imports keep this definition self-contained.
    import os
    from xml.sax.saxutils import escape

    def _xml_text(value):
        # Escape &, <, > and " so values such as "A & B St" cannot break the request XML.
        return escape(str(value), {'"': '&quot;'})

    if user_id is None:
        # NOTE(review): the fallback id is a credential kept only for backward
        # compatibility; prefer supplying USPS_USER_ID through the environment.
        user_id = os.environ.get('USPS_USER_ID', '33LOUIS8M0561')

    zip5 = str(zip).strip()
    if zip5.endswith('.0') and zip5[:-2].isdigit():
        zip5 = zip5[:-2]  # float rendering of an integer ZIP, e.g. 70803.0
    if zip5.isdigit():
        zip5 = zip5.zfill(5)  # restore leading zeros lost by integer storage

    xml_string = (
        '<?xml version="1.0"?>\n'
        f'<AddressValidateRequest USERID="{_xml_text(user_id)}">\n'
        '    <Revision>1</Revision>\n'
        '    <Address ID="0">\n'
        f'        <Address1>{_xml_text(street)}</Address1>\n'
        '        <Address2></Address2>\n'
        f'        <City>{_xml_text(city)}</City>\n'
        f'        <State>{_xml_text(state)}</State>\n'
        f'        <Zip5>{_xml_text(zip5)}</Zip5>\n'
        '        <Zip4/>\n'
        '    </Address>\n'
        '</AddressValidateRequest>'
    )
    xml_string = xml_string.replace('\n', '').replace('\t', '')
    request_url = (
        "http://production.shippingapis.com/ShippingAPI.dll?API=Verify&XML="
        + urllib.parse.quote_plus(xml_string)
    )

    # The context manager closes the connection even if reading the body fails.
    with urllib.request.urlopen(request_url) as response:
        if response.getcode() == 200:
            return response.read()
    return 'error'


# Request the POI data-product listing. POI_Data_PRODUCT_API_PATH and API_KEY are
# not defined in this notebook — presumably they come from the star import of
# `global_variables`; verify there.
results = requests.get(
    url=POI_Data_PRODUCT_API_PATH,
    params={},  # optionally set date value here
    headers={
        'X-API-KEY': API_KEY,
        'accept': 'application/json',
    },
)
# Surface HTTP failures (bad key, expired link, server error) as an HTTPError
# here instead of as a confusing JSON-decoding or KeyError later on.
results.raise_for_status()
response_json = results.json()
## Read Data


In [None]:

poi_csv_path = 'C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_us_banks.csv'

# One-off extraction that produced `poi_csv_path` (+ ".gz"); kept commented out
# so the cell only reads the already-built archive.
# for link_data in response_json['download_links']:
#
#     print(f"Downloading file {link_data['file_name']}...")
#
#     df = pd.read_csv(BytesIO(requests.get(link_data['link']).content), compression="gzip")
#     df = df[df['ISO_COUNTRY_CODE'] == 'US'][['CATEGORY_TAGS', 'CITY', 'LATITUDE','LONGITUDE','NAICS_CODE','PLACEKEY','POSTAL_CODE','REGION','STREET_ADDRESS','SUB_CATEGORY','TOP_CATEGORY','LOCATION_NAME']]
#     df = df[(df['TOP_CATEGORY'] == 'Depository Credit Intermediation') | (df['NAICS_CODE'].astype(str).str[:3] == '522')]
#     df = df.reset_index()
#
#     if os.path.isfile(poi_csv_path):
#         header_option = False
#     else:
#         header_option = True
#
#     df.to_csv(poi_csv_path, mode='a', header=header_option, index=False)
#
# df = pd.read_csv(poi_csv_path)
#
# with gzip.open(poi_csv_path+".gz", 'wt', encoding='utf-8') as gzipped_file:
#     df.to_csv(gzipped_file, index=False)

# Load the gzipped POI extract.
with gzip.open(f"{poi_csv_path}.gz", mode='rt', encoding='utf-8') as gzipped_file:
    poi = pd.read_csv(gzipped_file)

# reset_index() without drop=True keeps the previous index as an 'index' column.
# NOTE(review): integer ZIP codes cannot carry leading zeros (02134 becomes 2134),
# so the address string built below uses the un-padded value.
poi = poi.reset_index().astype({'POSTAL_CODE': int})

# Lower-cased "street, city, zip state" string for each POI row.
poi['full_address_poi'] = (
    poi['STREET_ADDRESS'].str.lower()
    + ", "
    + poi['CITY'].str.lower()
    + ", "
    + poi['POSTAL_CODE'].astype(str)
    + " "
    + poi['REGION'].str.lower()
)

# Placeholder column that the USPS lookup loop fills in row by row.
poi['usps_contents'] = ''
poi = poi.reset_index(drop=True)

output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_full_address.csv"

# Loop invariants: total row count and how often to pause between requests.
num_rows = len(poi)
break_interval = 400

# Query USPS for every POI row from position 28871 onward, appending one
# "address|index|response" line per row to the output file as it goes.
# NOTE(review): 28871 looks like the resume point of an earlier, interrupted
# run — confirm before re-running from a fresh kernel.
with tqdm(total=(num_rows - 28871)) as pbar, open(output_file_path, "a") as output_file:
    for index in range(28871, num_rows):
        row = poi.iloc[index]

        # Pause for a second after every `break_interval` rows, except after the last row.
        is_last_row = index == (num_rows - 1)
        if (index + 1) % break_interval == 0 and not is_last_row:
            time.sleep(1)  # Sleep for one second

        try:
            op = get_usps_address(row['STREET_ADDRESS'], row['CITY'], row['REGION'], row['POSTAL_CODE'])
            poi.loc[index, 'usps_contents'] = op
            output_file.write(f"{row['full_address_poi']}|{index}|{op}\n")
        except Exception as e:
            # Stop at the first failure; rows not reached keep an empty 'usps_contents'.
            print(f"An exception occurred: {e}")
            break
        pbar.update(1)
# Turn each stored USPS response into a single "Address2 City State Zip5 [Address1]"
# string in poi['usps_address']; rows without a usable response keep ''.
poi['usps_address'] = ''
unparsed_rows = 0
with tqdm(total=len(poi)) as pbar:
    for index, row in poi.iterrows():
        contents = row['usps_contents']
        address_nodes = []

        # Rows that were never looked up still hold '', and get_usps_address
        # returns the plain string 'error' for non-200 replies; neither is XML,
        # so parsing must not be allowed to abort the whole loop.
        if isinstance(contents, (bytes, str)) and len(contents) > 0:
            try:
                address_nodes = ET.fromstring(contents).findall("Address")
            except ET.ParseError:
                unparsed_rows += 1

        if len(address_nodes) == 1:
            add_xml = address_nodes[0]
            # findtext() yields None for a missing element and '' for an empty
            # one; either way the address is incomplete and the row stays ''.
            required_parts = [add_xml.findtext(tag) for tag in ("Address2", "City", "State", "Zip5")]
            if all(required_parts):
                add_text = " ".join(required_parts)
                address1 = add_xml.findtext("Address1")
                if address1:
                    # Address1 is optional; an empty element no longer discards the row.
                    add_text = add_text + " " + address1
                poi.loc[index, 'usps_address'] = add_text
        pbar.update(1)

if unparsed_rows:
    print(f"{unparsed_rows} USPS responses could not be parsed as XML")
# Persist the POI table, including the USPS columns, as a gzipped CSV.
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/poi_with_usps_address.csv.gz"
# newline='' disables newline translation on the text handle. pandas documents
# that a file object passed to to_csv should be opened this way; without it the
# row terminators pandas writes are translated again (doubling '\r' on Windows).
with gzip.open(output_file_path, 'wt', encoding='utf-8', newline='') as gzipped_file:
    poi.to_csv(gzipped_file, index=False)


In [None]:

## SOD Data
sod_files_path = 'C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/SOD/data/'

sod_data = ['ALL_2019.csv', 'ALL_2020.csv', 'ALL_2021.csv', 'ALL_2022.csv', 'ALL_2023.csv']

sod_data_files = [sod_files_path + s for s in sod_data]

selected_columns = ['YEAR', 'CERT', 'BRNUM', 'UNINUMBR', 'NAMEFULL', 'ADDRESBR', 'CITYBR', 'CNTYNAMB', 'STALPBR',
                    'ZIPBR', 'NAMEBR', 'SIMS_LATITUDE', 'SIMS_LONGITUDE']

# Read each yearly file, keep only the columns of interest, and concatenate
# once at the end. Concatenating inside the loop re-copies all previous rows on
# every pass, and seeding it with an empty frame degrades dtypes.
yearly_frames = []
for csv_file in sod_data_files:
    df = pd.read_csv(csv_file, encoding='latin-1')
    selected_df = df[selected_columns]
    yearly_frames.append(selected_df)

if yearly_frames:
    combined_df = pd.concat(yearly_frames, ignore_index=True)
else:
    # No input files: keep the empty-but-typed-by-name frame the loop started from.
    combined_df = pd.DataFrame(columns=selected_columns)

# reset_index() keeps the old index as an 'index' column; the second call below
# then stores the post-deduplication index as 'level_0'.
combined_df = combined_df.reset_index()

# One row per branch: the first occurrence of each UNINUMBR in file order.
sod_data_branches = combined_df.drop_duplicates(subset=['UNINUMBR'])
sod_data_branches = sod_data_branches.reset_index()

# Lower-cased "street, city, zip state" string for each branch.
sod_data_branches['full_address_sod'] = (
    sod_data_branches['ADDRESBR'].str.lower()
    + ", "
    + sod_data_branches['CITYBR'].str.lower()
    + ", "
    + sod_data_branches['ZIPBR'].astype(str)
    + " "
    + sod_data_branches['STALPBR'].str.lower()
)



In [None]:
# Placeholder column that the lookup loop fills in row by row.
sod_data_branches['usps_contents'] = ''
sod_data_branches = sod_data_branches.reset_index(drop=True)

output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/sod_full_address.csv"

# Loop invariants: total row count and how often to pause between requests.
num_rows = len(sod_data_branches)
break_interval = 400

# Query USPS for every branch, appending one "address|index|response" line per
# row to the output file (opened in append mode, so re-runs add to it).
with tqdm(total=num_rows) as pbar:
    with open(output_file_path, "a") as output_file:
        for index, row in sod_data_branches.iterrows():
            # Pause for a second after every `break_interval` rows, except after the last row.
            if (index + 1) % break_interval == 0 and index != (num_rows - 1):
                time.sleep(1)  # Sleep for one second

            try:
                op = get_usps_address(row['ADDRESBR'], row['CITYBR'], row['STALPBR'], row['ZIPBR'])
                sod_data_branches.loc[index, 'usps_contents'] = op
                output_file.write(f"{row['full_address_sod']}|{index}|{op}\n")
            except Exception as e:
                # `except Exception` lets KeyboardInterrupt stop the cell, and the
                # message reports the cause instead of a bare 'error'.
                # Rows not reached keep an empty 'usps_contents'.
                print(f"An exception occurred: {e}")
                break
            pbar.update(1)
# Turn each stored USPS response into a single "Address2 City State Zip5 [Address1]"
# string in sod_data_branches['usps_address']; rows without a usable response keep ''.
sod_data_branches['usps_address'] = ''
unparsed_rows = 0
with tqdm(total=len(sod_data_branches)) as pbar:
    for index, row in sod_data_branches.iterrows():
        contents = row['usps_contents']
        address_nodes = []

        # Rows that were never looked up still hold '', and get_usps_address
        # returns the plain string 'error' for non-200 replies; neither is XML,
        # so parsing must not be allowed to abort the whole loop.
        if isinstance(contents, (bytes, str)) and len(contents) > 0:
            try:
                address_nodes = ET.fromstring(contents).findall("Address")
            except ET.ParseError:
                unparsed_rows += 1

        if len(address_nodes) == 1:
            add_xml = address_nodes[0]
            # findtext() yields None for a missing element and '' for an empty
            # one; either way the address is incomplete and the row stays ''.
            required_parts = [add_xml.findtext(tag) for tag in ("Address2", "City", "State", "Zip5")]
            if all(required_parts):
                add_text = " ".join(required_parts)
                address1 = add_xml.findtext("Address1")
                if address1:
                    # Address1 is optional; an empty element no longer discards the row.
                    add_text = add_text + " " + address1
                sod_data_branches.loc[index, 'usps_address'] = add_text
        pbar.update(1)

if unparsed_rows:
    print(f"{unparsed_rows} USPS responses could not be parsed as XML")
        
# Persist the branch table, including the USPS columns, as a gzipped CSV.
output_file_path = "C:/Users/dratnadiwakara2/Documents/OneDrive - Louisiana State University/Raw Data/Dewey/sod_data_branches_with_usps_address.csv.gz"
# newline='' disables newline translation on the text handle. pandas documents
# that a file object passed to to_csv should be opened this way; without it the
# row terminators pandas writes are translated again (doubling '\r' on Windows).
with gzip.open(output_file_path, 'wt', encoding='utf-8', newline='') as gzipped_file:
    sod_data_branches.to_csv(gzipped_file, index=False)