In [8]:
# for manipulating dataframes
import pandas as pd

import usaddress
import requests
import xml.etree.ElementTree as ET

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# read csv files
df_active = pd.read_csv('../data/out/filtered_los_angeles_100.csv')
df_lei = pd.read_csv('../data/out/lei_records_los_angeles.csv')

In [3]:
df_active.head()

Unnamed: 0,LOCATION ACCOUNT #,BUSINESS NAME,DBA NAME,STREET ADDRESS,CITY,ZIP CODE,LOCATION DESCRIPTION,MAILING ADDRESS,MAILING CITY,MAILING ZIP CODE,NAICS,PRIMARY NAICS DESCRIPTION,COUNCIL DISTRICT,LOCATION START DATE,LOCATION END DATE,LOCATION
0,0002829017-0001-5,RICHARD JOHN SHERMAN,,2010 LA BREA TERRACE,LOS ANGELES,90046-2314,2010 LA BREA 90046-2314,,,,,,4,04/19/2014,,
1,0000111620-0001-4,SOUTHERN CALIFORNIA GRANTMAKERS,,1000 N ALAMEDA STREET SUITE #230,LOS ANGELES,90012-1804,1000 ALAMEDA 90012-1804,,,,,,14,07/01/1984,,"(34.0593, -118.2361)"
2,0003293756-0001-5,BHI RESIDENTIAL LONG TERM CORPORATION,,732 S SPRING STREET APT #1021,LOS ANGELES,90014-3058,732 SPRING 90014-3058,,,,,,14,09/01/2021,,"(34.0435, -118.2527)"
3,0002774873-0001-4,ISAIAH C. WILLIS III,,153 W 59TH STREET,LOS ANGELES,90003-1103,153 59TH 90003-1103,153 W 59TH STREET,LOS ANGELES,90003-1103,,,9,07/01/2014,,"(33.9869, -118.275)"
4,0002862088-0001-0,ARTURO ALBERTO ALARCON RAMIREZ,,853 E 33RD STREET,LOS ANGELES,90011-2415,853 33RD 90011-2415,,,,,,9,01/01/2014,,


In [22]:
df_active = df_active.head(3)

In [12]:
# Your USPS API credentials
USPS_USER_ID = '234T2BRAVE141'

In [28]:
# Function to standardize an address using the USPS API
def standardize_address(address1, city, state, zip5):
    url = 'https://stg-secure.shippingapis.com/shippingapi.dll'
    payload = {
        'API': 'Verify',
        'XML': f"""
            <AddressValidateRequest USERID="{USPS_USER_ID}">
                <Address>
                    <Address1>{address1}</Address1>
                    <City>{city}</City>
                    <State>{state}</State>
                    <Zip5>{zip5}</Zip5>
                    <Zip4></Zip4>
                </Address>
            </AddressValidateRequest>
        """
    }

    print(f"Sending request to USPS API with payload: {payload['XML']}")
    
    response = requests.get(url, params=payload)
    
    if response.status_code == 200:
        print(f"USPS API Response: {response.content}")
        tree = ET.ElementTree(ET.fromstring(response.content))
        root = tree.getroot()
        address = root.find('Address')
        if address is not None:
            standardized_address = {
                'Address2': address.find('Address2').text if address.find('Address2') is not None else '',
                'City': address.find('City').text if address.find('City') is not None else '',
                'State': address.find('State').text if address.find('State') is not None else '',
                'Zip5': address.find('Zip5').text if address.find('Zip5') is not None else '',
                'Zip4': address.find('Zip4').text if address.find('Zip4') is not None else ''
            }
            return standardized_address
    print(f"Failed to standardize address: {address1}, {city}, {state}, {zip5}")
    return None

# Function to parse an address using usaddress and standardize using USPS API
def parse_and_standardize_address(row):
    try:
        # Combine the address components into a single string and convert to lower case
        street_address = f"{row['STREET ADDRESS']} {row['CITY']} {row['ZIP CODE']}".lower()
        print(f"Parsing address: {street_address}")
        parsed_address = usaddress.tag(street_address)
        if parsed_address[1] == 'Ambiguous':
            print(f"Ambiguous address: {street_address}")
            return None

        address_dict = parsed_address[0]
        
        # Extract address components
        address1 = f"{address_dict.get('AddressNumber', '')} {address_dict.get('StreetName', '')} {address_dict.get('StreetNamePostType', '')}".strip()
        city = row['CITY']
        state = address_dict.get('StateName', '') if 'StateName' in address_dict else ''
        zip5 = row['ZIP CODE']
        
        # Standardize the address using USPS API
        return standardize_address(address1, city, state, zip5)
    except usaddress.RepeatedLabelError as e:
        print(f"Error parsing address: {street_address} - {e}")
        return None

# # Sample data
# data = {
#     'STREET ADDRESS': [
#         '123 Main St',
#         '456 Elm St Apt 2B',
#         '789 Oak St'
#     ],
#     'CITY': ['Springfield', 'Springfield', 'Springfield'],
#     'ZIP CODE': ['62701', '62702', '62703']
# }

# # Create a DataFrame
# df_active = pd.DataFrame(data).copy()  # Make a copy of the DataFrame

# Apply the parse_and_standardize_address function to each row
df_active['StandardizedAddress'] = df_active.apply(parse_and_standardize_address, axis=1)

# Expand the standardized address into separate columns
standardized_df = df_active['StandardizedAddress'].apply(pd.Series)

# Concatenate the original and standardized dataframes
result_df = pd.concat([df_active, standardized_df], axis=1)

# Drop the temporary 'StandardizedAddress' column
result_df.drop(columns=['StandardizedAddress'], inplace=True)

# Display the result
result_df

Parsing address: 2010 la brea terrace los angeles 90046-2314
Sending request to USPS API with payload: 
            <AddressValidateRequest USERID="234T2BRAVE141">
                <Address>
                    <Address1>2010 la brea terrace</Address1>
                    <City>LOS ANGELES</City>
                    <State>angeles</State>
                    <Zip5>90046-2314</Zip5>
                    <Zip4></Zip4>
                </Address>
            </AddressValidateRequest>
        
USPS API Response: b'<?xml version="1.0" encoding="UTF-8"?>\n<Error><Number>80040B1A</Number><Description>Authorization failure.  Perhaps username and/or password is incorrect.</Description><Source>USPSCOM::DoAuth</Source></Error>'
Failed to standardize address: 2010 la brea terrace, LOS ANGELES, angeles, 90046-2314
Parsing address: 1000 n alameda street suite #230 los angeles 90012-1804
Sending request to USPS API with payload: 
            <AddressValidateRequest USERID="234T2BRAVE141">
              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_active['StandardizedAddress'] = df_active.apply(parse_and_standardize_address, axis=1)


Unnamed: 0,LOCATION ACCOUNT #,BUSINESS NAME,DBA NAME,STREET ADDRESS,CITY,ZIP CODE,LOCATION DESCRIPTION,MAILING ADDRESS,MAILING CITY,MAILING ZIP CODE,NAICS,PRIMARY NAICS DESCRIPTION,COUNCIL DISTRICT,LOCATION START DATE,LOCATION END DATE,LOCATION
0,0002829017-0001-5,RICHARD JOHN SHERMAN,,2010 LA BREA TERRACE,LOS ANGELES,90046-2314,2010 LA BREA 90046-2314,,,,,,4,04/19/2014,,
1,0000111620-0001-4,SOUTHERN CALIFORNIA GRANTMAKERS,,1000 N ALAMEDA STREET SUITE #230,LOS ANGELES,90012-1804,1000 ALAMEDA 90012-1804,,,,,,14,07/01/1984,,"(34.0593, -118.2361)"
2,0003293756-0001-5,BHI RESIDENTIAL LONG TERM CORPORATION,,732 S SPRING STREET APT #1021,LOS ANGELES,90014-3058,732 SPRING 90014-3058,,,,,,14,09/01/2021,,"(34.0435, -118.2527)"
