In [1]:
# for manipulating dataframes
import pandas as pd

import usaddress
from fuzzywuzzy import fuzz

# show the value of every top-level expression in a cell, not only the last
# one (later cells rely on this to display two values from a single cell)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# pandas display options: None lifts the limit on the number of columns and
# rows shown and on the width of each column, so frames are rendered in full
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# load the business listings CSV into df_active; the path is relative to the
# notebook's working directory
df_active = pd.read_csv('../data/out/filtered_los_angeles_900.csv')

In [3]:
def parse_and_standardize_address(address):
    """
    Tag a free-form US address with usaddress and return its components.

    Parameters
    ----------
    address : str
        Raw address text, e.g. '1000 N ALAMEDA STREET SUITE #230'.

    Returns
    -------
    dict or None
        Mapping of usaddress labels (such as 'AddressNumber' or 'StreetName')
        to the matching piece of text. None is returned when the value is not
        text at all (for example NaN from a blank CSV cell) or when usaddress
        raises RepeatedLabelError for it.
    """
    # usaddress tokenises with regular expressions and raises on non-text
    # input, so one blank cell would otherwise abort the whole .apply() call.
    # Treat such values like any other address that could not be parsed.
    if not isinstance(address, (str, bytes)):
        return None
    try:
        # tag() returns a pair; only its first element (the components) is kept
        components = usaddress.tag(address)[0]
    except usaddress.RepeatedLabelError:
        return None
    return dict(components)

In [4]:
# parse every street address of the active dataset into a dict of usaddress
# components (None where the address could not be parsed)
df_active['StandardizedAddress'] = df_active['STREET ADDRESS'].apply(parse_and_standardize_address)

# expand the component dicts into one column per usaddress label
standardized_df = df_active['StandardizedAddress'].apply(pd.Series)

# keep only the components used for matching.
# reindex() is used instead of standardized_df[[...]] because plain selection
# raises KeyError when a label never occurs in the data (e.g. no address has a
# 'StreetNamePreType'); reindex() yields an all-NaN column in that case and is
# otherwise identical.
dfx_active = standardized_df.reindex(columns=[
    'AddressNumber',
    'StreetName',
    'StreetNamePostType',
    'StreetNamePreDirectional',
    'OccupancyType',
    'OccupancyIdentifier',
    'StreetNamePreType',
])

In [5]:
# put the parsed address components next to the original columns
# (column-wise concatenation, rows aligned on the index)
result_active_df = pd.concat(objs=[df_active, dfx_active], axis='columns')

# show the combined frame
result_active_df

Unnamed: 0,LOCATION ACCOUNT #,BUSINESS NAME,DBA NAME,STREET ADDRESS,CITY,ZIP CODE,LOCATION DESCRIPTION,MAILING ADDRESS,MAILING CITY,MAILING ZIP CODE,NAICS,PRIMARY NAICS DESCRIPTION,COUNCIL DISTRICT,LOCATION START DATE,LOCATION END DATE,LOCATION,StandardizedAddress,AddressNumber,StreetName,StreetNamePostType,StreetNamePreDirectional,OccupancyType,OccupancyIdentifier,StreetNamePreType
0,0002829017-0001-5,RICHARD JOHN SHERMAN,,2010 LA BREA TERRACE,LOS ANGELES,90046-2314,2010 LA BREA 90046-2314,,,,,,4,04/19/2014,,,"{'AddressNumber': '2010', 'StreetName': 'LA BREA', 'StreetNamePostType': 'TERRACE'}",2010,LA BREA,TERRACE,,,,
1,0000111620-0001-4,SOUTHERN CALIFORNIA GRANTMAKERS,,1000 N ALAMEDA STREET SUITE #230,LOS ANGELES,90012-1804,1000 ALAMEDA 90012-1804,,,,,,14,07/01/1984,,"(34.0593, -118.2361)","{'AddressNumber': '1000', 'StreetNamePreDirectional': 'N', 'StreetName': 'ALAMEDA', 'StreetNamePostType': 'STREET', 'OccupancyType': 'SUITE', 'OccupancyIdentifier': '# 230'}",1000,ALAMEDA,STREET,N,SUITE,# 230,
2,0003293756-0001-5,BHI RESIDENTIAL LONG TERM CORPORATION,,732 S SPRING STREET APT #1021,LOS ANGELES,90014-3058,732 SPRING 90014-3058,,,,,,14,09/01/2021,,"(34.0435, -118.2527)","{'AddressNumber': '732', 'StreetNamePreDirectional': 'S', 'StreetName': 'SPRING', 'StreetNamePostType': 'STREET', 'OccupancyType': 'APT', 'OccupancyIdentifier': '# 1021'}",732,SPRING,STREET,S,APT,# 1021,
3,0002774873-0001-4,ISAIAH C. WILLIS III,,153 W 59TH STREET,LOS ANGELES,90003-1103,153 59TH 90003-1103,153 W 59TH STREET,LOS ANGELES,90003-1103,,,9,07/01/2014,,"(33.9869, -118.275)","{'AddressNumber': '153', 'StreetNamePreDirectional': 'W', 'StreetName': '59TH', 'StreetNamePostType': 'STREET'}",153,59TH,STREET,W,,,
4,0002862088-0001-0,ARTURO ALBERTO ALARCON RAMIREZ,,853 E 33RD STREET,LOS ANGELES,90011-2415,853 33RD 90011-2415,,,,,,9,01/01/2014,,,"{'AddressNumber': '853', 'StreetNamePreDirectional': 'E', 'StreetName': '33RD', 'StreetNamePostType': 'STREET'}",853,33RD,STREET,E,,,
5,0002038010-0001-9,SABURO SHIMONO,,1661 ANGELUS AVENUE,LOS ANGELES,90026-1412,1661 ANGELUS 90026-1412,,,,,,13,01/01/2001,,,"{'AddressNumber': '1661', 'StreetName': 'ANGELUS', 'StreetNamePostType': 'AVENUE'}",1661,ANGELUS,AVENUE,,,,
6,0002977480-0001-4,ALUSTRETCH LA LLC,,120 W AVENUE 34,LOS ANGELES,90031-1804,120 AVENUE 34 90031-1804,,,,,,1,03/01/2017,,"(34.0839, -118.2145)","{'AddressNumber': '120', 'StreetNamePreDirectional': 'W', 'StreetNamePreType': 'AVENUE', 'StreetName': '34'}",120,34,,W,,,AVENUE
7,0003209018-0001-9,EL SOL FOODS GROUP,,127 E AVENUE 42,LOS ANGELES,90031-1520,127 Avenue 42 90031-1520,,,,,,1,07/20/2017,,"(34.0935, -118.2081)","{'AddressNumber': '127', 'StreetNamePreDirectional': 'E', 'StreetNamePreType': 'AVENUE', 'StreetName': '42'}",127,42,,E,,,AVENUE
8,0002810621-0001-9,NANCY'S CLEANING SERVICES,,1742 W 64TH STREET,LOS ANGELES,90047-1945,1742 64TH 90047,1742 W 64TH STREET,LOS ANGELES,90047-1945,,,8,02/15/2015,,"(33.9814, -118.3084)","{'AddressNumber': '1742', 'StreetNamePreDirectional': 'W', 'StreetName': '64TH', 'StreetNamePostType': 'STREET'}",1742,64TH,STREET,W,,,
9,0002196419-0001-1,SAMUEL CHEW,,3701 OBAMA BLVD,LOS ANGELES,90016-4867,3701 OBAMA 90016-4867,,,,,,10,05/03/2005,,"(34.0216, -118.3366)","{'AddressNumber': '3701', 'StreetName': 'OBAMA', 'StreetNamePostType': 'BLVD'}",3701,OBAMA,BLVD,,,,


In [6]:
# load the LEI records CSV into df_lei; the path is relative to the notebook's
# working directory
df_lei = pd.read_csv('../data/out/lei_records_los_angeles.csv')

In [7]:
# parse every LEI address into a dict of usaddress components (None where the
# address could not be parsed)
# NOTE(review): the 'Address' values look like stringified Python lists (e.g.
# "['977 N. Broadway']"), so quote/bracket characters end up inside the parsed
# components; clean_street_address strips them later. Consider unpacking the
# list before tagging - confirm against the upstream export.
df_lei['StandardizedAddress'] = df_lei['Address'].apply(parse_and_standardize_address)

# expand the component dicts into one column per usaddress label
# (this rebinds standardized_df, which previously held the active dataset)
standardized_df = df_lei['StandardizedAddress'].apply(pd.Series)

# keep only the components used for matching.
# reindex() is used instead of standardized_df[[...]] because plain selection
# raises KeyError when a label never occurs in the data; reindex() yields an
# all-NaN column in that case and is otherwise identical.
dfx_lei = standardized_df.reindex(columns=[
    'AddressNumber',
    'StreetName',
    'StreetNamePostType',
    'StreetNamePreDirectional',
    'OccupancyType',
    'OccupancyIdentifier',
    'StreetNamePreType',
])

In [8]:
# put the parsed address components next to the original LEI columns
# (column-wise concatenation, rows aligned on the index)
result_lei_df = pd.concat(objs=[df_lei, dfx_lei], axis='columns')

# show the combined frame
result_lei_df

Unnamed: 0,LEI,Address,StandardizedAddress,AddressNumber,StreetName,StreetNamePostType,StreetNamePreDirectional,OccupancyType,OccupancyIdentifier,StreetNamePreType
0,549300E7TO710PTXPJ65,"['200 North Main Street', 'Room 1500']","{'AddressNumber': '200', 'StreetNamePreDirectional': 'North', 'StreetName': 'Main', 'StreetNamePostType': 'Street'', 'OccupancyType': 'Room', 'OccupancyIdentifier': '1500']'}",200,Main,Street',North,Room,1500'],
1,Y5KB6ZGYM7NRD34XZ729,['977 N. Broadway'],"{'AddressNumber': '977', 'StreetNamePreDirectional': 'N.', 'StreetName': 'Broadway']'}",977,Broadway'],,N.,,,
2,549300DZCL1LRBNVU327,"['11150 Santa Monica Boulevard', 'Suite 200']","{'AddressNumber': '11150', 'StreetName': 'Santa Monica', 'StreetNamePostType': 'Boulevard'', 'OccupancyType': 'Suite', 'OccupancyIdentifier': '200']'}",11150,Santa Monica,Boulevard',,Suite,200'],
3,54930030X6ZC2PN42H14,"['500 West Temple Street, Room 432']","{'AddressNumber': '500', 'StreetNamePreDirectional': 'West', 'StreetName': 'Temple', 'StreetNamePostType': 'Street', 'OccupancyType': 'Room', 'OccupancyIdentifier': '432']'}",500,Temple,Street,West,Room,432'],
4,89450019KJ013KL4LO06,['944 AIROLE WAY'],"{'AddressNumber': '944', 'StreetName': 'AIROLE', 'StreetNamePostType': 'WAY']'}",944,AIROLE,WAY'],,,,
5,254900DZ31Y0FLS1PM48,['2721 Jalmia Drive'],"{'AddressNumber': '2721', 'StreetName': 'Jalmia', 'StreetNamePostType': 'Drive']'}",2721,Jalmia,Drive'],,,,
6,254900LV4ZPRTX071003,['616 Nimes Road'],"{'AddressNumber': '616', 'StreetName': 'Nimes', 'StreetNamePostType': 'Road']'}",616,Nimes,Road'],,,,
7,254900Y74YW9IEIU4E45,['550 N Larchmont'],"{'AddressNumber': '550', 'StreetNamePreDirectional': 'N', 'StreetName': 'Larchmont']'}",550,Larchmont'],,N,,,
8,549300AOB2UPZM3W5J86,['2108 Stratford Circle'],"{'AddressNumber': '2108', 'StreetName': 'Stratford', 'StreetNamePostType': 'Circle']'}",2108,Stratford,Circle'],,,,
9,549300KBB52D6WDE6I72,['3604 LANKERSHIM'],"{'AddressNumber': '3604', 'StreetName': 'LANKERSHIM']'}",3604,LANKERSHIM'],,,,,


In [9]:
# function to clean and standardize street addresses
def clean_street_address(address):
    """
    Normalise a single address component so two spellings can be compared.

    Missing values (NaN / None) become an empty string. Anything else is
    converted to text, lower-cased, stripped of every character that is
    neither alphanumeric nor whitespace, and trimmed at both ends.
    """
    if pd.isna(address):
        return ''

    lowered = str(address).lower()
    # keep letters, digits and whitespace; drop punctuation such as # ' ] .
    kept = [ch for ch in lowered if ch.isalnum() or ch.isspace()]
    return ''.join(kept).strip()

In [10]:
# address components that are normalised here and later joined into the
# matching key ('OccupancyType' is not part of this list)
columns_to_clean = ['AddressNumber', 'StreetName', 'StreetNamePostType', 'StreetNamePreDirectional', 'OccupancyIdentifier', 'StreetNamePreType']

# normalise the same components in both datasets: lower-case, punctuation
# removed, missing values turned into ''
for frame in (result_active_df, result_lei_df):
    for column in columns_to_clean:
        frame[column] = frame[column].apply(clean_street_address)

In [11]:
def get_values(df, columns):
    """
    Print the value counts of each requested column of a dataframe.

    For every name in `columns` the name, a separator rule and the counts of
    the column's distinct values (missing values included) are printed,
    followed by blank lines. Nothing is returned.
    """
    rule = '====================================='
    for name in columns:
        print(name, rule, sep='\n')
        # dropna=False so that missing values get their own row in the counts
        print(df[name].value_counts(dropna=False), end='\n\n\n')

def show_values(df, param):
    """
    Print value counts for the selected columns of `df`.

    `param` is either the string 'all', meaning every column of `df`, or a
    list of column names. The printing itself is delegated to get_values.
    """
    selected = df.columns if param == 'all' else param
    get_values(df, selected)

In [12]:
# baseline for the active dataset: which spellings of post types and
# pre-directionals occur after cleaning, and how often
show_values(result_active_df, ['StreetNamePostType', 'StreetNamePreDirectional'])

StreetNamePostType
StreetNamePostType
street     304
avenue     256
blvd       197
place       40
drive       37
            31
way         11
road         9
terrace      6
lane         4
circle       2
plaza        1
trail        1
park         1
Name: count, dtype: int64


StreetNamePreDirectional
StreetNamePreDirectional
     309
s    221
w    179
n    110
e     81
Name: count, dtype: int64




In [13]:
# baseline for the LEI dataset: which spellings of post types and
# pre-directionals occur after cleaning, and how often
show_values(result_lei_df, ['StreetNamePostType', 'StreetNamePreDirectional'])

StreetNamePostType
StreetNamePostType
blvd         604
avenue       361
street       185
boulevard    175
             155
st            87
ave           71
drive         22
way            8
road           8
place          6
dr             6
center         6
rd             5
circle         5
park           4
court          2
lane           1
vista          1
cir            1
parkway        1
lvd            1
boulevar       1
pl             1
ln             1
bld            1
steet          1
Name: count, dtype: int64


StreetNamePreDirectional
StreetNamePreDirectional
         890
south    417
w        167
s        136
west      52
n         30
north     10
e         10
east       4
so         3
ave        1
Name: count, dtype: int64




In [14]:
# active dataset: expand the abbreviation 'blvd' to 'boulevard'; the \b word
# boundaries restrict the match to 'blvd' as a whole word
result_active_df['StreetNamePostType'] = result_active_df['StreetNamePostType'].str.replace(r'\bblvd\b', 'boulevard', regex=True)

In [15]:
# check: 'blvd' should no longer appear among the active post types
show_values(result_active_df, ['StreetNamePostType'])

StreetNamePostType
StreetNamePostType
street       304
avenue       256
boulevard    197
place         40
drive         37
              31
way           11
road           9
terrace        6
lane           4
circle         2
plaza          1
trail          1
park           1
Name: count, dtype: int64




In [16]:
# active pre-directionals before the abbreviations are spelled out
show_values(result_active_df, ['StreetNamePreDirectional'])

StreetNamePreDirectional
StreetNamePreDirectional
     309
s    221
w    179
n    110
e     81
Name: count, dtype: int64




In [17]:
# active dataset: spell out the single-letter compass abbreviations.
# The \b anchors make each pattern match only a stand-alone letter; the
# replacements are applied one after another in the listed order.
active_predirectional = result_active_df['StreetNamePreDirectional']
for pattern, full_word in [
    (r'\bn\b', 'north'),
    (r'\be\b', 'east'),
    (r'\bw\b', 'west'),
    (r'\bs\b', 'south'),
]:
    active_predirectional = active_predirectional.str.replace(pattern, full_word, regex=True)
result_active_df['StreetNamePreDirectional'] = active_predirectional

In [18]:
# check: active pre-directionals should now be full words only
show_values(result_active_df, ['StreetNamePreDirectional'])

StreetNamePreDirectional
StreetNamePreDirectional
         309
south    221
west     179
north    110
east      81
Name: count, dtype: int64




In [19]:
# LEI post types before normalisation (abbreviations and misspellings included)
show_values(result_lei_df, ['StreetNamePostType'])

StreetNamePostType
StreetNamePostType
blvd         604
avenue       361
street       185
boulevard    175
             155
st            87
ave           71
drive         22
way            8
road           8
place          6
dr             6
center         6
rd             5
circle         5
park           4
court          2
lane           1
vista          1
cir            1
parkway        1
lvd            1
boulevar       1
pl             1
ln             1
bld            1
steet          1
Name: count, dtype: int64




In [20]:
# LEI dataset: normalise street post types - expand abbreviations and repair
# the misspellings seen in the value counts ('lvd', 'boulevar', 'bld', 'steet').
# The \b anchors make each pattern match whole words only; the replacements
# are applied one after another in the listed order.
lei_post_type = result_lei_df['StreetNamePostType']
for pattern, full_word in [
    (r'\bblvd\b', 'boulevard'),
    (r'\bst\b', 'street'),
    (r'\bave\b', 'avenue'),
    (r'\bdr\b', 'drive'),
    (r'\brd\b', 'road'),
    (r'\bcir\b', 'circle'),
    (r'\blvd\b', 'boulevard'),
    (r'\bboulevar\b', 'boulevard'),
    (r'\bpl\b', 'place'),
    (r'\bln\b', 'lane'),
    (r'\bbld\b', 'boulevard'),
    (r'\bsteet\b', 'street'),
]:
    lei_post_type = lei_post_type.str.replace(pattern, full_word, regex=True)
result_lei_df['StreetNamePostType'] = lei_post_type

In [21]:
# check: LEI post types after normalisation
show_values(result_lei_df, ['StreetNamePostType'])

StreetNamePostType
StreetNamePostType
boulevard    782
avenue       432
street       273
             155
drive         28
road          13
way            8
place          7
circle         6
center         6
park           4
court          2
lane           2
vista          1
parkway        1
Name: count, dtype: int64




In [22]:
# LEI pre-directionals before the abbreviations are spelled out
show_values(result_lei_df, ['StreetNamePreDirectional'])

StreetNamePreDirectional
StreetNamePreDirectional
         890
south    417
w        167
s        136
west      52
n         30
north     10
e         10
east       4
so         3
ave        1
Name: count, dtype: int64




In [23]:
# LEI dataset: spell out the compass abbreviations, including the 'so'
# variant of south. The \b anchors make each pattern match whole words only;
# the replacements are applied one after another in the listed order.
lei_predirectional = result_lei_df['StreetNamePreDirectional']
for pattern, full_word in [
    (r'\bn\b', 'north'),
    (r'\be\b', 'east'),
    (r'\bw\b', 'west'),
    (r'\bs\b', 'south'),
    (r'\bso\b', 'south'),
]:
    lei_predirectional = lei_predirectional.str.replace(pattern, full_word, regex=True)
result_lei_df['StreetNamePreDirectional'] = lei_predirectional

In [24]:
# check: LEI pre-directionals after expansion (values that are not compass
# abbreviations are left as they are)
show_values(result_lei_df, ['StreetNamePreDirectional'])

StreetNamePreDirectional
StreetNamePreDirectional
         890
south    556
west     219
north     40
east      14
ave        1
Name: count, dtype: int64




In [25]:
# active dataset after cleaning and normalising the address components
result_active_df

Unnamed: 0,LOCATION ACCOUNT #,BUSINESS NAME,DBA NAME,STREET ADDRESS,CITY,ZIP CODE,LOCATION DESCRIPTION,MAILING ADDRESS,MAILING CITY,MAILING ZIP CODE,NAICS,PRIMARY NAICS DESCRIPTION,COUNCIL DISTRICT,LOCATION START DATE,LOCATION END DATE,LOCATION,StandardizedAddress,AddressNumber,StreetName,StreetNamePostType,StreetNamePreDirectional,OccupancyType,OccupancyIdentifier,StreetNamePreType
0,0002829017-0001-5,RICHARD JOHN SHERMAN,,2010 LA BREA TERRACE,LOS ANGELES,90046-2314,2010 LA BREA 90046-2314,,,,,,4,04/19/2014,,,"{'AddressNumber': '2010', 'StreetName': 'LA BREA', 'StreetNamePostType': 'TERRACE'}",2010,la brea,terrace,,,,
1,0000111620-0001-4,SOUTHERN CALIFORNIA GRANTMAKERS,,1000 N ALAMEDA STREET SUITE #230,LOS ANGELES,90012-1804,1000 ALAMEDA 90012-1804,,,,,,14,07/01/1984,,"(34.0593, -118.2361)","{'AddressNumber': '1000', 'StreetNamePreDirectional': 'N', 'StreetName': 'ALAMEDA', 'StreetNamePostType': 'STREET', 'OccupancyType': 'SUITE', 'OccupancyIdentifier': '# 230'}",1000,alameda,street,north,SUITE,230,
2,0003293756-0001-5,BHI RESIDENTIAL LONG TERM CORPORATION,,732 S SPRING STREET APT #1021,LOS ANGELES,90014-3058,732 SPRING 90014-3058,,,,,,14,09/01/2021,,"(34.0435, -118.2527)","{'AddressNumber': '732', 'StreetNamePreDirectional': 'S', 'StreetName': 'SPRING', 'StreetNamePostType': 'STREET', 'OccupancyType': 'APT', 'OccupancyIdentifier': '# 1021'}",732,spring,street,south,APT,1021,
3,0002774873-0001-4,ISAIAH C. WILLIS III,,153 W 59TH STREET,LOS ANGELES,90003-1103,153 59TH 90003-1103,153 W 59TH STREET,LOS ANGELES,90003-1103,,,9,07/01/2014,,"(33.9869, -118.275)","{'AddressNumber': '153', 'StreetNamePreDirectional': 'W', 'StreetName': '59TH', 'StreetNamePostType': 'STREET'}",153,59th,street,west,,,
4,0002862088-0001-0,ARTURO ALBERTO ALARCON RAMIREZ,,853 E 33RD STREET,LOS ANGELES,90011-2415,853 33RD 90011-2415,,,,,,9,01/01/2014,,,"{'AddressNumber': '853', 'StreetNamePreDirectional': 'E', 'StreetName': '33RD', 'StreetNamePostType': 'STREET'}",853,33rd,street,east,,,
5,0002038010-0001-9,SABURO SHIMONO,,1661 ANGELUS AVENUE,LOS ANGELES,90026-1412,1661 ANGELUS 90026-1412,,,,,,13,01/01/2001,,,"{'AddressNumber': '1661', 'StreetName': 'ANGELUS', 'StreetNamePostType': 'AVENUE'}",1661,angelus,avenue,,,,
6,0002977480-0001-4,ALUSTRETCH LA LLC,,120 W AVENUE 34,LOS ANGELES,90031-1804,120 AVENUE 34 90031-1804,,,,,,1,03/01/2017,,"(34.0839, -118.2145)","{'AddressNumber': '120', 'StreetNamePreDirectional': 'W', 'StreetNamePreType': 'AVENUE', 'StreetName': '34'}",120,34,,west,,,avenue
7,0003209018-0001-9,EL SOL FOODS GROUP,,127 E AVENUE 42,LOS ANGELES,90031-1520,127 Avenue 42 90031-1520,,,,,,1,07/20/2017,,"(34.0935, -118.2081)","{'AddressNumber': '127', 'StreetNamePreDirectional': 'E', 'StreetNamePreType': 'AVENUE', 'StreetName': '42'}",127,42,,east,,,avenue
8,0002810621-0001-9,NANCY'S CLEANING SERVICES,,1742 W 64TH STREET,LOS ANGELES,90047-1945,1742 64TH 90047,1742 W 64TH STREET,LOS ANGELES,90047-1945,,,8,02/15/2015,,"(33.9814, -118.3084)","{'AddressNumber': '1742', 'StreetNamePreDirectional': 'W', 'StreetName': '64TH', 'StreetNamePostType': 'STREET'}",1742,64th,street,west,,,
9,0002196419-0001-1,SAMUEL CHEW,,3701 OBAMA BLVD,LOS ANGELES,90016-4867,3701 OBAMA 90016-4867,,,,,,10,05/03/2005,,"(34.0216, -118.3366)","{'AddressNumber': '3701', 'StreetName': 'OBAMA', 'StreetNamePostType': 'BLVD'}",3701,obama,boulevard,,,,


In [26]:
# LEI dataset after cleaning and normalising the address components
result_lei_df

Unnamed: 0,LEI,Address,StandardizedAddress,AddressNumber,StreetName,StreetNamePostType,StreetNamePreDirectional,OccupancyType,OccupancyIdentifier,StreetNamePreType
0,549300E7TO710PTXPJ65,"['200 North Main Street', 'Room 1500']","{'AddressNumber': '200', 'StreetNamePreDirectional': 'North', 'StreetName': 'Main', 'StreetNamePostType': 'Street'', 'OccupancyType': 'Room', 'OccupancyIdentifier': '1500']'}",200,main,street,north,Room,1500,
1,Y5KB6ZGYM7NRD34XZ729,['977 N. Broadway'],"{'AddressNumber': '977', 'StreetNamePreDirectional': 'N.', 'StreetName': 'Broadway']'}",977,broadway,,north,,,
2,549300DZCL1LRBNVU327,"['11150 Santa Monica Boulevard', 'Suite 200']","{'AddressNumber': '11150', 'StreetName': 'Santa Monica', 'StreetNamePostType': 'Boulevard'', 'OccupancyType': 'Suite', 'OccupancyIdentifier': '200']'}",11150,santa monica,boulevard,,Suite,200,
3,54930030X6ZC2PN42H14,"['500 West Temple Street, Room 432']","{'AddressNumber': '500', 'StreetNamePreDirectional': 'West', 'StreetName': 'Temple', 'StreetNamePostType': 'Street', 'OccupancyType': 'Room', 'OccupancyIdentifier': '432']'}",500,temple,street,west,Room,432,
4,89450019KJ013KL4LO06,['944 AIROLE WAY'],"{'AddressNumber': '944', 'StreetName': 'AIROLE', 'StreetNamePostType': 'WAY']'}",944,airole,way,,,,
5,254900DZ31Y0FLS1PM48,['2721 Jalmia Drive'],"{'AddressNumber': '2721', 'StreetName': 'Jalmia', 'StreetNamePostType': 'Drive']'}",2721,jalmia,drive,,,,
6,254900LV4ZPRTX071003,['616 Nimes Road'],"{'AddressNumber': '616', 'StreetName': 'Nimes', 'StreetNamePostType': 'Road']'}",616,nimes,road,,,,
7,254900Y74YW9IEIU4E45,['550 N Larchmont'],"{'AddressNumber': '550', 'StreetNamePreDirectional': 'N', 'StreetName': 'Larchmont']'}",550,larchmont,,north,,,
8,549300AOB2UPZM3W5J86,['2108 Stratford Circle'],"{'AddressNumber': '2108', 'StreetName': 'Stratford', 'StreetNamePostType': 'Circle']'}",2108,stratford,circle,,,,
9,549300KBB52D6WDE6I72,['3604 LANKERSHIM'],"{'AddressNumber': '3604', 'StreetName': 'LANKERSHIM']'}",3604,lankershim,,,,,


In [27]:
# build the matching key 'X' for each dataset: the cleaned address components
# listed in columns_to_clean, joined in that order with single spaces.
# NOTE(review): empty components are joined too, so a key can contain runs of
# spaces or end with a space.
result_active_df['X'] = result_active_df[columns_to_clean].apply(' '.join, axis=1)
result_lei_df['X'] = result_lei_df[columns_to_clean].apply(' '.join, axis=1)

In [28]:
# For every active address key, find the most similar LEI address key and
# keep the pair when its fuzz.ratio score is strictly above the threshold.
# The result has one row per active address that found a match, in the order
# of result_active_df.

# similarity score (fuzz.ratio, 0-100) a pair must exceed to count as a match
MATCH_THRESHOLD = 90

# pull the LEI keys and identifiers out of the frame once, instead of
# iterating over its rows again for every active address
lei_candidates = list(zip(result_lei_df['X'], result_lei_df['LEI']))

# matches are collected as plain dicts and turned into a DataFrame once at the
# end; growing a DataFrame row by row with pd.concat re-copies it on every match
match_records = []

for active_key in result_active_df['X']:
    best = None

    for lei_key, lei_id in lei_candidates:
        score = fuzz.ratio(active_key, lei_key)

        # strict '>' on the running best keeps the first LEI row on ties, so
        # the chosen match does not depend on sort internals
        if score > MATCH_THRESHOLD and (best is None or score > best["SCORE"]):
            best = {
                "STREET": active_key,
                "ADDRESS": lei_key,
                "SCORE": score,
                "LEI": lei_id,
            }

    if best is not None:
        match_records.append(best)

# columns= keeps the expected columns even when nothing matched
matches = pd.DataFrame(match_records, columns=["STREET", "ADDRESS", "SCORE", "LEI"])

matches

Unnamed: 0,STREET,ADDRESS,SCORE,LEI
0,1000 alameda street north 230,1000 alameda street north,95,54930046FICHFANLFV69
1,3700 wilshire boulevard 630,3700 wilshire boulevard 330,97,549300DCGBXW5FJMV921
2,10918 figueroa street south,601 figueroa street south,93,5493007K6HGI9G0ECX16
3,5101 sunset boulevard west,9130 sunset boulevard west,93,254900U2FFXHLFM5EC50
4,12560 washington boulevard west,1667 washington boulevard west,92,2549007LW1QMWRBJJ212
5,155 barrington place south,125 barrington place south,96,254900R39MHIMV8AL811
6,610 main street south 411,610 main street south 344,92,984500B61B615CC73D25
7,9081 pico boulevard west,2818 pico boulevard west,92,98450054AF98A72B0F02
8,10635 santa monica boulevard 200,10635 santa monica boulevard 180,94,2549005N9FEI9UN51789
9,3600 wilshire boulevard 100 m,600 wilshire boulevard 500,92,254900UY19K9G9JP9T24


In [29]:
# look at one matched pair (position 6) as raw strings; the repr makes
# whitespace such as trailing spaces in the keys visible
matches['STREET'].iloc[6]
matches['ADDRESS'].iloc[6]

'610 main street south 411 '

'610 main street south 344 '

In [30]:
# hard-coded fix-up for one address in the matches table: the joined key
# reads '2000 of the stars    avenue'; rewrite it as '2000 avenue of the stars'
# in both the active ('STREET') and the LEI ('ADDRESS') column
for column_name in ('STREET', 'ADDRESS'):
    matches[column_name] = matches[column_name].str.replace('2000 of the stars    avenue', '2000 avenue of the stars')

In [31]:
# final table of matched address pairs
matches

Unnamed: 0,STREET,ADDRESS,SCORE,LEI
0,1000 alameda street north 230,1000 alameda street north,95,54930046FICHFANLFV69
1,3700 wilshire boulevard 630,3700 wilshire boulevard 330,97,549300DCGBXW5FJMV921
2,10918 figueroa street south,601 figueroa street south,93,5493007K6HGI9G0ECX16
3,5101 sunset boulevard west,9130 sunset boulevard west,93,254900U2FFXHLFM5EC50
4,12560 washington boulevard west,1667 washington boulevard west,92,2549007LW1QMWRBJJ212
5,155 barrington place south,125 barrington place south,96,254900R39MHIMV8AL811
6,610 main street south 411,610 main street south 344,92,984500B61B615CC73D25
7,9081 pico boulevard west,2818 pico boulevard west,92,98450054AF98A72B0F02
8,10635 santa monica boulevard 200,10635 santa monica boulevard 180,94,2549005N9FEI9UN51789
9,3600 wilshire boulevard 100 m,600 wilshire boulevard 500,92,254900UY19K9G9JP9T24
