# Address Parser

Demonstrates how an LLM can recognize the named entities of a postal address and format its output according to a defined schema.

In [6]:
import os

# The project can be overridden through the GCP_PROJECT_NAME environment variable;
# the literal below is the fallback used when the variable is unset or empty.
GCP_PROJECT_NAME = os.environ.get("GCP_PROJECT_NAME") or "single-azimuth-413609"

In [None]:
!pip install --quiet --upgrade google-cloud-aiplatform 
!pip install --quiet --upgrade google-auth


In [None]:
# Interactive login that sets up Application Default Credentials with the OAuth scopes listed below.
# NOTE(review): the sqlservice.login and drive scopes do not appear to be used by any cell
# visible in this notebook — confirm they are really needed.
!gcloud auth application-default login --scopes=openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/drive

In [None]:
# Point the gcloud CLI at the project held in GCP_PROJECT_NAME ({...} is interpolated by IPython).
!gcloud config set project {GCP_PROJECT_NAME}

In [None]:
# Enable the aiplatform.googleapis.com service (Vertex AI) on the active gcloud project.
!gcloud services enable aiplatform.googleapis.com

In [7]:
import google.auth

# Resolve Application Default Credentials. The second return value is bound to a
# named throwaway instead of `_`, because assigning to `_` in IPython shadows the
# "last output" variable for the rest of the session.
# NOTE(review): `credentials` is not passed to any later call visible in this notebook.
credentials, _adc_project = google.auth.default()

In [None]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel

# Decoding settings unpacked into every `model.predict(...)` call below.
parameters = dict(
    candidate_count=1,
    max_output_tokens=126,
    temperature=0,
    top_p=1,
)

# Bind the SDK to the project/region, then load the text-bison model.
vertexai.init(project=GCP_PROJECT_NAME, location="us-central1")
model = TextGenerationModel.from_pretrained("text-bison")


## Basic French address

In [None]:
# Few-shot prompt: instructions, two worked examples, then the address to parse.
# Each example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 1, rue de la Paix, 75008 Paris
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

### Unusual French address

In [None]:
# Few-shot prompt for an address with a "bis" multiplier and an uncommon street type.
# Each example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 78 bis Cavee des Ecameaux 76500 Elbeuf
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

In [None]:
# Same target address as the previous cell, but with a single worked example.
# The example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 78 bis Cavee des Ecameaux 76500 Elbeuf
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

## Complex US address

In [None]:
# Few-shot prompt (French examples only) applied to a US address with a c/o line and a suite.
# Each example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: C/o John Doe LLC, 111, 8th Ave Ste 1509, Oklahoma, 74136-1922, USA
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

## German address 

Sample addresses were generated with: https://tarjeta-credito.net/cvv/address-gen-germany/

In [None]:
# Few-shot prompt (French examples only) applied to a German address that names its country.
# Each example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: Haßfurter Str. 1, 91056 Erlangen, Germany.
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

### German address with the country not mentioned

In [None]:
# Few-shot prompt whose examples also carry a "country" key, applied to a German
# address that does not state its country.
# Each example is a `Text:` line followed directly by its `JSON:` answer; the answer
# label must not be followed by an echo of the input text.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city and country
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}

Text: Auschnippe 9, 37170 Uslar
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

## Confidence level with an invalid address

In [None]:
# Probe the self-reported confidence: the text to parse is not a real street address.
# The prompt text is unchanged; the quotes simply no longer carry redundant backslashes.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: 
{
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: 
{
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "confidence": 1
}

Text: ici c'est paris !, 75001 Paris
JSON:

""",
    **parameters
)
print(response.text)

### Confidence level with an invalid address and the country not mentioned

In [None]:
# Probe the self-reported confidence on a made-up address with no country;
# the worked examples include a "country" key.
# The prompt text is unchanged; the quotes simply no longer carry redundant backslashes.
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: 
{
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: 
{
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}

Text: 452 quater grande rue general 85000 le palais vendee
JSON:

""",
    **parameters
)
print(response.text)


# Programmatic

Using LangChain, we can build the same few-shot prompt programmatically.

In [None]:
!pip install --upgrade --quiet  langchain-core langchain-google-vertexai

In [None]:
from langchain_google_vertexai import VertexAI

# LangChain wrapper configured with the same decoding settings as `parameters` above.
# TODO: candidate_count is not passed here — check whether the wrapper needs an equivalent.
model = VertexAI(
    model_name="text-bison",
    temperature=0,
    top_p=1,
    max_output_tokens=126,
)

In [23]:

from langchain_core.pydantic_v1 import BaseModel
from typing import Optional

from langchain_core.output_parsers import JsonOutputParser


class Address(BaseModel):
    """Structured postal address split into its components.

    Every component is optional and defaults to ``None`` when it is absent;
    ``confidence`` is required.
    """
    street_number: Optional[str] = None
    street_multiplier: Optional[str] = None  # e.g. "bis", "ter"
    street_type: Optional[str] = None
    street_name: Optional[str] = None
    locality: Optional[str] = None  # lieu-dit
    zip_code: Optional[str] = None
    city: Optional[str] = None
    country: Optional[str] = None
    # The prompts ask for a rating between 0 and 1, so fractional values must be
    # representable; an `int` field would truncate or reject a value such as 0.8
    # (depending on the pydantic version).
    confidence: float


# Output parser placed at the end of the chain; its result is unpacked into `Address(**...)`
# by the cells below. `Address` is handed over as the schema object.
parser = JsonOutputParser(pydantic_object=Address)
# Debug helper: show the format instructions derived from the schema.
# pprint(parser.get_format_instructions())


In [None]:
# Both prompt classes are imported from langchain_core, which is the package this
# notebook installs; the `langchain` package itself is never installed here.
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Worked examples shown to the model. Literal braces in the answers are doubled
# ({{ }}) so that the final template formatting emits them as plain braces instead
# of treating them as placeholders.
examples = [
    {
        "address": "37, rue du champ du pardon, 76000 rouen",
        "answer": """{{
  "street_number": "37",
  "street_type": "rue",
  "street_name": "du champ du pardon",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}}"""
    },
    {
        "address": "10-82 Cavée St Gervais, hameau de charette 76000 rouen",
        "answer": """{{
  "street_number": "10-82",
  "street_type": "Cavée",
  "street_name": "St Gervais",
  "locality": "hameau de charette",
  "zip_code": "76000",
  "city": "rouen",
  "country": "France",
  "confidence": 1
}}"""
    },
]

# Instructions placed before the examples (same wording as the hand-written prompts).
prefix = """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input"""

# Layout of one worked example. The parser's format instructions are deliberately
# not injected into the prompt.
example_prompt = PromptTemplate(
    template="""Text: {address}\nJSON: {answer}""",
    input_variables=["address", "answer"],
)

# Full prompt: prefix, the rendered examples, then the address to parse with an open "JSON:" label.
prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix="Text: {address}\nJSON:",
    input_variables=["address"],
)

# Debug helper: render the final prompt for one address.
# print(prompt.format(address="78 bis Cavee des Ecameaux 76500 Elbeuf"))

# prompt -> model -> JSON parser
chain = prompt | model | parser

## Manual Testing

In [None]:

# Run the chain on a French address with a "bis" multiplier and load the result into the Address model.
address = "1 bis rue de la Paix, 75008 Paris"
Address(**chain.invoke({"address": address}))


In [None]:

# Run the chain on a German address that does not state its country.
address = "Auschnippe 9, 37170 Uslar"
Address(**chain.invoke({"address": address}))


In [None]:

# Run the chain on a US address; the parsed object is kept in `add` because the
# serialization/compression cell below measures it.
address = "C/o John Doe LLC, 111, 8th Ave Ste 1509, Oklahoma, 74136-1922, USA"
add = Address(**chain.invoke({"address": address}))
add


### Serialize / Compress

In [5]:
import sys
from pprint import pprint
import zlib

# Sizes in bytes of: the model instance, its JSON string, and the zlib-compressed JSON.
# NOTE(review): sys.getsizeof reports only the object's own footprint, not the objects
# it references, so the first figure is not directly comparable with the other two.
print(sys.getsizeof(add))
print(sys.getsizeof(add.json()))
print(sys.getsizeof(zlib.compress(add.json().encode())))

NameError: name 'add' is not defined

## Auto-Testing

Prerequisite: install the dependencies (pandas, pyarrow, etc.).

French addresses are taken from the Base Adresse Nationale (BAN), the French national address database.

#### Exploratory Data Analysis & Cleanup

In [35]:
import pandas as pd
from pprint import pprint
import numpy as np

# Load the cleaned address extract from the local ./dataset folder (path is relative to the notebook's working directory).
df_addresses = pd.read_parquet('./dataset/adresses-france-cleaned.parquet')


In [36]:

df_addresses.shape

(3393705, 7)

In [37]:
df_addresses.head(100)

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,01001_4b50r5_00630,630,,la Chèvre,01400,L'Abergement-Clémenciat,
1,01001_g0ru02_00108,108,,Clemencia,01400,L'Abergement-Clémenciat,
2,01001_ngzlqw_00009,9,,Imp des Epis,01400,L'Abergement-Clémenciat,
12,01001_0165_00019,19,,Route de la Fontaine,01400,L'Abergement-Clémenciat,
43,01001_0165_00080_bis,80,bis,Route de la Fontaine,01400,L'Abergement-Clémenciat,
...,...,...,...,...,...,...,...
576,01002_0140_00008,8,,Chemin du Lavoir,01640,L'Abergement-de-Varey,
586,01002_0095_00037,37,,Ruelle de la Flotiere,01640,L'Abergement-de-Varey,
591,01002_0295_00020,20,,Route des Vignes,01640,L'Abergement-de-Varey,
598,01002_0145_00002,2,,Chemin Louis Lumiere,01640,L'Abergement-de-Varey,


In [38]:
def dataframe_row_to_pydantic_object(row: pd.Series) -> Address:
    """
    Build an ``Address`` from one row of the BAN DataFrame.

    Args:
        row (pd.Series): A single row; the columns ``numero``, ``rep``, ``nom_voie``,
            ``code_postal`` and ``nom_commune`` are read.
    Returns:
        Address: The populated model with ``confidence`` fixed to 1. ``street_type``,
        ``locality`` and ``country`` are not set.
    """
    # street_type is not filled: the row has no separate column for it.
    fields = {
        'street_number': str(row['numero']),
        'street_multiplier': row['rep'],
        'street_name': row['nom_voie'],
        'zip_code': str(row['code_postal']),
        'city': row['nom_commune'],
        'confidence': 1,
    }
    return Address(**fields)


In [39]:
dataframe_row_to_pydantic_object(df_addresses.iloc[0])

Address(street_number='630', street_multiplier='', street_type=None, street_name='la Chèvre', locality=None, zip_code='01400', city="L'Abergement-Clémenciat", country=None, confidence=1)

In [40]:
def full_address_generator(row: pd.Series) -> str:
    """
    Generates a one line address based on all the available fields from BAN.

    The input row is left untouched.

    :param row: pd.Series of one address from BAN; reads numero, rep, nom_voie, nom_ld,
                code_postal and nom_commune
    :return: the non-empty fields joined by single spaces, a non-empty locality being
             prefixed with ', '
    """
    locality = row['nom_ld']
    if locality != '':
        # todo here we are cheating a little bit: a comma is inserted to mark the locality
        # (lieu-dit), otherwise it would be very difficult to tell apart, even for a human.
        # The prefix is applied to a local copy so the caller's row is not modified.
        locality = ', ' + locality
    parts = [row['numero'], row['rep'], row['nom_voie'], locality, row['code_postal'], row['nom_commune']]
    # Stringify every field, then drop the empty ones before joining.
    return ' '.join(filter(None, map(str, parts)))

In [41]:

# Spot-check the generator on one arbitrary row: show the raw fields, then the one-line address built from them.
index = 4568
pprint(df_addresses.iloc[index])
full_address_generator(df_addresses.iloc[index])

id               01408_0146_00091
numero                         91
rep                              
nom_voie         Chemin sur Roche
code_postal                 01250
nom_commune    Simandre-sur-Suran
nom_ld                           
Name: 34823, dtype: object


'91 Chemin sur Roche 01250 Simandre-sur-Suran'

Add the one-line address to the DataFrame.

In [42]:
# Build the one-line address of every row and store it in the 'full_add' column.
df_addresses['full_add'] = df_addresses.apply(full_address_generator, axis=1)

In [45]:
df_addresses.head()

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld,full_add
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,01001_4b50r5_00630,630,,la Chèvre,1400,L'Abergement-Clémenciat,,630 la Chèvre 01400 L'Abergement-Clémenciat
1,01001_g0ru02_00108,108,,Clemencia,1400,L'Abergement-Clémenciat,,108 Clemencia 01400 L'Abergement-Clémenciat
2,01001_ngzlqw_00009,9,,Imp des Epis,1400,L'Abergement-Clémenciat,,9 Imp des Epis 01400 L'Abergement-Clémenciat
12,01001_0165_00019,19,,Route de la Fontaine,1400,L'Abergement-Clémenciat,,19 Route de la Fontaine 01400 L'Abergement-Clé...
43,01001_0165_00080_bis,80,bis,Route de la Fontaine,1400,L'Abergement-Clémenciat,,80 bis Route de la Fontaine 01400 L'Abergement...


### Scoring on large dataset

In [46]:
def score_ban_from_split(split: Address, ban: pd.Series, _print: bool = False) -> tuple[int, int]:
    """
    Defines a score to compare an Address object and a pd.Series of BAN

    Six components are compared: street number, multiplier, street (type + name),
    locality, zip code and city. Multiplier and locality also count as matching when
    both sides are empty.

    :param split: Address object
    :param ban: pd.Series row of BAN Dataframe
    :param _print: print the input objects (debug purposes)
    :return: score from 0 to 6 and _score one hot encoded from 0 to 111111
             (one decimal digit per component, street number in the units place)
    """
    if _print:
        print(split)
        print(ban)
    # street_type and street_name are both optional: join only the parts that are
    # present, so a missing type no longer raises and no stray space is produced.
    street = ' '.join(filter(None, (split.street_type, split.street_name)))
    # Order matters: position i contributes 10**i to the encoded score.
    checks = (
        split.street_number == str(ban['numero']),
        (not (split.street_multiplier or ban['rep'])) or split.street_multiplier == str(ban['rep']),
        street == str(ban['nom_voie']),
        (not (split.locality or ban['nom_ld'])) or split.locality == ban['nom_ld'],
        split.zip_code == str(ban['code_postal']),
        split.city == str(ban['nom_commune']),
    )
    _score = sum(10 ** position for position, matched in enumerate(checks) if matched)
    return sum(1 for matched in checks if matched), _score


In [47]:
# Spot-check: parse the one-line address of a single row with the chain and score the
# result against that row (the third argument turns on debug printing of both inputs).
row = df_addresses.iloc[6892]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

NameError: name 'chain' is not defined

In [None]:
# Spot-check on another row: parse its one-line address and score it, with debug printing.
row = df_addresses.iloc[4564]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
# Spot-check on another row: parse its one-line address and score it, with debug printing.
row = df_addresses.iloc[5521]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
# Spot-check on another row: parse its one-line address and score it, with debug printing.
row = df_addresses.iloc[11123]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
# Spot-check on another row: parse its one-line address and score it, with debug printing.
row = df_addresses.iloc[5546]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
# `_print` is a bool flag: pass True, like the sibling spot-check cells.
score = score_ban_from_split(split_adr, row, True)
score

#### Score Compute

We score only a sample of the dataset in order to limit the cost of the model calls.
We expect the 'score' column to contain the maximum possible value, i.e. (6, 111111), for every correctly parsed address.

In [32]:
df_addresses.shape

(3393705, 11)

In [48]:
# Draw the evaluation sample with a fixed seed so the same 200 rows (and therefore
# comparable scores) are obtained on every run.
df_test = df_addresses.sample(200, random_state=42).copy()
df_test.shape

(200, 8)

In [49]:
df_test.head(200)

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld,full_add
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
54684,84007_6630_00011_ter,11,ter,Pl des Trois-Pilats,84000,Avignon,,11 ter Pl des Trois-Pilats 84000 Avignon
368587,33473_58bn1m_00001_bis,1,bis,Drouin,33920,Saint-Savin,,1 bis Drouin 33920 Saint-Savin
196009,27008_0050_00001,1,,Rue des Sablons,27460,Alizay,,1 Rue des Sablons 27460 Alizay
245047,36145_3oge9x_00001,1,,La Blanchetterie,36290,Obterre,,1 La Blanchetterie 36290 Obterre
244272,77201_0060_00014_bis,14,bis,Place de l’Eglise,77370,Gastins,,14 bis Place de l’Eglise 77370 Gastins
...,...,...,...,...,...,...,...,...
424360,59523_nj44rs_00001,1,,Residence des 4 Cantons,59262,Sainghin-en-Mélantois,,1 Residence des 4 Cantons 59262 Sainghin-en-Mé...
454815,61460_0033_00001,1,,Rue Jean Moulin,61470,Sap-en-Auge,le Sap,"1 Rue Jean Moulin , le Sap 61470 Sap-en-Auge"
359666,06152_0295_00020_a,20,a,Avenue Saint Roch,06560,Valbonne,,20 a Avenue Saint Roch 06560 Valbonne
276719,31557742_00004_a,4,a,Allée des Frères Higouneng,31170,Tournefeuille,,4 a Allée des Frères Higouneng 31170 Tournefeu...


In [None]:
def score_row(_row : pd.Series) -> tuple[int, int]:
    """
    Parse the one-line address of a row with the chain and score it against that same row.

    :param _row: pd.Series row holding the 'full_add' column plus the fields read by score_ban_from_split
    :return: the (score, one hot encoded score) tuple returned by score_ban_from_split
    """
    # Use the row passed in by `apply`, not the notebook-level `row` left over from the
    # spot-check cells (which would score the same address for every row).
    _split_adr = Address(**chain.invoke({"address": _row['full_add']}))
    return score_ban_from_split(_split_adr, _row)

In [None]:
# Score every sampled row (one model call per row) and keep the result in the 'score' column.
df_test['score'] = df_test.apply(score_row, axis=1)

In [None]:
df_test.head(100)