# Address Parser

Is able to recognize named entities and format outputs based on a defined schema

In [2]:
GCP_PROJECT_NAME = "single-azimuth-413609"

In [3]:
!pip install --quiet --upgrade google-cloud-aiplatform 
!pip install --quiet --upgrade google-auth


In [4]:
!gcloud auth application-default login --scopes=openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/drive

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=ohnZjOP1pHG2kIxpwyDENK8XTXZ9QC&access_type=offline&code_challenge=ZHNCIx6oCKxTn6lwiocCauzz-wUpTSl6dyzBE1dLyiI&code_challenge_method=S256


Credentials saved to file: [/Users/vincentjuge/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "single-azimuth-413609" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project own

In [5]:
!gcloud config set project {GCP_PROJECT_NAME}

Updated property [core/project].


In [6]:
!gcloud services enable aiplatform.googleapis.com

In [7]:
# Importing the library 
import google.auth

# Setting up credentials
credentials, _ = google.auth.default()
# authed_http = google.auth.transport.requests.AuthorizedSession(credentials)

In [8]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel

vertexai.init(project=GCP_PROJECT_NAME, location="us-central1")
parameters = {
    "candidate_count": 1,
    "max_output_tokens": 126,
    "temperature": 0,
    "top_p": 1
}
model = TextGenerationModel.from_pretrained("text-bison")


## Basic french address

In [9]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 1, rue de la Paix, 75008 Paris
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "1",
  "street_type": "rue",
  "street_name": "de la Paix",
  "zip_code": "75008",
  "city": "Paris",
  "confidence": 1
}


### Unusual French address

In [10]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 78 bis Cavee des Ecameaux 76500 Elbeuf
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "78",
  "street_multiplier": "bis",
  "street_type": "Cavee",
  "street_name": "des Ecameaux",
  "zip_code": "76500",
  "city": "Elbeuf",
  "confidence": 1
}


In [364]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 78 bis Cavee des Ecameaux 76500 Elbeuf
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "78",
  "street_multiplier": "bis",
  "street_type": "Cavee",
  "street_name": "des Ecameaux",
  "zip_code": "76500",
  "city": "Elbeuf",
  "confidence": 1
}


## Complex US address

In [365]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: C/o John Doe LLC, 111, 8th Ave Ste 1509, Oklahoma, 74136-1922, USA
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "111",
  "street_multiplier": "8th",
  "street_type": "Ave",
  "street_name": "Ste 1509",
  "zip_code": "74136-1922",
  "city": "Oklahoma",
  "country": "USA",
  "confidence": 1
}


## German address 

generator at: https://tarjeta-credito.net/cvv/address-gen-germany/

In [366]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: Haßfurter Str. 1, 91056 Erlangen, Germany.
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "1",
  "street_type": "Str.",
  "street_name": "Haßfurter",
  "zip_code": "91056",
  "city": "Erlangen",
  "country": "Germany",
  "confidence": 1
}


### german address with Country not mentionned

In [367]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city and country
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: Text: 37, rue du champ du pardon, 76000 rouen
JSON: {
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: {
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}

Text: Auschnippe 9, 37170 Uslar
JSON:
""",
    **parameters
)
print(f"Response from Model: {response.text}")

Response from Model:  {
  "street_number": "9",
  "street_name": "Auschnippe",
  "zip_code": "37170",
  "city": "Uslar",
  "country": "Germany",
  "confidence": 1
}


## Confidence level w/ False address

In [368]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: 
{
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: 
{
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"confidence\": 1
}

Text: ici c'est paris !, 75001 Paris
JSON:

""",
    **parameters
)
print(f"{response.text}")

 {
  "street_number": null,
  "street_type": null,
  "street_name": null,
  "zip_code": "75001",
  "city": "Paris",
  "confidence": 0.5
}


### Confidence level w/ False address and Country not mentioned

In [369]:
response = model.predict(
    """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input

Text: 37, rue du champ du pardon, 76000 rouen
JSON: 
{
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}

Text: 10-82 Cavée St Gervais, 76000 rouen
JSON: 
{
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}

Text: 452 quater grande rue general 85000 le palais vendee
JSON:

""",
    **parameters
)
print(f"{response.text}")


 {
  "street_number": "452",
  "street_multiplier": "quater",
  "street_type": "grande rue",
  "street_name": "general",
  "zip_code": "85000",
  "city": "le palais",
  "department": "vendee",
  "country": "France",
  "confidence": 1
}


# Programmatic

Using Langchain we can have a more programmatic way 

In [11]:
!pip install --upgrade --quiet  langchain-core langchain-google-vertexai

In [12]:
from langchain_google_vertexai import VertexAI

# candidate count parameter missing ?
model = VertexAI(model_name="text-bison", top_p=1, temperature=0, max_output_tokens=126)

In [13]:

from langchain_core.pydantic_v1 import BaseModel
from typing import Optional

from langchain_core.output_parsers import JsonOutputParser


class Address(BaseModel):
    street_number: Optional[str]
    street_multiplier: Optional[str]
    street_type: Optional[str]
    street_name: Optional[str]
    locality: Optional[str]  #lieu-dit
    zip_code: Optional[str]
    city: Optional[str]
    country: Optional[str]
    confidence: int


parser = JsonOutputParser(pydantic_object=Address)
# pprint(parser.get_format_instructions())


In [14]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts import PromptTemplate

examples = [
    {
        "address": "37, rue du champ du pardon, 76000 rouen",
        "answer": """{{
  \"street_number\": \"37\",
  \"street_type\": \"rue\",
  \"street_name\": \"du champ du pardon\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}}"""
    },
    {
        "address": "10-82 Cavée St Gervais, hameau de charette 76000 rouen",
        "answer": """{{
  \"street_number\": \"10-82\",
  \"street_type\": \"Cavée\",
  \"street_name\": \"St Gervais\",
  \"locality\": \"hameau de charette\",
  \"zip_code\": \"76000\",
  \"city\": \"rouen\",
  \"country\": \"France\",
  \"confidence\": 1
}}"""
    },
]

prefix = """Extract and split the address parameters from the text below in a JSON format.
be very precise, differentiate the street number, multiplier, street type, street name, zip code, city
provide a rating number to indicate your confidence level. 0 is the lowest confidence, 1 is the maximum.
provide only one response for one address input"""

example_prompt = PromptTemplate(
    template="""Text: {address}\nJSON: {answer}""",
    # template="""Answer the user query.\n{format_instructions}\nText: {address}\nJSON: {answer}\n""",
    input_variables=["address", "answer"],
    # partial_variables={"format_instructions": parser.get_format_instructions()}
)

# print(example_prompt.format(**examples[0]))

prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix="Text: {address}\nJSON:",
    input_variables=["address"],

)

# print(prompt.format(address="78 bis Cavee des Ecameaux 76500 Elbeuf"))


chain = prompt | model | parser

## Manual Testing

In [15]:

address = "1 bis rue de la Paix, 75008 Paris"
Address(**chain.invoke({"address": address}))


Address(street_number='1', street_multiplier='bis', street_type='rue', street_name='de la Paix', locality=None, zip_code='75008', city='Paris', country='France', confidence=1)

In [375]:

address = "Auschnippe 9, 37170 Uslar"
Address(**chain.invoke({"address": address}))


Address(street_number='9', street_multiplier=None, street_type=None, street_name='Auschnippe', locality=None, zip_code='37170', city='Uslar', country='Germany', confidence=1)

In [376]:

address = "C/o John Doe LLC, 111, 8th Ave Ste 1509, Oklahoma, 74136-1922, USA"
add = Address(**chain.invoke({"address": address}))
add


Address(street_number='111', street_multiplier='8th', street_type='Ave', street_name='Ste 1509', locality=None, zip_code='74136-1922', city='Oklahoma', country='USA', confidence=1)

### Serialize / Compress

In [377]:
import sys
from pprint import pprint
import zlib

# size of object in bytes
pprint(sys.getsizeof(add))
pprint(sys.getsizeof(add.json()))
pprint(sys.getsizeof(zlib.compress(add.json().encode())))

56
249
174


## Auto-Testing

pre-requisite: Install dependencies: pandas, pyarrow, etc...

Use of 'base nationale des Adresses' for French addresses (BAN)

#### Exploratory Data Analysis & Cleanup

In [None]:
import dask.dataframe
import pandas as pd
from pprint import pprint
import numpy as np

# df_addresses = pd.read_csv('./dataset/adresses-01.csv', delimiter=';', low_memory=False)
# df_addresses = pd.read_csv('./dataset/adresses-france.csv', delimiter=';', low_memory=False, index_col=0, usecols=['id', 'numero', 'rep', 'nom_voie', 'code_postal', 'nom_commune', 'alias', 'nom_ld', 'libelle_acheminement', 'nom_afnor'])

df = dask.dataframe.read_csv('./dataset/adresses-france.csv', blocksize="100MB", on_bad_lines='skip')
df.to_parquet('./dataset/adresses-france.csv'.replace(".csv", ".parquet"))




KeyboardInterrupt



In [11]:

df_addresses.shape

(265542, 9)

In [12]:
df_addresses.head()

Unnamed: 0_level_0,numero,rep,nom_voie,code_postal,nom_commune,alias,nom_ld,libelle_acheminement,nom_afnor
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01001_4b50r5_00630,630,,la Chèvre,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,LA CHEVRE
01001_g0ru02_00108,108,,Clemencia,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,CLEMENCIA
01001_ngzlqw_00009,9,,Imp des Epis,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMP DES EPIS
01001_ngzlqw_00023,23,,Imp des Epis,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMP DES EPIS
01001_ngzlqw_00026,26,,Imp des Epis,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMP DES EPIS


In [13]:
df_addresses.drop(
    columns=['id_fantoir', 'code_insee', 'code_insee_ancienne_commune', 'nom_ancienne_commune', 'x', 'y', 'lon', 'lat',
             'type_position', 'source_position', 'source_nom_voie', 'certification_commune', 'cad_parcelles'],
    inplace=True)

KeyError: "['id_fantoir', 'code_insee', 'code_insee_ancienne_commune', 'nom_ancienne_commune', 'x', 'y', 'lon', 'lat', 'type_position', 'source_position', 'source_nom_voie', 'certification_commune', 'cad_parcelles'] not found in axis"

In [14]:
df_addresses.replace(np.nan, '', regex=True, inplace=True)
df_addresses.replace('_1', '', regex=True, inplace=True)


In [15]:
df_addresses.drop_duplicates(['rep', 'nom_voie', 'nom_commune', 'nom_ld'], inplace=True)
df_addresses.shape

(36449, 9)

In [16]:
df_addresses.head(100)

Unnamed: 0_level_0,numero,rep,nom_voie,code_postal,nom_commune,alias,nom_ld,libelle_acheminement,nom_afnor
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01001_4b50r5_00630,630,,la Chèvre,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,LA CHEVRE
01001_g0ru02_00108,108,,Clemencia,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,CLEMENCIA
01001_ngzlqw_00009,9,,Imp des Epis,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMP DES EPIS
01001_0165_00019,19,,Route de la Fontaine,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE LA FONTAINE
01001_0165_00080_bis,80,bis,Route de la Fontaine,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE LA FONTAINE
...,...,...,...,...,...,...,...,...,...
01002_0140_00008,8,,Chemin du Lavoir,1640,L'Abergement-de-Varey,,,ABERGEMENT-DE-VAREY (L ),CHEMIN DU LAVOIR
01002_0095_00037,37,,Ruelle de la Flotiere,1640,L'Abergement-de-Varey,,,ABERGEMENT-DE-VAREY (L ),RUELLE DE LA FLOTIERE
01002_0295_00020,20,,Route des Vignes,1640,L'Abergement-de-Varey,,,ABERGEMENT-DE-VAREY (L ),ROUTE DES VIGNES
01002_0145_00002,2,,Chemin Louis Lumiere,1640,L'Abergement-de-Varey,,,ABERGEMENT-DE-VAREY (L ),CHEMIN LOUIS LUMIERE


In [408]:
def dataframe_row_to_pydantic_object(row: pd.Series) -> Address:
    """
    Converts a row from a Pandas DataFrame to a Pydantic object.
    Args:
        row (pd.Series): A single row from the DataFrame.
    Returns:
        SomeModel: A Pydantic object with the data from the row.
    """
    #missing street_type
    return Address(street_number=str(row['numero']), street_multiplier=row['rep'], street_name=row['nom_voie'],
                   zip_code=str(row['code_postal']), city=row['nom_commune'], confidence=1)


In [409]:
dataframe_row_to_pydantic_object(df_addresses.iloc[0])

Address(street_number='630', street_multiplier='', street_type=None, street_name='la Chèvre', locality=None, zip_code='1400', city="L'Abergement-Clémenciat", country=None, confidence=1)

In [410]:
def full_address_generator(row: pd.Series) -> str:
    """
    Generates a one line address based on all the available fields from BAN
    :param row: pd.Series of one address from BAN
    :return: 
    """
    if row['nom_ld'] != '':
        row['nom_ld'] = ', ' + row['nom_ld'] #todo here we are cheating a little bit, in order to differentiate the locality (lieu-dit) we insert a comma, actually otherwise even for a human it would be very difficult to differentiate ! 
    strings = [str(row['numero']), str(row['rep']), str(row['nom_voie']), str(row['nom_ld']), str(row['code_postal']),
               str(row['nom_commune'])]
    return ' '.join(filter(None, strings))
    # return f"{row['numero']}{('',' '+str(row['rep']))[row['rep'] == 'nan']}{' ' + row['nom_voie']}{('', ', '+str(row['nom_ld']))[row['nom_ld'] == 'nan']}{' '+str(row['code_postal'])}{' ' + row['nom_commune']}"

In [411]:

index = 4568
pprint(df_addresses.iloc[index])
full_address_generator(df_addresses.iloc[index])

id                      01407_0054_00001
numero                                 1
rep                                     
nom_voie                  Allee Serrulaz
code_postal                         1420
nom_commune                      Seyssel
alias                                   
nom_ld                                  
libelle_acheminement             SEYSSEL
nom_afnor                 ALLEE SERRULAZ
Name: 34853, dtype: object


'1 Allee Serrulaz 1420 Seyssel'

Add one line address to Dataframe

In [412]:
df_addresses['full_add'] = df_addresses.apply(lambda _row: full_address_generator(_row), axis=1)

In [413]:
df_addresses.head(50)

Unnamed: 0,id,numero,rep,nom_voie,code_postal,nom_commune,alias,nom_ld,libelle_acheminement,nom_afnor,full_add
0,01001_4b50r5_00630,630,,la Chèvre,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,LA CHEVRE,630 la Chèvre 1400 L'Abergement-Clémenciat
1,01001_g0ru02_00108,108,,Clemencia,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,CLEMENCIA,108 Clemencia 1400 L'Abergement-Clémenciat
2,01001_ngzlqw_00009,9,,Imp des Epis,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMP DES EPIS,9 Imp des Epis 1400 L'Abergement-Clémenciat
12,01001_0165_00019,19,,Route de la Fontaine,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE LA FONTAINE,19 Route de la Fontaine 1400 L'Abergement-Clém...
43,01001_0165_00080_bis,80,bis,Route de la Fontaine,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE LA FONTAINE,80 bis Route de la Fontaine 1400 L'Abergement-...
44,01001_0115_00095,95,,Route de Clémenciat,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE CLEMENCIAT,95 Route de Clémenciat 1400 L'Abergement-Cléme...
59,01001_0115_00218_c,218,c,Route de Clémenciat,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,ROUTE DE CLEMENCIAT,218 c Route de Clémenciat 1400 L'Abergement-Cl...
60,01001_0370_00036,36,,Impasse des Soyeux,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMPASSE DES SOYEUX,36 Impasse des Soyeux 1400 L'Abergement-Clémen...
66,01001_0370_00038_bis,38,bis,Impasse des Soyeux,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMPASSE DES SOYEUX,38 bis Impasse des Soyeux 1400 L'Abergement-Cl...
67,01001_0250_00024,24,,Impasse des Merles,1400,L'Abergement-Clémenciat,,,L'ABERGEMENT-CLEMENCIAT,IMPASSE DES MERLES,24 Impasse des Merles 1400 L'Abergement-Clémen...


### Scoring on large dataset

In [414]:
def score_ban_from_split(split: Address, ban: pd.Series, _print: bool = False) -> tuple[int, int]:
    """
    Defines a score to compare an Address object and a pd.Series of BAN
    :param split: Address object
    :param ban: pd.Series row of BAN Dataframe
    :param _print: print the input objects (debug purposes)
    :return: score from 0 to 6 and _score one hot encoded from 0 to 111111
    """
    if _print: 
        print(split)
        print(ban)
    _score = 0
    if split.street_number == str(ban['numero']): _score += 1
    if (not (bool(split.street_multiplier) | bool(ban['rep']))) | (
            split.street_multiplier == str(ban['rep'])): _score += 10
    if (split.street_type + ' ' + split.street_name) == str(ban['nom_voie']): _score += 100
    if (not (bool(split.locality) | bool(ban['nom_ld']))) | (split.locality == ban['nom_ld']): _score += 1000
    if split.zip_code == str(ban['code_postal']): _score += 10000
    if split.city == str(ban['nom_commune']): _score += 100000
    return str(_score).count('1'), _score


In [415]:
row = df_addresses.iloc[6892]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

street_number='132' street_multiplier=None street_type='Chemin' street_name='du Champ Fleuret' locality=None zip_code='1310' city='Saint-Martin-le-Châtel' country='France' confidence=1
id                                                       01375_0095_00132
numero                                                                132
rep                                                                      
nom_voie                                          Chemin du Champ Fleuret
code_postal                                                          1310
nom_commune                                        Saint-Martin-le-Châtel
alias                                                                    
nom_ld                                                                   
libelle_acheminement                               SAINT-MARTIN-LE-CHATEL
nom_afnor                                         CHEMIN DU CHAMP FLEURET
full_add                132 Chemin du Champ Fleuret 1310 Saint-Martin-...
N

(6, 111111)

In [416]:
row = df_addresses.iloc[4564]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

street_number='1' street_multiplier=None street_type='Montee' street_name='de Perouse' locality=None zip_code='1420' city='Seyssel' country='France' confidence=1
id                                      01407_0042_00001
numero                                                 1
rep                                                     
nom_voie                               Montee de Perouse
code_postal                                         1420
nom_commune                                      Seyssel
alias                                                   
nom_ld                                                  
libelle_acheminement                             SEYSSEL
nom_afnor                              MONTEE DE PEROUSE
full_add                1 Montee de Perouse 1420 Seyssel
Name: 34832, dtype: object


(6, 111111)

In [417]:
row = df_addresses.iloc[5521]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

street_number='96' street_multiplier=None street_type='Chemin' street_name='des Capettes' locality=None zip_code='1270' city='Salavre' country='France' confidence=1
id                                         01391_0041_00096
numero                                                   96
rep                                                        
nom_voie                                Chemin des Capettes
code_postal                                            1270
nom_commune                                         Salavre
alias                                                      
nom_ld                                                     
libelle_acheminement                                SALAVRE
nom_afnor                               CHEMIN DES CAPETTES
full_add                96 Chemin des Capettes 1270 Salavre
Name: 41617, dtype: object


(6, 111111)

In [418]:
row = df_addresses.iloc[11123]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

street_number='45' street_multiplier='bis' street_type='Route' street_name='de Ceyzériat' locality=None zip_code='1250' city='Revonnas' country='France' confidence=1
id                                         01321_0045_00045_bis
numero                                                       45
rep                                                         bis
nom_voie                                     Route de Ceyzériat
code_postal                                                1250
nom_commune                                            Revonnas
alias                                                          
nom_ld                                                         
libelle_acheminement                                   REVONNAS
nom_afnor                                    ROUTE DE CEYZERIAT
full_add                45 bis Route de Ceyzériat 1250 Revonnas
Name: 79434, dtype: object


(6, 111111)

In [419]:
row = df_addresses.iloc[5546]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, 1)
score

street_number='611' street_multiplier='bis' street_type='Chemin' street_name='de Cleyriat' locality=None zip_code='1270' city='Salavre' country='France' confidence=1
id                                         01391_0056_00611_bis
numero                                                      611
rep                                                         bis
nom_voie                                     Chemin de Cleyriat
code_postal                                                1270
nom_commune                                             Salavre
alias                                                          
nom_ld                                                         
libelle_acheminement                                    SALAVRE
nom_afnor                                    CHEMIN DE CLEYRIAT
full_add                611 bis Chemin de Cleyriat 1270 Salavre
Name: 41739, dtype: object


(6, 111111)

#### Score Compute

We need to take a portion of the data set in order to minimize the cost
we expect the 'score' column to contain the maximum possible returned e.g. (6, 111111)

In [420]:
df_addresses.shape

(36449, 11)

In [421]:
df_test = df_addresses.sample(100).copy()
df_test.shape

(100, 11)

In [422]:
def score_row(_row : pd.Series) -> tuple[int, int]:
    _split_adr = Address(**chain.invoke({"address": row['full_add']}))
    return score_ban_from_split(_split_adr, row)

In [423]:
df_test['score'] = df_test.apply(lambda _row: score_row(_row), axis=1)

In [424]:
df_test.head(100)

Unnamed: 0,id,numero,rep,nom_voie,code_postal,nom_commune,alias,nom_ld,libelle_acheminement,nom_afnor,full_add,score
31144,01419_0029_00003,3,,Impasse du Bout des Champs,1710,Thoiry,,,THOIRY,IMPASSE DU BOUT DES CHAMPS,3 Impasse du Bout des Champs 1710 Thoiry,"(6, 111111)"
72555,01342_0012_00011,11,,Impasse des Berges,1120,Sainte-Croix,,Sainte Croix,SAINTE-CROIX,IMPASSE DES BERGES,"11 Impasse des Berges , Sainte Croix 1120 Sain...","(6, 111111)"
206592,01093_0267_00032,32,,Impasse du Lac,1400,Châtillon-sur-Chalaronne,,,CHATILLON-SUR-CHALARONNE,IMPASSE DU LAC,32 Impasse du Lac 1400 Châtillon-sur-Chalaronne,"(6, 111111)"
77770,01328_0800_00052,52,,Impasse du Petit Chapuis,1400,Romans,,,ROMANS,IMPASSE DU PETIT CHAPUIS,52 Impasse du Petit Chapuis 1400 Romans,"(6, 111111)"
30264,01419_0060_00041_a,41,a,Rue de la collonges,1710,Thoiry,,,THOIRY,RUE DE LA COLLONGES,41 a Rue de la collonges 1710 Thoiry,"(6, 111111)"
...,...,...,...,...,...,...,...,...,...,...,...,...
36254,01403_0190_00005_a,5,a,Rue du Verger,1470,Serrières-de-Briord,,,SERRIERES-DE-BRIORD,RUE DU VERGER,5 a Rue du Verger 1470 Serrières-de-Briord,"(6, 111111)"
258605,01024_0010_00694_e,694,e,Route de Bourg,1340,Attignat,,,ATTIGNAT,ROUTE DE BOURG,694 e Route de Bourg 1340 Attignat,"(6, 111111)"
219639,01071_0190_00195_c,195,c,Route de Pitegny,1170,Cessy,,,CESSY,ROUTE DE PITEGNY,195 c Route de Pitegny 1170 Cessy,"(6, 111111)"
218910,01069_0125_00286,286,,Chemin des Peupliers,1240,Certines,,,CERTINES,CHEMIN DES PEUPLIERS,286 Chemin des Peupliers 1240 Certines,"(6, 111111)"
