# Address Parser

Is able to recognize named entities and format outputs based on a defined schema

In [1]:
from dotenv import load_dotenv

load_dotenv()

from langchain_mistralai import ChatMistralAI
model = ChatMistralAI(model="mistral-large-latest")



In [2]:
from langchain_core.messages import HumanMessage
message = [HumanMessage("hello")]
model.invoke(message)

AIMessage(content="Hello! How can I assist you today? If you have any questions or need help with a specific topic, feel free to ask. I'm here to provide information and support.", response_metadata={'token_usage': {'prompt_tokens': 5, 'total_tokens': 42, 'completion_tokens': 37}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'})

In [3]:
import os 

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "FICABA_MISTRAL"

In [7]:

from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Optional


class Address(BaseModel):
    street_number: Optional[str] = Field(description="street number")
    street_number_multiplier: Optional[str] = Field(description="street number multiplier suffix")
    street_type: Optional[str]
    street_name: Optional[str]
    locality: Optional[str]  #lieu-dit
    zip_code: Optional[str]
    city: Optional[str]
    country: Optional[str]
    confidence: int = Field(description="confidence level from 0 to 5 to have correctly extracted the field.")
model_with_structure = model.with_structured_output(Address)


In [32]:
from langchain import callbacks
from langsmith import Client

def filter_inputs(inputs: dict):
    # You can define custom filtering here
    return {}


def filter_outputs(outputs: dict):
    # You can define custom filtering here
    return {}

client = Client(
    api_url=os.getenv("LANGCHAIN_ENDPOINT"),
    api_key=os.getenv("LANGCHAIN_API_KEY"),
    hide_inputs=filter_inputs,
    hide_outputs=filter_outputs
)
    
def split_address(_address:str) -> Address :   
    with callbacks.collect_runs() as cb:
        ret :Address = model_with_structure.invoke(_address)
        run_id = cb.traced_runs[0].id

        client.create_feedback(
            run_id,
            key="confidence_score",  # Updated feedback type
            score=ret.confidence,
            comment=_address,
        )
    return ret


In [30]:
split_address("1, rue de la Paix, 75008 Paris")

Address(street_number='1', street_number_multiplier='', street_type='rue', street_name='de la Paix', locality='', zip_code='75008', city='Paris', country='France', confidence=5)

In [27]:
split_address("78 bis Cavee des Ecameaux 76500 Elbeuf")

Address(street_number='78', street_number_multiplier='bis', street_type=None, street_name='Cavee des Ecameaux', locality=None, zip_code='76500', city='Elbeuf', country=None, confidence=5)

In [16]:
split_address("C/o John Doe LLC, 111, 8th Ave Ste 1509, Oklahoma, 74136-1922, USA")

Address(street_number='111', street_number_multiplier='8th', street_type='Ave', street_name='Ste 1509', locality='Oklahoma', zip_code='74136-1922', city='Oklahoma', country='USA', confidence=5)

In [76]:
split_address("Haßfurter Str. 1, 91056 Erlangen, Germany")


Address(street_number='1', street_number_multiplier=None, street_type=None, street_name='Haßfurter Str.', locality=None, zip_code='91056', city='Erlangen', country='Germany', confidence=5)

In [77]:
split_address("Auschnippe 9, 37170 Uslar")


Address(street_number='9', street_number_multiplier='', street_type='', street_name='Auschnippe', locality='', zip_code='37170', city='Uslar', country='', confidence=5)

In [78]:
split_address("ici c'est paris !, 75001 Paris")


Address(street_number='', street_number_multiplier='', street_type='', street_name='', locality='Paris', zip_code='75001', city='Paris', country='France', confidence=5)

In [14]:
split_address("452 quater grande rue general 85000 le palais vendee")

Address(street_number='452', street_number_multiplier='quater', street_type=None, street_name='grande rue general', locality='le palais', zip_code='85000', city='vendee', country=None, confidence=5)

## Auto-Testing

pre-requisite: Install dependencies: pandas, pyarrow, etc...

Use of 'base nationale des Adresses' for French addresses (BAN)

#### Exploratory Data Analysis & Cleanup

In [35]:
import pandas as pd
from pprint import pprint
import numpy as np

df_addresses = pd.read_parquet('./dataset/adresses-france-cleaned.parquet')


In [36]:

df_addresses.shape

(3393705, 7)

In [37]:
df_addresses.head(100)

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,01001_4b50r5_00630,630,,la Chèvre,01400,L'Abergement-Clémenciat,
1,01001_g0ru02_00108,108,,Clemencia,01400,L'Abergement-Clémenciat,
2,01001_ngzlqw_00009,9,,Imp des Epis,01400,L'Abergement-Clémenciat,
12,01001_0165_00019,19,,Route de la Fontaine,01400,L'Abergement-Clémenciat,
43,01001_0165_00080_bis,80,bis,Route de la Fontaine,01400,L'Abergement-Clémenciat,
...,...,...,...,...,...,...,...
576,01002_0140_00008,8,,Chemin du Lavoir,01640,L'Abergement-de-Varey,
586,01002_0095_00037,37,,Ruelle de la Flotiere,01640,L'Abergement-de-Varey,
591,01002_0295_00020,20,,Route des Vignes,01640,L'Abergement-de-Varey,
598,01002_0145_00002,2,,Chemin Louis Lumiere,01640,L'Abergement-de-Varey,


In [38]:
def dataframe_row_to_pydantic_object(row: pd.Series) -> Address:
    """
    Converts a row from a Pandas DataFrame to a Pydantic object.
    Args:
        row (pd.Series): A single row from the DataFrame.
    Returns:
        SomeModel: A Pydantic object with the data from the row.
    """
    #missing street_type
    return Address(street_number=str(row['numero']), street_multiplier=row['rep'], street_name=row['nom_voie'],
                   zip_code=str(row['code_postal']), city=row['nom_commune'], confidence=1)


In [39]:
dataframe_row_to_pydantic_object(df_addresses.iloc[0])

Address(street_number='630', street_multiplier='', street_type=None, street_name='la Chèvre', locality=None, zip_code='01400', city="L'Abergement-Clémenciat", country=None, confidence=1)

In [40]:
def full_address_generator(row: pd.Series) -> str:
    """
    Generates a one line address based on all the available fields from BAN
    :param row: pd.Series of one address from BAN
    :return: 
    """
    if row['nom_ld'] != '':
        row['nom_ld'] = ', ' + row['nom_ld'] #todo here we are cheating a little bit, in order to differentiate the locality (lieu-dit) we insert a comma, actually otherwise even for a human it would be very difficult to differentiate ! 
    strings = [str(row['numero']), str(row['rep']), str(row['nom_voie']), str(row['nom_ld']), str(row['code_postal']),
               str(row['nom_commune'])]
    return ' '.join(filter(None, strings))
    # return f"{row['numero']}{('',' '+str(row['rep']))[row['rep'] == 'nan']}{' ' + row['nom_voie']}{('', ', '+str(row['nom_ld']))[row['nom_ld'] == 'nan']}{' '+str(row['code_postal'])}{' ' + row['nom_commune']}"

In [41]:

index = 4568
pprint(df_addresses.iloc[index])
full_address_generator(df_addresses.iloc[index])

id               01408_0146_00091
numero                         91
rep                              
nom_voie         Chemin sur Roche
code_postal                 01250
nom_commune    Simandre-sur-Suran
nom_ld                           
Name: 34823, dtype: object


'91 Chemin sur Roche 01250 Simandre-sur-Suran'

Add one line address to Dataframe

In [42]:
df_addresses['full_add'] = df_addresses.apply(lambda _row: full_address_generator(_row), axis=1)

In [45]:
df_addresses.head()

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld,full_add
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,01001_4b50r5_00630,630,,la Chèvre,1400,L'Abergement-Clémenciat,,630 la Chèvre 01400 L'Abergement-Clémenciat
1,01001_g0ru02_00108,108,,Clemencia,1400,L'Abergement-Clémenciat,,108 Clemencia 01400 L'Abergement-Clémenciat
2,01001_ngzlqw_00009,9,,Imp des Epis,1400,L'Abergement-Clémenciat,,9 Imp des Epis 01400 L'Abergement-Clémenciat
12,01001_0165_00019,19,,Route de la Fontaine,1400,L'Abergement-Clémenciat,,19 Route de la Fontaine 01400 L'Abergement-Clé...
43,01001_0165_00080_bis,80,bis,Route de la Fontaine,1400,L'Abergement-Clémenciat,,80 bis Route de la Fontaine 01400 L'Abergement...


### Scoring on large dataset

In [46]:
def score_ban_from_split(split: Address, ban: pd.Series, _print: bool = False) -> tuple[int, int]:
    """
    Defines a score to compare an Address object and a pd.Series of BAN
    :param split: Address object
    :param ban: pd.Series row of BAN Dataframe
    :param _print: print the input objects (debug purposes)
    :return: score from 0 to 6 and _score one hot encoded from 0 to 111111
    """
    if _print: 
        print(split)
        print(ban)
    _score = 0
    if split.street_number == str(ban['numero']): _score += 1
    if (not (bool(split.street_multiplier) | bool(ban['rep']))) | (
            split.street_multiplier == str(ban['rep'])): _score += 10
    if (split.street_type + ' ' + split.street_name) == str(ban['nom_voie']): _score += 100
    if (not (bool(split.locality) | bool(ban['nom_ld']))) | (split.locality == ban['nom_ld']): _score += 1000
    if split.zip_code == str(ban['code_postal']): _score += 10000
    if split.city == str(ban['nom_commune']): _score += 100000
    return str(_score).count('1'), _score


In [47]:
row = df_addresses.iloc[6892]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

NameError: name 'chain' is not defined

In [None]:
row = df_addresses.iloc[4564]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
row = df_addresses.iloc[5521]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
row = df_addresses.iloc[11123]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, True)
score

In [None]:
row = df_addresses.iloc[5546]
split_adr = Address(**chain.invoke({"address": row['full_add']}))
score = score_ban_from_split(split_adr, row, 1)
score

#### Score Compute

We need to take a portion of the data set in order to minimize the cost
we expect the 'score' column to contain the maximum possible returned e.g. (6, 111111)

In [32]:
df_addresses.shape

(3393705, 11)

In [48]:
df_test = df_addresses.sample(200).copy()
df_test.shape

(200, 8)

In [49]:
df_test.head(200)

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld,full_add
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
54684,84007_6630_00011_ter,11,ter,Pl des Trois-Pilats,84000,Avignon,,11 ter Pl des Trois-Pilats 84000 Avignon
368587,33473_58bn1m_00001_bis,1,bis,Drouin,33920,Saint-Savin,,1 bis Drouin 33920 Saint-Savin
196009,27008_0050_00001,1,,Rue des Sablons,27460,Alizay,,1 Rue des Sablons 27460 Alizay
245047,36145_3oge9x_00001,1,,La Blanchetterie,36290,Obterre,,1 La Blanchetterie 36290 Obterre
244272,77201_0060_00014_bis,14,bis,Place de l’Eglise,77370,Gastins,,14 bis Place de l’Eglise 77370 Gastins
...,...,...,...,...,...,...,...,...
424360,59523_nj44rs_00001,1,,Residence des 4 Cantons,59262,Sainghin-en-Mélantois,,1 Residence des 4 Cantons 59262 Sainghin-en-Mé...
454815,61460_0033_00001,1,,Rue Jean Moulin,61470,Sap-en-Auge,le Sap,"1 Rue Jean Moulin , le Sap 61470 Sap-en-Auge"
359666,06152_0295_00020_a,20,a,Avenue Saint Roch,06560,Valbonne,,20 a Avenue Saint Roch 06560 Valbonne
276719,31557742_00004_a,4,a,Allée des Frères Higouneng,31170,Tournefeuille,,4 a Allée des Frères Higouneng 31170 Tournefeu...


In [None]:
def score_row(_row : pd.Series) -> tuple[int, int]:
    _split_adr = Address(**chain.invoke({"address": row['full_add']}))
    return score_ban_from_split(_split_adr, row)

In [None]:
df_test['score'] = df_test.apply(lambda _row: score_row(_row), axis=1)

In [51]:
df_test.head(100)

Unnamed: 0_level_0,id,numero,rep,nom_voie,code_postal,nom_commune,nom_ld,full_add
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
54684,84007_6630_00011_ter,11,ter,Pl des Trois-Pilats,84000,Avignon,,11 ter Pl des Trois-Pilats 84000 Avignon
368587,33473_58bn1m_00001_bis,1,bis,Drouin,33920,Saint-Savin,,1 bis Drouin 33920 Saint-Savin
196009,27008_0050_00001,1,,Rue des Sablons,27460,Alizay,,1 Rue des Sablons 27460 Alizay
245047,36145_3oge9x_00001,1,,La Blanchetterie,36290,Obterre,,1 La Blanchetterie 36290 Obterre
244272,77201_0060_00014_bis,14,bis,Place de l’Eglise,77370,Gastins,,14 bis Place de l’Eglise 77370 Gastins
...,...,...,...,...,...,...,...,...
11926,66017_2400_00017_bis,17,bis,Allee de l’Etang,66420,Le Barcarès,,17 bis Allee de l’Etang 66420 Le Barcarès
538230,49007_2790_00002,2,,Rue Emile Hatais,49100,Angers,,2 Rue Emile Hatais 49100 Angers
23068,56155_0019_00002,2,,Allée de Bellevue,56760,Pénestin,,2 Allée de Bellevue 56760 Pénestin
116246,97401_achsci_00001,1,,Rue des Troenes,97425,Les Avirons,,1 Rue des Troenes 97425 Les Avirons


In [55]:
def scramble_address(_address: str)-> str :
    action = np.random.random_integers(10)
    match action:
        case 1: return _address.lower().replace('rue', 'r')
        case 2: return _address.lower().replace('chemin', 'ch')
        case 3: return _address.lower().replace('impasse', 'imp')
        case 4: return _address.lower().replace('avenue', 'av')
        case 5: return _address.lower().replace('cavée', 'cav')
        case 6: return _address.lower().replace('du', '').replace('des', '').replace('d\'', '')
        case 7: return _address.lower().replace('-', ' ')
        case _: return _address
    

In [56]:
scrambled_df = df_test.head(200).apply(lambda _row: scramble_address(_row['full_add']), axis=1)

  action = np.random.random_integers(10)


In [57]:
scrambled_df

__null_dask_index__
54684              11 ter pl des trois-pilats 84000 avignon
368587                       1 bis drouin 33920 saint-savin
196009                       1 Rue des Sablons 27460 Alizay
245047                     1 la blanchetterie 36290 obterre
244272               14 bis place de l’eglise 77370 gastins
                                ...                        
424360    1 residence des 4 cantons 59262 sainghin-en-mé...
454815         1 rue jean moulin , le sap 61470 sap-en-auge
359666                    20 a av saint roch 06560 valbonne
276719    4 a allée des frères higouneng 31170 tournefeu...
379781          52 bis av du maréchal joffre 47200 marmande
Length: 200, dtype: object

In [2]:
import os
from pprint import pprint
from dotenv import load_dotenv

import googlemaps

load_dotenv()
gmaps = googlemaps.Client(key=os.getenv("GOOGLE_API_KEY"))

addressvalidation_result =  gmaps.addressvalidation(['7 b chemin de la nicerie 16100 châteaubernard'],
                                                    regionCode='FR',
                                                    # locality='Mountain View',
                                                    # enableUspsCass=True
                                                    )

pprint(addressvalidation_result)

{'responseId': 'e2fb2631-9418-49a2-9af6-ba369f67129a',
 'result': {'address': {'addressComponents': [{'componentName': {'text': '7 b'},
                                               'componentType': 'street_number',
                                               'confirmationLevel': 'UNCONFIRMED_BUT_PLAUSIBLE'},
                                              {'componentName': {'languageCode': 'fr',
                                                                 'text': 'chemin '
                                                                         'de '
                                                                         'la '
                                                                         'nicerie'},
                                               'componentType': 'route',
                                               'confirmationLevel': 'UNCONFIRMED_BUT_PLAUSIBLE'},
                                              {'componentName': {'text': '16100'},
                  