In [1]:
# CSV extract to load; it is joined with `data_path` when the file is read.
filename = "StockEtablissement_utf8_extrait.csv"

In [2]:
import os
# The data directory is the kernel's current working directory; echo it for the reader.
data_path = os.getcwd()
print(f"data_path={data_path!r}")

data_path='/app/notebooks/rapprochements/SIREN'


In [3]:
import pandas as pd

# Lift pandas' display limits so every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
import os
# Django blocks synchronous (e.g. ORM) calls made while an event loop is running, which
# is the case inside a Jupyter kernel; this flag disables that check for the session.
os.environ.update({"DJANGO_ALLOW_ASYNC_UNSAFE": "true"})

In [44]:
# Read the CSV extract. The street number, street type and postal code columns are forced
# to the pandas "string" dtype (presumably so values such as postal codes are not parsed
# as numbers -- TODO confirm).
df = pd.read_csv(
    os.path.join(data_path, filename),
    sep=",",
    dtype={
        "numeroVoieEtablissement": "string",
        "typeVoieEtablissement": "string",
        "codePostalEtablissement": "string",
    },
)

# Normalise both Lambert coordinate columns to floats. Missing values, empty strings and
# the '[ND]' placeholder all become 0, which row_to_input treats as "no coordinates".
for lambert_col in (
    "coordonneeLambertAbscisseEtablissement",
    "coordonneeLambertOrdonneeEtablissement",
):
    df[lambert_col] = (
        df[lambert_col]
        .fillna(0)
        .replace('', '0')
        .replace('[ND]', '0')
        .astype(float)
    )

# Report the row count, then show the column dtypes as the cell output.
print('The length of the dataframe is: ', len(df))
df.dtypes

The length of the dataframe is:  1000


siren                                                      int64
nic                                                        int64
siret                                                      int64
statutDiffusionEtablissement                              object
dateCreationEtablissement                                 object
trancheEffectifsEtablissement                             object
anneeEffectifsEtablissement                              float64
activitePrincipaleRegistreMetiersEtablissement            object
dateDernierTraitementEtablissement                        object
etablissementSiege                                          bool
nombrePeriodesEtablissement                                int64
complementAdresseEtablissement                            object
numeroVoieEtablissement                           string[python]
indiceRepetitionEtablissement                             object
dernierNumeroVoieEtablissement                            object
indiceRepetitionDernierNu

In [35]:
# Print the column index of the loaded frame to locate the address and coordinate fields.
print(df.columns)

Index(['siren', 'nic', 'siret', 'statutDiffusionEtablissement',
       'dateCreationEtablissement', 'trancheEffectifsEtablissement',
       'anneeEffectifsEtablissement',
       'activitePrincipaleRegistreMetiersEtablissement',
       'dateDernierTraitementEtablissement', 'etablissementSiege',
       'nombrePeriodesEtablissement', 'complementAdresseEtablissement',
       'numeroVoieEtablissement', 'indiceRepetitionEtablissement',
       'dernierNumeroVoieEtablissement',
       'indiceRepetitionDernierNumeroVoieEtablissement',
       'typeVoieEtablissement', 'libelleVoieEtablissement',
       'codePostalEtablissement', 'libelleCommuneEtablissement',
       'libelleCommuneEtrangerEtablissement',
       'distributionSpecialeEtablissement', 'codeCommuneEtablissement',
       'codeCedexEtablissement', 'libelleCedexEtablissement',
       'codePaysEtrangerEtablissement', 'libellePaysEtrangerEtablissement',
       'identifiantAdresseEtablissement',
       'coordonneeLambertAbscisseEtablissemen

# Map rows to guesser inputs

In [45]:
def should_include_row(df_row_raw):
    """Row predicate applied to every row of the frame.

    Currently a placeholder filter: it accepts every row, whatever
    ``df_row_raw`` contains.
    """
    return True

# Evaluate the predicate once per row, then keep only the rows it accepted.
keep_row = df.apply(should_include_row, axis=1)
filtered_df = df[keep_row]

In [46]:
# Number of rows that survived the filter.
print(f"len(filtered_df)={len(filtered_df)!r}")

len(filtered_df)=1000


In [64]:
from pyproj import Transformer

# Coordinate transformer from the source CRS (Lambert, EPSG:2154) to the target CRS
# (WGS84, EPSG:4326). always_xy=True makes transform() take (x, y) and return (lng, lat).
transformer = Transformer.from_crs(
    crs_from="EPSG:2154",
    crs_to="EPSG:4326",
    always_xy=True,
)

def row_to_input(df_row_raw):
    """Convert one establishment row into an input record for the guesser.

    Args:
        df_row_raw: Mapping-like row (e.g. the pandas Series passed by
            ``DataFrame.apply(..., axis=1)``) holding the ``siret``, the two
            Lambert coordinate columns and the five address component columns.

    Returns:
        dict with the keys ``ext_id`` (the row's ``siret``), ``address`` (a
        single-line address, or None when any component is missing or is the
        ``[ND]`` placeholder), ``lat`` and ``lng`` (None when the row has no
        usable Lambert coordinates).
    """
    df_row = dict(df_row_raw)
    x_lambert = df_row["coordonneeLambertAbscisseEtablissement"]
    y_lambert = df_row["coordonneeLambertOrdonneeEtablissement"]

    # 0 is the sentinel for "no coordinate". NaN is rejected as well: it compares
    # unequal to 0 and would otherwise be projected to a meaningless point.
    lng, lat = None, None
    has_point = (
        not pd.isna(x_lambert)
        and not pd.isna(y_lambert)
        and x_lambert != 0
        and y_lambert != 0
    )
    if has_point:
        lng, lat = transformer.transform(float(x_lambert), float(y_lambert))

    address_parts = [
        df_row[column]
        for column in (
            "numeroVoieEtablissement",
            "typeVoieEtablissement",
            "libelleVoieEtablissement",
            "codePostalEtablissement",
            "libelleCommuneEtablissement",
        )
    ]
    # Every component is required. Checking them up front avoids the TypeError that
    # "str + float" raises when an object-dtype column holds a float NaN, and keeps
    # the "[ND]" placeholder out of the address string.
    # pd.isna() must be tested first: `pd.NA == "[ND]"` is NA, which has no truth value.
    if any(pd.isna(part) or part == "[ND]" for part in address_parts):
        address = None
    else:
        number, way_type, way_name, postcode, city = address_parts
        address = f"{number} {way_type} {way_name}, {postcode} {city}"

    return {
        "ext_id": df_row["siret"],
        "address": address,
        "lat": lat,
        "lng": lng,
    }


# One input record (dict) per retained row, collected into a plain list.
inputs = [record for record in filtered_df.apply(row_to_input, axis=1)]

In [68]:
import random
# Draw a reproducible random sample of the inputs. The size is capped at len(inputs)
# because random.sample raises ValueError when asked for more items than are available.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42

random.seed(RANDOM_SEED)
sample_inputs = random.sample(inputs, min(SAMPLE_SIZE, len(inputs)))

# Show only a preview: displaying the full sample floods the notebook output.
sample_inputs[:10]

[{'ext_id': 92266293700015, 'address': None, 'lat': None, 'lng': None},
 {'ext_id': 93299058300019,
  'address': '24 RUE MARCEL BOURDARIAS, 94140 ALFORTVILLE',
  'lat': 48.809013003430074,
  'lng': 2.421210039924512},
 {'ext_id': 90267098300029, 'address': None, 'lat': None, 'lng': None},
 {'ext_id': 35070653700015, 'address': None, 'lat': None, 'lng': None},
 {'ext_id': 32908710000015,
  'address': '32 RUE WASHINGTON, 75008 PARIS',
  'lat': 48.87356899999998,
  'lng': 2.303777},
 {'ext_id': 32705827700036,
  'address': "13 CHEMIN DES PRES SECS, 69380 CIVRIEUX-D'AZERGUES",
  'lat': 45.854496999999995,
  'lng': 4.700476999999999},
 {'ext_id': 80297014500032,
  'address': '14 RUE COPERNIC, 75016 PARIS',
  'lat': 48.86913399999999,
  'lng': 2.290244},
 {'ext_id': 44940258500029,
  'address': '14 RUE FERMAT, 75014 PARIS 14',
  'lat': None,
  'lng': None},
 {'ext_id': 41067543300016,
  'address': '102 AVENUE DES CHAMPS ELYSEES, 75008 PARIS',
  'lat': 48.871833999999986,
  'lng': 2.302297000

In [69]:
from batid.services.guess_bdg_new import Guesser, GeocodeNameHandler, GeocodeAddressHandler, ClosestFromPointHandler
# Work file shared by this cell and the reporting cell (relative to the working directory).
guesses_path = "./guesses.json"

# Configure the guesser with the address-geocoding and closest-from-point handlers
# (presumably applied in list order -- TODO confirm against Guesser).
guesser = Guesser(batch_size=90)
guesser.handlers = [GeocodeAddressHandler(), ClosestFromPointHandler()]

# Write the sampled inputs to the work file, then run the guesses on that file.
guesser.create_work_file(list(sample_inputs), guesses_path)
guesser.guess_work_file(guesses_path)

  0%|          | 0/12 [00:00<?, ?it/s]

In [70]:
# Build a fresh Guesser, load the work file written at `guesses_path`, and print its
# report (presumably so the figures reflect what was saved to disk -- TODO confirm).
guesser = Guesser()
guesser.load_work_file(guesses_path)
guesser.report()

-- Report --
Number of rows: 1000
Number of match: 571 (57.10%)

-- finished_steps --
Rows with finished_steps closest_from_point: 1000 (100.00%)
Rows with finished_steps geocode_address: 1000 (100.00%)
Rows with finished_steps geocode_name: 0 (0.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
precise_address_match    535
point_on_bdg              18
isolated_closest_bdg      18
Name: count, dtype: int64

-- match_reasons : % --
match_reason
precise_address_match    53.5
point_on_bdg              1.8
isolated_closest_bdg      1.8
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 584 (58.40%)


In [13]:
# Export the guesses to results.csv inside the data directory; the external id column
# is named "ext_id", matching the key produced by row_to_input.
results_path = os.path.join(data_path, "results.csv")
guesser.to_csv(results_path, ext_id_col_name="ext_id")