In [1]:
import os
import csv
import pandas as pd
import concurrent.futures

from batid.services.guess_bdg_new import Guesser, GeocodeNameHandler, GeocodeAddressHandler, AbstractHandler, Guess
from IPython.display import display, HTML

from django.contrib.gis.geos import Point
from django.contrib.gis.db.models.functions import Distance

efa_iub_path = "EFA_IUB.csv"
efa_surface_path = "EFA_SURFACE.csv"
efa_path = "EFA.csv"

geocoded_addresses_path = "geocoded_addresses.csv"

guess_path = "guesses.json"

os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

In [14]:
# EFA IUB

df = pd.read_csv(efa_iub_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

print("Dénomination bâtiment")
print(df["Dénomination bâtiment"].value_counts())

Index(['ID EFA', 'Péfixe', 'Section', 'Dénomination bâtiment', 'Parcelle',
       'Lot'],
      dtype='object')
   ID EFA Péfixe Section Dénomination bâtiment Parcelle       Lot
0    1812    000      BY            ARGENTEUIL     0589      0000
1    1812    000      BY            ARGENTEUIL     0628      0000
2    1812    000      BY            ARGENTEUIL     0630      0000
3    1813    115      BS           HIGH SQUARE     0168      0000
4    1813    115      BS           HIGH SQUARE     0121      0000
5    1535    107      AD                230STG     0010  STEINWAY
6    1536    107      AD                230STG     0010     AARPI
7    1803    108      AQ                   9RP     0077        ID
8    1807    108      AQ                   9RP     0077      GIMD
9    1806    108      AQ                   7RP     0077       7RP
413482 lignes
Dénomination bâtiment
Dénomination bâtiment
1                             2204
CENTRE COMMERCIAL             1789
MAIRIE                        1476

In [15]:
# EFA Surface

df = pd.read_csv(efa_surface_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

Index(['ID Déclaration consommation', 'Année consommation', 'ID EFA',
       'Identifiant occupant', 'Nom occupant EFA',
       'Surface moyenne annuelle EFA (m²)'],
      dtype='object')
   ID Déclaration consommation Année consommation  ID EFA  \
0                        32622               2020   44726   
1                       256709               2020  148143   
2                       130290               2020  153793   
3                      1116939               2022  435727   
4                      1144326               2023   88752   
5                       872624               2022  130758   
6                      1276576               2023  228156   
7                       957092               2022  225056   
8                       795358               2022  225921   
9                       323652               2021  243745   

  Identifiant occupant                Nom occupant EFA  \
0       34181606401147                     CHAUSS'EXPO   
1       67850126300175  

In [25]:
# EFA

df = pd.read_csv(efa_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

# A propos de cas_assujetissement
print("Cas assujettissement")
print(df["Cas assujettissement"].value_counts())
print("Complément adresse OPR")
print(df["Complément adresse OPR"].value_counts())



  df = pd.read_csv(efa_path, sep=";")


Index(['ID EFA', 'Cas assujettissement', 'Identifiant occupant',
       'Nom occupant EFA', 'Complément nom EFA', 'Dénomination EFA',
       'Adresse (Operat)', 'Complément adresse OPR', 'Commune OPR',
       'Code commune OPR', 'Code postal OPR', 'Siège SIRN',
       'Nom établissement SIRN', 'Enseigne établissement SIRN', 'Adresse SIRN',
       'Complt adresse SIRN', 'Code postal SIRN', 'Code commune SIRN',
       'Commune maj SIRN', 'Commune SIRN'],
      dtype='object')
   ID EFA Cas assujettissement        Identifiant occupant  \
0       5                    3              48458811600018   
1       6                   1A              77568873203099   
2       8                    2              80852873100025   
3       9                    2              80852873100025   
4      11                   1A              21200004600012   
5      28                   1A              83271794600012   
6      33                   1A              86680121000026   
7      34                

Il est probable qu'on utilise uniquement le fichier EFA.csv

Dedans, on a à dispo : 

- Nom occupant EFA
- Complément nom EFA

On pourra peut être les utiliser avec un Photon

- Les champs Adresse Operat à géocoder

Pour une recherche par adresse. 
On va devoir géocoder les adresses pour obtenir un lat/lng à utiliser avec Photon

In [61]:
df = pd.read_csv(efa_path, sep=";")
df = df[df["ID EFA"] == 2916]

print(df)


  df = pd.read_csv(efa_path, sep=";")


      ID EFA Cas assujettissement Identifiant occupant  \
1439    2916                    3       19660437500010   

             Nom occupant EFA  \
1439  UNIVERSITE DE PERPIGNAN   

                                   Complément nom EFA Dénomination EFA  \
1439  IMMEUBLE DELACROIX ANCIENNE ECOLE MADAME ROLAND              NaN   

      Adresse (Operat) Complément adresse OPR Commune OPR Code commune OPR  \
1439  52 AV PAUL ALDUY                    NaN   PERPIGNAN            66136   

     Code postal OPR  Siège SIRN Nom établissement SIRN  \
1439           66100         1.0                    NaN   

     Enseigne établissement SIRN          Adresse SIRN Complt adresse SIRN  \
1439                         NaN  52 AVENUE PAUL ALDUY                 NaN   

      Code postal SIRN Code commune SIRN Commune maj SIRN Commune SIRN  
1439           66100.0             66136        PERPIGNAN    Perpignan  


# Géocoder les adresses

In [26]:
# On va tenter d'utiliser le géocodeur Photon mais on n'a pas de latitude/longitude.
# On va aller chercher lat/lng en géocodant les adresses fournies





df["full_address"] = df["Adresse (Operat)"].astype(str) + " " + df["Code postal OPR"].astype(str) + " " + df["Commune OPR"].astype(str)

new_df = df[['ID EFA', 'full_address' ]]
new_df.to_csv(geocoded_addresses_path)




In [29]:
# Split in chunks

chunks_size = 50_000
chunks = [new_df[i:i+chunks_size] for i in range(0, len(new_df), chunks_size)]

# Example: save each chunk into a separate CSV
for idx, chunk in enumerate(chunks):
    chunk.to_csv(f'output_chunk_{idx}.csv', index=False)

In [30]:
from batid.services.geocoders import BanBatchGeocoder
import glob
from io import StringIO

geocoder = BanBatchGeocoder()

files = sorted(glob.glob('output_chunk_*.csv'))
for file in files:
    print(file)
    response = geocoder.geocode_file(file, columns=["full_address"])
    response_csv = StringIO(response.text)
    df_geocoded = pd.read_csv(response_csv, sep=',')
    df_geocoded.to_csv(file, index=False)


    
        




output_chunk_0.csv
output_chunk_1.csv
output_chunk_2.csv
output_chunk_3.csv
output_chunk_4.csv
output_chunk_5.csv
output_chunk_6.csv


In [31]:
# merge all file into one
files = sorted(glob.glob('output_chunk_*.csv'))

df_list = [pd.read_csv(file) for file in files]

merged_df = pd.concat(df_list, ignore_index=True)

# Save into a new CSV
merged_df.to_csv(geocoded_addresses_path, index=False)

In [32]:
# remove the chunks
import os

for file in sorted(glob.glob('output_chunk_*.csv')):
    os.remove(file)

# Attacher les adresses géocodées aux données EFA

In [36]:
# On merge

efa_df = pd.read_csv(efa_path, sep=";")
address_df = pd.read_csv(geocoded_addresses_path)

merged_df = pd.merge(efa_df, address_df, on="ID EFA", how="inner")

  efa_df = pd.read_csv(efa_path, sep=";")


   ID EFA Cas assujettissement Identifiant occupant          Nom occupant EFA  \
0       5                    3       48458811600018       ROUSSELOT ANGOULEME   
1       6                   1A       77568873203099       APF FRANCE HANDICAP   
2       8                    2       80852873100025  GROUPE ROCHER OPERATIONS   
3       9                    2       80852873100025  GROUPE ROCHER OPERATIONS   
4      11                   1A       21200004600012                    MAIRIE   

  Complément nom EFA Dénomination EFA              Adresse (Operat)  \
0                NaN              NaN  RUE SAINT MICHEL A ANGOULEME   
1                NaN              NaN  17 BOULEVARD AUGUSTE BLANQUI   
2                NaN              NaN          LA CROIX DES ARCHERS   
3                NaN              NaN          LA CROIX DES ARCHERS   
4     HOTEL DE VILLE              NaN                       PL FOCH   

  Complément adresse OPR Commune OPR Code commune OPR  ...  \
0                    NaN

In [38]:
print(merged_df.columns)
print(merged_df.head())
print(f"merged lines : {len(merged_df)}")

Index(['ID EFA', 'Cas assujettissement', 'Identifiant occupant',
       'Nom occupant EFA', 'Complément nom EFA', 'Dénomination EFA',
       'Adresse (Operat)', 'Complément adresse OPR', 'Commune OPR',
       'Code commune OPR', 'Code postal OPR', 'Siège SIRN',
       'Nom établissement SIRN', 'Enseigne établissement SIRN', 'Adresse SIRN',
       'Complt adresse SIRN', 'Code postal SIRN', 'Code commune SIRN',
       'Commune maj SIRN', 'Commune SIRN', 'full_address', 'longitude',
       'latitude', 'result_score', 'result_score_next', 'result_label',
       'result_type', 'result_id', 'result_housenumber', 'result_name',
       'result_street', 'result_postcode', 'result_city', 'result_context',
       'result_citycode', 'result_oldcitycode', 'result_oldcity',
       'result_district', 'result_status'],
      dtype='object')
   ID EFA Cas assujettissement Identifiant occupant          Nom occupant EFA  \
0       5                    3       48458811600018       ROUSSELOT ANGOULEME   
1

In [40]:
geocoded_efa_df = merged_df[merged_df["result_score"] >= 0.8]
print(f"geocoded lines : {len(geocoded_efa_df)}")

214065
geocoded lines : 214065


In [44]:
def row_to_input(row):
    return {
        "ext_id": row["ID EFA"],
        "lat": row["latitude"],
        "lng": row["longitude"],
        "name": row["Nom occupant EFA"],
        "ban_id": row["result_id"] if row["result_type"] == "housenumber" else None
    }

In [47]:
inputs = []

for _, row in geocoded_efa_df.iterrows():
    inputs.append(row_to_input(row))

guesser = Guesser()
guesser.create_work_file(inputs, guess_path)


In [2]:
guesser = Guesser(batch_size=500)
guesser.handlers = [
    GeocodeNameHandler(sleep_time=0, photon_url="http://host.docker.internal:2322/api/", bbox_apothem_in_meters=1000), 
    GeocodeAddressHandler()
]

guesser.guess_work_file(guess_path)

  0%|          | 0/429 [00:00<?, ?it/s]

In [3]:
g = Guesser()
g.load_work_file(guess_path)

g.report()

-- Report --
Number of rows: 214065
Number of match: 163292 (76.28%)

-- finished_steps --
Rows with finished_steps closest_from_point: 0 (0.00%)
Rows with finished_steps geocode_address: 214065 (100.00%)
Rows with finished_steps geocode_name: 214065 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
precise_address_match                     100111
found_name_in_osm_point_on_bdg             59469
found_name_in_osm_isolated_closest_bdg      3712
Name: count, dtype: int64

-- match_reasons : % --
match_reason
precise_address_match                     46.766636
found_name_in_osm_point_on_bdg            27.780814
found_name_in_osm_isolated_closest_bdg     1.734053
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 157244 (73.46%)


In [4]:
sample_matches = g.matched_sample("found_name_in_osm_point_on_bdg", 20, ["input_ext_id", "matches", "input_name", "input_lat", "input_lng"],)

geocoded_addresses = pd.read_csv(geocoded_addresses_path)

sample_matches["input_ext_id"] = sample_matches["input_ext_id"].astype(str)
geocoded_addresses["ID EFA"] = geocoded_addresses["ID EFA"].astype(str)

merged = pd.merge(sample_matches, geocoded_addresses, left_on="input_ext_id", right_on="ID EFA", how="inner")



display(HTML(merged.to_html()))

Unnamed: 0,input_ext_id,matches,input_name,input_lat,input_lng,ID EFA,full_address,longitude,latitude,result_score,result_score_next,result_label,result_type,result_id,result_housenumber,result_name,result_street,result_postcode,result_city,result_context,result_citycode,result_oldcitycode,result_oldcity,result_district,result_status
0,230154,[3Q8788FYK5P6],CITY ZOO MAXI ZOO,48.589158,7.695902,230154,38 RUE CHARLES PEGUY 67200.0 STRASBOURG,7.695902,48.589158,0.927499,,38 Rue Charles Péguy 67200 Strasbourg,housenumber,67482_1203_00038,38.0,38 Rue Charles Péguy,Rue Charles Péguy,67200.0,Strasbourg,"67, Bas-Rhin, Grand Est",67482,,,,ok
1,378925,[CAH62WPMHQCE],MICROMANIA,50.377523,3.478283,378925,CTRE COMMERCIAL AUCHAN 59494.0 PETITE FORET,3.478283,50.377523,0.824805,0.733251,Centre commercial Auchan 59494 Petite-Forêt,street,59459_7e9ust,,Centre commercial Auchan,Centre commercial Auchan,59494.0,Petite-Forêt,"59, Nord, Hauts-de-France",59459,,,,ok
2,17942,[RG64628H2HP6],BRED BANQUE POPULAIRE,48.819547,2.46226,17942,4 ROUTE DE LA PYRAMIDE 75012 PARIS,2.46226,48.819547,0.975912,0.663065,4 Route de la Pyramide 75012 Paris,housenumber,75112_pa81ne_00004,4.0,4 Route de la Pyramide,Route de la Pyramide,75012.0,Paris,"75, Paris, Île-de-France",75112,,,Paris 12e Arrondissement,ok
3,89489,[JCTX5BBSDDSC],MONOPRIX,48.884003,2.298069,89489,159 RUE DE COURCELLES 75017.0 PARIS 17,2.298069,48.884003,0.857511,0.630776,159 Rue de Courcelles 75017 Paris,housenumber,75117_2387_00159,159.0,159 Rue de Courcelles,Rue de Courcelles,75017.0,Paris,"75, Paris, Île-de-France",75117,,,Paris 17e Arrondissement,ok
4,79230,[TJ8EZ4ZA4D1Z],MICROMANIA,45.774838,3.083678,79230,18 RUE D ALLAGNAT 63000.0 CLERMONT FERRAND,3.083678,45.774838,0.917008,0.727074,18 rue d'Allagnat 63000 Clermont-Ferrand,housenumber,63113_0140_00018,18.0,18 rue d'Allagnat,rue d'Allagnat,63000.0,Clermont-Ferrand,"63, Puy-de-Dôme, Auvergne-Rhône-Alpes",63113,,,,ok
5,182334,[AAE82HZP4JDB],B108 MEDIATHEQUE KERCADO,47.64875,-2.771417,182334,1 PLACE DE CUXHAVEN 56000.0 VANNES,-2.771417,47.64875,0.906755,,1 Place de Cuxhaven 56000 Vannes,housenumber,56260_0845_00001,1.0,1 Place de Cuxhaven,Place de Cuxhaven,56000.0,Vannes,"56, Morbihan, Bretagne",56260,,,,ok
6,239797,[TCAJ6Z6E1V21],SERVICE D'INCENDIE ET DE SECOURS DU BAS-RHIN,48.592994,7.66984,239797,2 ROUTE DE PARIS 67202.0 WOLFISHEIM,7.66984,48.592994,0.894838,,2 Route de Paris 67202 Wolfisheim,housenumber,67551_0447_00002,2.0,2 Route de Paris,Route de Paris,67202.0,Wolfisheim,"67, Bas-Rhin, Grand Est",67551,,,,ok
7,246163,[SEX7Q4W1QG68],LYCEE POLYVALENT JOSEPH FOURIER,47.791452,3.551721,246163,16 RUE POINCARE 89000.0 AUXERRE,3.551721,47.791452,0.901069,,16 Rue Poincaré 89000 Auxerre,housenumber,89024_5610_00016,16.0,16 Rue Poincaré,Rue Poincaré,89000.0,Auxerre,"89, Yonne, Bourgogne-Franche-Comté",89024,,,,ok
8,346399,[C5YKH2SQJYW3],LA BOULANGERIE DE L'EUROPE,49.29015,4.017772,346399,1 RUE LOUIS VEREL 51100.0 REIMS,4.017772,49.29015,0.901607,,1 Rue Louis Verel 51100 Reims,housenumber,51454_5634_00001,1.0,1 Rue Louis Verel,Rue Louis Verel,51100.0,Reims,"51, Marne, Grand Est",51454,,,,ok
9,62854,[PZTSF87XAAGX],HOTEL DE VILLE,47.080385,2.398769,62854,11 RUE JACQUES RIMBAULT 18000.0 BOURGES,2.398769,47.080385,0.916558,,11 Rue Jacques Rimbault 18000 Bourges,housenumber,18033_2955_00011,11.0,11 Rue Jacques Rimbault,Rue Jacques Rimbault,18000.0,Bourges,"18, Cher, Centre-Val de Loire",18033,,,,ok


In [63]:
# Create a custom handler to verify the match is not too far in case of match because of OSM
# see for problematic cases : https://www.notion.so/referentielnationaldesbatiments/Rapprochement-OPERAT-pas-PDL-1c743ec9d11e80379b8efacb2fecc865?pvs=4#1eb43ec9d11e809996cec3269173e97d


class VerifyDistanceHandler(AbstractHandler):
    _name = "verify_distance_handler"
    
    def __init__(self, max_radius, reasons):
        self.max_radius = max_radius
        self.reasons = reasons
    
    def _split_guesses(self, guesses: dict) -> tuple:
        
        to_handle = {}
        not_to_handle = {}

        for ext_id, guess in guesses.items():

            if self.name not in guess["finished_steps"] and len(guess["matches"]) > 0 and guess.get("match_reason") in self.reasons:
                    
                to_handle[ext_id] = guess
            else:
                not_to_handle[ext_id] = guess

        return to_handle, not_to_handle
    
    
    def _guess_batch(self, guesses: dict[str, Guess]) -> dict[str, Guess]:
        tasks = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for guess in guesses.values():
                future = executor.submit(self._guess_one, guess)
                # We comment out the line below since closing all connections might provoke with open connection where the query is not yet executed
                # future.add_done_callback(lambda future: connections.close_all())
                tasks.append(future)

            for future in concurrent.futures.as_completed(tasks):
                guess = future.result()
                guesses[guess["input"]["ext_id"]] = guess

        return guesses
    
    def _guess_one(self, guess: Guess) -> Guess:
        
        

        
        matched_rnb_id = guess["matches"][0]
        
        point = Point(guess["input"]["lng"], guess["input"]["lat"], srid=4326)
        
        

        is_in_radius = Building.objects.filter(rnb_id = matched_rnb_id).extra(
            where=[
                f"ST_DWITHIN(shape::geography, ST_MakePoint({guess["input"]["lng"]}, {guess["input"]["lat"]})::geography, {self.max_radius})"
            ]).annotate(distance=Distance("shape", point))
        

                
        if is_in_radius:
            return guess
        
        guess["matches"] = []
        guess["match_reason"] = None

        
        return guess


In [45]:
# testing the handler

dummy_guess = {
    "input": {
        "ext_id": "DUMMY",
        "lat": 47.702699,
        "lng": -1.389652
    },
    "match_reason": "found_name_in_osm_point_on_bdg",
    "matches": ["VWNDXJME8DRH"]
}

handler = VerifyDistanceHandler(110)

guess = handler._guess_one(dummy_guess)

print(guess)





{'inputs': {'ext_id': 'DUMMY', 'lat': 47.702699, 'lng': -1.389652}, 'match_reason': None, 'matches': []}


In [56]:
guesser = Guesser()
guesser.handlers = [VerifyDistanceHandler(110)]

guesser.guess_work_file(guess_path)

  0%|          | 0/43 [00:00<?, ?it/s]

In [57]:
g = Guesser()
g.load_work_file(guess_path)

g.report()

-- Report --
Number of rows: 214065
Number of match: 139944 (65.37%)

-- finished_steps --
Rows with finished_steps closest_from_point: 0 (0.00%)
Rows with finished_steps geocode_address: 214065 (100.00%)
Rows with finished_steps geocode_name: 214065 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
precise_address_match                     100111
found_name_in_osm_point_on_bdg             37370
found_name_in_osm_isolated_closest_bdg      2463
Name: count, dtype: int64

-- match_reasons : % --
match_reason
precise_address_match                     46.766636
found_name_in_osm_point_on_bdg            17.457314
found_name_in_osm_isolated_closest_bdg     1.150585
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 157244 (73.46%)


In [62]:
print(g.guesses.get("2916"))

{'input': {'ext_id': '2916', 'lat': 42.683426, 'lng': 2.901649, 'name': 'UNIVERSITE DE PERPIGNAN', 'ban_id': '66136_0098_00052'}, 'matches': ['S5WPNXGMDP7W', 'SVQ7PBGS8S14', 'XBVW66P3P7ST'], 'match_reason': 'precise_address_match', 'finished_steps': ['geocode_name', 'geocode_address', 'verify_distance_handler']}


In [66]:
guesser = Guesser()

VerifyDistanceHandler.name = "long_distance_verif"

handler = VerifyDistanceHandler(1000, ('precise_address_match'))

guesser.handlers = [handler]

guesser.guess_work_file(guess_path)

  0%|          | 0/43 [00:00<?, ?it/s]

In [67]:
g = Guesser()
g.load_work_file(guess_path)

g.report()

-- Report --
Number of rows: 214065
Number of match: 139944 (65.37%)

-- finished_steps --
Rows with finished_steps closest_from_point: 0 (0.00%)
Rows with finished_steps geocode_address: 214065 (100.00%)
Rows with finished_steps geocode_name: 214065 (100.00%)
Rows with empty finished_steps: 0 (0.00%)

-- match_reasons : absolute --
match_reason
precise_address_match                     100111
found_name_in_osm_point_on_bdg             37370
found_name_in_osm_isolated_closest_bdg      2463
Name: count, dtype: int64

-- match_reasons : % --
match_reason
precise_address_match                     46.766636
found_name_in_osm_point_on_bdg            17.457314
found_name_in_osm_isolated_closest_bdg     1.150585
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 157244 (73.46%)


In [2]:
g = Guesser()
g.load_work_file(guess_path)

g.to_csv('operat_rnb.csv', ext_id_col_name="EFA ID", one_rnb_id_per_row=True)