In [1]:
import os
import csv
import pandas as pd

from batid.services.guess_bdg_new import Guesser, GeocodeNameHandler, GeocodeAddressHandler
from IPython.display import display, HTML

efa_iub_path = "EFA_IUB.csv"
efa_surface_path = "EFA_SURFACE.csv"
efa_path = "EFA.csv"

geocoded_addresses_path = "geocoded_addresses.csv"

guess_path = "guesses.json"

os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

In [14]:
# EFA IUB

df = pd.read_csv(efa_iub_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

print("Dénomination bâtiment")
print(df["Dénomination bâtiment"].value_counts())

Index(['ID EFA', 'Péfixe', 'Section', 'Dénomination bâtiment', 'Parcelle',
       'Lot'],
      dtype='object')
   ID EFA Péfixe Section Dénomination bâtiment Parcelle       Lot
0    1812    000      BY            ARGENTEUIL     0589      0000
1    1812    000      BY            ARGENTEUIL     0628      0000
2    1812    000      BY            ARGENTEUIL     0630      0000
3    1813    115      BS           HIGH SQUARE     0168      0000
4    1813    115      BS           HIGH SQUARE     0121      0000
5    1535    107      AD                230STG     0010  STEINWAY
6    1536    107      AD                230STG     0010     AARPI
7    1803    108      AQ                   9RP     0077        ID
8    1807    108      AQ                   9RP     0077      GIMD
9    1806    108      AQ                   7RP     0077       7RP
413482 lignes
Dénomination bâtiment
Dénomination bâtiment
1                             2204
CENTRE COMMERCIAL             1789
MAIRIE                        1476

In [15]:
# EFA Surface

df = pd.read_csv(efa_surface_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

Index(['ID Déclaration consommation', 'Année consommation', 'ID EFA',
       'Identifiant occupant', 'Nom occupant EFA',
       'Surface moyenne annuelle EFA (m²)'],
      dtype='object')
   ID Déclaration consommation Année consommation  ID EFA  \
0                        32622               2020   44726   
1                       256709               2020  148143   
2                       130290               2020  153793   
3                      1116939               2022  435727   
4                      1144326               2023   88752   
5                       872624               2022  130758   
6                      1276576               2023  228156   
7                       957092               2022  225056   
8                       795358               2022  225921   
9                       323652               2021  243745   

  Identifiant occupant                Nom occupant EFA  \
0       34181606401147                     CHAUSS'EXPO   
1       67850126300175  

In [25]:
# EFA

df = pd.read_csv(efa_path, sep=";")

print(df.columns)
print(df.head(10))
print(f"{len(df)} lignes")

# A propos de cas_assujetissement
print("Cas assujettissement")
print(df["Cas assujettissement"].value_counts())
print("Complément adresse OPR")
print(df["Complément adresse OPR"].value_counts())



  df = pd.read_csv(efa_path, sep=";")


Index(['ID EFA', 'Cas assujettissement', 'Identifiant occupant',
       'Nom occupant EFA', 'Complément nom EFA', 'Dénomination EFA',
       'Adresse (Operat)', 'Complément adresse OPR', 'Commune OPR',
       'Code commune OPR', 'Code postal OPR', 'Siège SIRN',
       'Nom établissement SIRN', 'Enseigne établissement SIRN', 'Adresse SIRN',
       'Complt adresse SIRN', 'Code postal SIRN', 'Code commune SIRN',
       'Commune maj SIRN', 'Commune SIRN'],
      dtype='object')
   ID EFA Cas assujettissement        Identifiant occupant  \
0       5                    3              48458811600018   
1       6                   1A              77568873203099   
2       8                    2              80852873100025   
3       9                    2              80852873100025   
4      11                   1A              21200004600012   
5      28                   1A              83271794600012   
6      33                   1A              86680121000026   
7      34                

Il est probable qu'on utilise uniquement le fichier EFA.csv

Dedans, on a à dispo : 

- Nom occupant EFA
- Complément nom EFA

On pourra peut être les utiliser avec un Photon

- Les champs Adresse Operat à géocoder

Pour une recherche par adresse. 
On va devoir géocoder les adresses pour obtenir un lat/lng à utiliser avec Photon

# Géocoder les adresses

In [26]:
# On va tenter d'utiliser le géocodeur Photon mais on n'a pas de latitude/longitude.
# On va aller chercher lat/lng en géocodant les adresses fournies





df["full_address"] = df["Adresse (Operat)"].astype(str) + " " + df["Code postal OPR"].astype(str) + " " + df["Commune OPR"].astype(str)

new_df = df[['ID EFA', 'full_address' ]]
new_df.to_csv(geocoded_addresses_path)




In [29]:
# Split in chunks

chunks_size = 50_000
chunks = [new_df[i:i+chunks_size] for i in range(0, len(new_df), chunks_size)]

# Example: save each chunk into a separate CSV
for idx, chunk in enumerate(chunks):
    chunk.to_csv(f'output_chunk_{idx}.csv', index=False)

In [30]:
from batid.services.geocoders import BanBatchGeocoder
import glob
from io import StringIO

geocoder = BanBatchGeocoder()

files = sorted(glob.glob('output_chunk_*.csv'))
for file in files:
    print(file)
    response = geocoder.geocode_file(file, columns=["full_address"])
    response_csv = StringIO(response.text)
    df_geocoded = pd.read_csv(response_csv, sep=',')
    df_geocoded.to_csv(file, index=False)


    
        




output_chunk_0.csv
output_chunk_1.csv
output_chunk_2.csv
output_chunk_3.csv
output_chunk_4.csv
output_chunk_5.csv
output_chunk_6.csv


In [31]:
# merge all file into one
files = sorted(glob.glob('output_chunk_*.csv'))

df_list = [pd.read_csv(file) for file in files]

merged_df = pd.concat(df_list, ignore_index=True)

# Save into a new CSV
merged_df.to_csv(geocoded_addresses_path, index=False)

In [32]:
# remove the chunks
import os

for file in sorted(glob.glob('output_chunk_*.csv')):
    os.remove(file)

# Attacher les adresses géocodées aux données EFA

In [36]:
# On merge

efa_df = pd.read_csv(efa_path, sep=";")
address_df = pd.read_csv(geocoded_addresses_path)

merged_df = pd.merge(efa_df, address_df, on="ID EFA", how="inner")

  efa_df = pd.read_csv(efa_path, sep=";")


   ID EFA Cas assujettissement Identifiant occupant          Nom occupant EFA  \
0       5                    3       48458811600018       ROUSSELOT ANGOULEME   
1       6                   1A       77568873203099       APF FRANCE HANDICAP   
2       8                    2       80852873100025  GROUPE ROCHER OPERATIONS   
3       9                    2       80852873100025  GROUPE ROCHER OPERATIONS   
4      11                   1A       21200004600012                    MAIRIE   

  Complément nom EFA Dénomination EFA              Adresse (Operat)  \
0                NaN              NaN  RUE SAINT MICHEL A ANGOULEME   
1                NaN              NaN  17 BOULEVARD AUGUSTE BLANQUI   
2                NaN              NaN          LA CROIX DES ARCHERS   
3                NaN              NaN          LA CROIX DES ARCHERS   
4     HOTEL DE VILLE              NaN                       PL FOCH   

  Complément adresse OPR Commune OPR Code commune OPR  ...  \
0                    NaN

In [38]:
print(merged_df.columns)
print(merged_df.head())
print(f"merged lines : {len(merged_df)}")

Index(['ID EFA', 'Cas assujettissement', 'Identifiant occupant',
       'Nom occupant EFA', 'Complément nom EFA', 'Dénomination EFA',
       'Adresse (Operat)', 'Complément adresse OPR', 'Commune OPR',
       'Code commune OPR', 'Code postal OPR', 'Siège SIRN',
       'Nom établissement SIRN', 'Enseigne établissement SIRN', 'Adresse SIRN',
       'Complt adresse SIRN', 'Code postal SIRN', 'Code commune SIRN',
       'Commune maj SIRN', 'Commune SIRN', 'full_address', 'longitude',
       'latitude', 'result_score', 'result_score_next', 'result_label',
       'result_type', 'result_id', 'result_housenumber', 'result_name',
       'result_street', 'result_postcode', 'result_city', 'result_context',
       'result_citycode', 'result_oldcitycode', 'result_oldcity',
       'result_district', 'result_status'],
      dtype='object')
   ID EFA Cas assujettissement Identifiant occupant          Nom occupant EFA  \
0       5                    3       48458811600018       ROUSSELOT ANGOULEME   
1

In [40]:
geocoded_efa_df = merged_df[merged_df["result_score"] >= 0.8]
print(f"geocoded lines : {len(geocoded_efa_df)}")

214065
geocoded lines : 214065


In [44]:
def row_to_input(row):
    return {
        "ext_id": row["ID EFA"],
        "lat": row["latitude"],
        "lng": row["longitude"],
        "name": row["Nom occupant EFA"],
        "ban_id": row["result_id"] if row["result_type"] == "housenumber" else None
    }

In [47]:
inputs = []

for _, row in geocoded_efa_df.iterrows():
    inputs.append(row_to_input(row))

guesser = Guesser()
guesser.create_work_file(inputs, guess_path)


In [None]:
guesser = Guesser(batch_size=500)
guesser.handlers = [
    GeocodeNameHandler(sleep_time=0, photon_url="http://host.docker.internal:2322/api/", bbox_apothem_in_meters=1000), 
    GeocodeAddressHandler()
]

guesser.guess_work_file(guess_path)

  0%|          | 0/429 [00:00<?, ?it/s]

In [7]:
g = Guesser()
g.load_work_file(guess_path)

g.report()

-- Report --
Number of rows: 214065
Number of match: 44507 (20.79%)

-- finished_steps --
Rows with finished_steps closest_from_point: 0 (0.00%)
Rows with finished_steps geocode_address: 59000 (27.56%)
Rows with finished_steps geocode_name: 59000 (27.56%)
Rows with empty finished_steps: 155065 (72.44%)

-- match_reasons : absolute --
match_reason
precise_address_match                     26171
found_name_in_osm_point_on_bdg            17245
found_name_in_osm_isolated_closest_bdg     1091
Name: count, dtype: int64

-- match_reasons : % --
match_reason
precise_address_match                     12.225726
found_name_in_osm_point_on_bdg             8.055964
found_name_in_osm_isolated_closest_bdg     0.509658
Name: count, dtype: float64

-- Inputs --
rows with ban_id: 157244 (73.46%)


In [14]:
sample_matches = g.matched_sample("found_name_in_osm_point_on_bdg", 20, ["input_ext_id", "matches", "input_name", "input_lat", "input_lng"],)

geocoded_addresses = pd.read_csv(geocoded_addresses_path)

sample_matches["input_ext_id"] = sample_matches["input_ext_id"].astype(str)
geocoded_addresses["ID EFA"] = geocoded_addresses["ID EFA"].astype(str)

merged = pd.merge(sample_matches, geocoded_addresses, left_on="input_ext_id", right_on="ID EFA", how="inner")



display(HTML(merged.to_html()))

Unnamed: 0,input_ext_id,matches,input_name,input_lat,input_lng,ID EFA,full_address,longitude,latitude,result_score,result_score_next,result_label,result_type,result_id,result_housenumber,result_name,result_street,result_postcode,result_city,result_context,result_citycode,result_oldcitycode,result_oldcity,result_district,result_status
0,100932,[QDK4KY7WS74K],DARTY,48.944481,4.388121,100932,91 AV DU PRESIDENT ROOSEVELT 51470.0 SAINT MEMMIE,4.388121,48.944481,0.820607,,91 Avenue du Président Roosevelt 51470 Saint-Memmie,housenumber,51506_0365_00091,91.0,91 Avenue du Président Roosevelt,Avenue du Président Roosevelt,51470.0,Saint-Memmie,"51, Marne, Grand Est",51506,,,,ok
1,10779,[NBA5YVMJSM1F],LE GRAND REFECTOIRE,45.758843,4.836504,10779,3 COUR SAINT - HENRI 69002 LYON 02,4.836504,45.758843,0.869263,0.49419,3 Cour Saint-Henri 69002 Lyon,housenumber,69382_6380_00003,3.0,3 Cour Saint-Henri,Cour Saint-Henri,69002.0,Lyon,"69, Rhône, Auvergne-Rhône-Alpes",69382,,,Lyon 2e Arrondissement,ok
2,112878,[1ERMWZ6XBGQW],LIDL,51.026608,2.355444,112878,59 QUAI WILSON 59430.0 DUNKERQUE,2.355444,51.026608,0.827216,,Quai Wilson 59430 Dunkerque,street,59183_jw0xqw,,Quai Wilson,Quai Wilson,59430.0,Dunkerque,"59, Nord, Hauts-de-France",59183,59540.0,Saint-Pol-sur-Mer,,ok
3,137059,[57BQ2XJ7JSVQ],CENTRE ENDO NORD ISERE,45.59777,5.244406,137059,37 AVENUE DU MEDIPOLE 38300.0 BOURGOIN JALLIEU,5.244406,45.59777,0.922834,,37 Avenue du Médipôle 38300 Bourgoin-Jallieu,housenumber,38053_0615_00037,37.0,37 Avenue du Médipôle,Avenue du Médipôle,38300.0,Bourgoin-Jallieu,"38, Isère, Auvergne-Rhône-Alpes",38053,,,,ok
4,94891,[19KJ18M1FRMK],ESPACE DES MARQUES,46.704253,-1.431072,94891,56 RUE CLAUDE CHAPPE 85000.0 LA ROCHE SUR YON,-1.431072,46.704253,0.92366,0.477361,56 Rue claude chappe 85000 La Roche-sur-Yon,housenumber,85191_0541_00056,56.0,56 Rue claude chappe,Rue claude chappe,85000.0,La Roche-sur-Yon,"85, Vendée, Pays de la Loire",85191,,,,ok
5,110702,[XP2AXPVBX71G],RESIDENCE LES CANDELIES,45.919815,3.071044,110702,49 RUE ANTOINE FAUCHER 63140.0 CHATEL GUYON,3.071044,45.919815,0.919105,,49 Rue Antoine Faucher 63140 Châtel-Guyon,housenumber,63103_0030_00049,49.0,49 Rue Antoine Faucher,Rue Antoine Faucher,63140.0,Châtel-Guyon,"63, Puy-de-Dôme, Auvergne-Rhône-Alpes",63103,,,,ok
6,19120,[3FSZ4YM7GZ4J],HOPITAL PRIVE DU GRAND NARBONNE,43.187498,2.906523,19120,1 RUE DU PROFESSEUR CHRISTIAAN BARNARD 11100 MONTREDON DES CORBIERES,2.906523,43.187498,0.943579,,1 Rue du Professeur Christiaan Barnard 11100 Montredon-des-Corbières,housenumber,11255_0138_00001,1.0,1 Rue du Professeur Christiaan Barnard,Rue du Professeur Christiaan Barnard,11100.0,Montredon-des-Corbières,"11, Aude, Occitanie",11255,,,,ok
7,39869,[3S14YH57J4MR],EDF RENOUVELABLES FRANCE,48.890831,2.243493,39869,100 ESP DU GENERAL DE GAULLE 92400 COURBEVOIE,2.243493,48.890831,0.826538,,100 Esplanade Du Général De Gaulle 92400 Courbevoie,housenumber,92026_4138_00100,100.0,100 Esplanade Du Général De Gaulle,Esplanade Du Général De Gaulle,92400.0,Courbevoie,"92, Hauts-de-Seine, Île-de-France",92026,,,,ok
8,111364,[3ZZ4HRBP6DTA],MACIF,47.389177,0.697683,111364,2 RUE ALEXANDER FLEMING 37000.0 TOURS,0.697683,47.389177,0.916861,0.663232,2 Rue Alexander Fleming 37000 Tours,housenumber,37261_0097_00002,2.0,2 Rue Alexander Fleming,Rue Alexander Fleming,37000.0,Tours,"37, Indre-et-Loire, Centre-Val de Loire",37261,,,,ok
9,17596,[F3QCKR739EVG],SNIPES,45.444986,4.425027,17596,60 RUE EMILE ZOLA 42650 SAINT JEAN BONNEFONDS,4.425027,45.444986,0.95695,,60 Rue Émile Zola 42650 Saint-Jean-Bonnefonds,housenumber,42237_0260_00060,60.0,60 Rue Émile Zola,Rue Émile Zola,42650.0,Saint-Jean-Bonnefonds,"42, Loire, Auvergne-Rhône-Alpes",42237,,,,ok
