# Tertiaire

In [2]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
usecols = ['N°DPE', 'Identifiant__BAN', 'Code_postal_(BAN)', 'Code_postal_(brut)']

df_tertiaire = pd.read_csv('notebooks/rapprochements/DPE/data_oct_2024/dpe-v2-tertiaire-2.csv', sep=',', usecols=usecols)
df_tertiaire['file'] = 'tertiaire'

df_neuf = pd.read_csv('notebooks/rapprochements/DPE/data_oct_2024/dpe-v2-logements-neufs.csv', sep=',', usecols=usecols)
df_neuf['file'] = 'logement-neuf'

df_existant = pd.read_csv('notebooks/rapprochements/DPE/data_oct_2024/dpe-v2-logements-existants.csv', sep=',', usecols=usecols)
df_existant['file'] = 'logement-existant'

df = pd.concat([df_tertiaire, df_neuf, df_existant])

In [3]:
df.shape

(10701835, 5)

In [4]:
df.head()

Unnamed: 0,N°DPE,Identifiant__BAN,Code_postal_(BAN),Code_postal_(brut),file
0,2363T1680837R,63124_0035_00024_ter,63800.0,63800,tertiaire
1,2269T2615953W,69228_0086_00707,69440.0,69440,tertiaire
2,2357T4209637Z,57540_0300_00004,57370.0,57370,tertiaire
3,2360T0127880L,60073_0075_00007,60650.0,60650,tertiaire
4,2271T0492563C,71309_0070_00014,71710.0,71710,tertiaire


In [5]:
# create 100 subfiles in the dpe_existant folder, using the number of rows in the dataframe
for i in range(0, 100):
    df.iloc[i*df.shape[0]//100:(i+1)*df.shape[0]//100].to_csv(f'notebooks/rapprochements/DPE/sub_files/dpe-{i}.csv', index=False)

In [None]:
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
from django.db import connection
from concurrent.futures import ThreadPoolExecutor
import numpy as np

def get_rnb_id(row):
    cursor = connection.cursor()

    if row['Code_postal_(BAN)'] != row['Code_postal_(brut)']:
        return []
    
    ban_id = row['Identifiant__BAN']

    sql = f"""
            with rnb_ids as (
            select
                rnb_id
            from
                batid_buildingaddressesreadonly bb
            left join batid_building bb2 on
                bb2.id = bb.building_id
            where
                address_id = '{ban_id}'
                and ST_AREA(shape::geography) > 25)
            select
                array_agg(rnb_id)
            from
                rnb_ids;
    """
    cursor.execute(sql)
    result = cursor.fetchone()
    return result[0] if result[0] is not None else []

def execute(df):
    df_copy = df.copy()
    df_copy['rnb_id'] = df_copy.apply(get_rnb_id, axis=1)
    return df_copy


def process_sub_file(i):
    print(f"processing file {i}")
    if not os.path.exists(f'notebooks/rapprochements/DPE/sub_files_results/dpe-{i}-result.csv'):
        df_sub_file = pd.read_csv(f'notebooks/rapprochements/DPE/sub_files/dpe-{i}.csv', sep=',')
        max_workers = 50
        dfs = np.array_split(df_sub_file, max_workers)

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results_sub_file = executor.map(execute, dfs)
            df_result = pd.concat(results_sub_file)
            df_result.to_csv(f'notebooks/rapprochements/DPE/sub_files_results/dpe-{i}-result.csv', index=False)

for i in range(100):
    process_sub_file(i)

In [3]:
dfs_result = []

for i in range(100):
    df_result = pd.read_csv(f'notebooks/rapprochements/DPE/sub_files_results/dpe-{i}-result.csv', usecols=['N°DPE', 'file', 'rnb_id'])
    dfs_result.append(df_result)

df_final = pd.concat(dfs_result)
df_final['rnb_id'] = df_final['rnb_id'].apply(eval)


In [None]:
df_final.to_csv('notebooks/rapprochements/DPE/results_DPE_RNB.csv', index=False)

In [4]:
df_final['n'] = df_final['rnb_id'].apply(len)
df_final.head()

Unnamed: 0,N°DPE,file,rnb_id,n
0,2363T1680837R,tertiaire,[3HVCQ22D89TQ],1
1,2269T2615953W,tertiaire,"[4YHBR2QC67QF, ESW1SCP3YRKF, QZW45SDXBJDF, XQE...",10
2,2357T4209637Z,tertiaire,[Z5QNQZGKQ63V],1
3,2360T0127880L,tertiaire,"[SV4KCWDCK333, K37C6J5ZA9FT, 386R58WKYE81]",3
4,2271T0492563C,tertiaire,[N7BSTQWD61YE],1


In [5]:
print("tertiaire")
print(len(df_final[(df_final['n'] == 1) & (df_final['file'] == 'tertiaire')]) / len(df_final[df_final['file'] == 'tertiaire']), "% rapproché")
print(len(df_final[df_final['file'] == 'tertiaire']), "lignes")

tertiaire
0.489017469541 % rapproché
323878 lignes


In [6]:
print("neuf")
print(len(df_final[(df_final['n'] == 1) & (df_final['file'] == 'logement-neuf')]) / len(df_final[df_final['file'] == 'logement-neuf']), "% rapproché")
print(len(df_final[df_final['file'] == 'logement-neuf']), "lignes")

neuf
0.2427297740650072 % rapproché
916237 lignes


In [7]:
print("existant")
print(len(df_final[(df_final['n'] == 1) & (df_final['file'] == 'logement-existant')]) / len(df_final[df_final['file'] == 'logement-existant']), "% rapproché")
print(len(df_final[df_final['file'] == 'logement-existant']), "lignes")

existant
0.5911936730319646 % rapproché
9461720 lignes


In [9]:
print("total")
print(len(df_final[(df_final['n'] == 1)]) / len(df_final), "% rapproché avec succès")
print(len(df_final[(df_final['n'] == 1)]), 'ligne rapprochées avec succes')
print(len(df_final), "lignes")

total
0.5582677176390778 % rapproché avec succès
5974489 ligne rapprochées avec succes
10701835 lignes


: 