# 0 Download data from S3 

In [3]:
from notebooks.rapprochements.S3 import download_from_s3, upload_to_s3

# File names synchronised by upload()/download(): `data` entries are read from
# and written to the data/ folder, `results` entries to the results/ folder.
data = ['bat_rnb_mesr.csv', 'bat_rnb_mesr_with_id.csv']
results = ['results-closest.json', 'results-guess.json']

def upload():
    """Upload every file listed in ``data`` and ``results`` with ``upload_to_s3``.

    Each ``data/<name>`` path is paired with
    ``rapprochements/Enseignement supérieur/data/<name>`` and each
    ``results/<name>`` path with the matching ``.../results/<name>`` path.
    Data files are processed first, then result files.
    """
    transfers = [
        (f'data/{name}', f'rapprochements/Enseignement supérieur/data/{name}')
        for name in data
    ]
    transfers += [
        (f'results/{name}', f'rapprochements/Enseignement supérieur/results/{name}')
        for name in results
    ]

    for source, destination in transfers:
        upload_to_s3(source, destination)

        
def download():
    """Fetch every file listed in ``data`` and ``results`` with ``download_from_s3``.

    Each ``rapprochements/Enseignement supérieur/data/<name>`` path is paired
    with ``data/<name>`` and each ``.../results/<name>`` path with
    ``results/<name>``. Data files are processed first, then result files.
    """
    transfers = [
        (f'rapprochements/Enseignement supérieur/data/{name}', f'data/{name}')
        for name in data
    ]
    transfers += [
        (f'rapprochements/Enseignement supérieur/results/{name}', f'results/{name}')
        for name in results
    ]

    for source, destination in transfers:
        download_from_s3(source, destination)


# Function used to upload the data and the results to S3
#upload()

# If you do not have the files locally yet, you can download them by running this function.
download()

file is ready
file is ready
file is ready
file is ready


In [None]:
from notebooks.rapprochements.closest import find_closest_building
import csv
from pprint import pprint
import json

## 1 Création d'un identifiant unique par ligne

In [None]:
# Copy the input CSV to data/bat_rnb_mesr_with_id.csv, adding a 1-based
# "rapprochement_id" column that uniquely identifies each row.
# The input is read from data/, which is where download() stores it.
# newline="" is what the csv module expects for the files it reads and writes.
with open("data/bat_rnb_mesr.csv", "r", newline="") as f, \
        open("data/bat_rnb_mesr_with_id.csv", "w", newline="") as f_output:
    reader = csv.DictReader(f, delimiter=";")

    # Take the header from the reader rather than from the first row, so an
    # input without data rows still produces a valid (header-only) output.
    fieldnames = list(reader.fieldnames or [])
    if "rapprochement_id" not in fieldnames:
        fieldnames.append("rapprochement_id")

    writer = csv.DictWriter(f_output, fieldnames=fieldnames, delimiter=";")
    writer.writeheader()

    # Stream the rows instead of buffering the whole file in memory.
    for rapprochement_id, row in enumerate(reader, start=1):
        row["rapprochement_id"] = rapprochement_id
        writer.writerow(row)
        
        

#### Mini exploration du fichier input

In [None]:
import pandas as pd
import csv

# Load the id-augmented CSV into a DataFrame. csv.DictReader yields every
# value as a string, so all columns hold text.
with open("data/bat_rnb_mesr_with_id.csv", "r", newline="") as f:
    df = pd.DataFrame(csv.DictReader(f, delimiter=";"))

# Kept as the last top-level expression so the notebook actually renders the
# frame (an expression nested inside the `with` block is not displayed).
df

In [None]:
# List of column names
df.columns.tolist()

In [None]:
# Is 'Code bât/ter' a unique identifier? Count the rows per code, then count
# how many codes share each row count.
rows_per_code = df.groupby('Code bât/ter')['Code bât/ter'].count()
rows_per_code.value_counts().sort_values(ascending=False)
# => not really

In [None]:
# Number of distinct 'Code bât/ter' values (missing values are not counted).
df['Code bât/ter'].nunique()

In [None]:
# Number of non-null 'Code bât/ter' entries, to compare with the distinct count.
df['Code bât/ter'].count()

In [None]:
# A specific case: show every column of the rows sharing code '105864'
pd.set_option('display.max_columns', None)
df.loc[df['Code bât/ter'] == '105864']

## 2 Rapprochement par closest building

In [None]:
# Call find_closest_building for every input row and store, per
# rapprochement_id, the query URL, the input row and the returned result.
# The dict has its own name so that it does not overwrite the module-level
# `results` list that upload()/download() iterate over.
closest_results = {}

with open("data/bat_rnb_mesr_with_id.csv", "r", newline="") as f:
    for row in csv.DictReader(f, delimiter=";"):
        # 20 is passed as the third argument of find_closest_building
        # (presumably a search radius; confirm against its definition).
        url, closest_bdgs = find_closest_building(row["lat"], row["long"], 20)

        closest_results[row["rapprochement_id"]] = {
            'query': url,
            'input': row,
            'result': closest_bdgs,
        }

# The input file is closed before the output is written, and the output
# handle no longer reuses the name `f`.
with open('results/results-closest.json', 'w') as f_output:
    json.dump(closest_results, f_output)
        
    

## 3 Rapprochement par guess

In [None]:
import csv
import json
from pprint import pprint
from notebooks.rapprochements.guess import guess_all

def guess():
    """Run ``guess_all`` on every input row and save the output as JSON.

    Each row of ``data/bat_rnb_mesr_with_id.csv`` becomes a dict with:

    - ``ext_id``: the row's ``rapprochement_id``;
    - ``name``: the ``Libellé bât/ter`` column;
    - ``address``: the non-empty ``Adresse``, ``CP`` and ``Ville`` values
      joined with spaces;
    - ``point``: ``"<lat>,<long>"`` when both values are present, else ``None``.

    The value returned by ``guess_all(..., avoid_throttling=True)`` is written
    to ``results/results_guess.json``.
    """
    # NOTE(review): the `results` list used by upload()/download() names this
    # file 'results-guess.json' (hyphen) whereas it is written here as
    # 'results_guess.json' (underscore); confirm which name is intended.
    guess_inputs = []

    with open("data/bat_rnb_mesr_with_id.csv", "r", newline="") as f:
        for line in csv.DictReader(f, delimiter=";"):
            # Address: drop empty (or missing) parts before joining
            address_parts = [
                part
                for part in (line["Adresse"], line["CP"], line["Ville"])
                if part
            ]

            # Point: only built when both coordinates are available
            point = None
            if line["lat"] and line["long"]:
                point = f"{line['lat']},{line['long']}"

            guess_inputs.append({
                "ext_id": line["rapprochement_id"],
                "name": line["Libellé bât/ter"],
                "address": " ".join(address_parts),
                "point": point,
            })

    # The input file is already closed here, so it is not held open while
    # guess_all runs.
    results = guess_all(guess_inputs, avoid_throttling=True)

    # Context manager so the output file is flushed and closed deterministically.
    with open("results/results_guess.json", "w") as f_output:
        json.dump(results, f_output, indent=4)

In [None]:
# Run the guess step; it writes results/results_guess.json.
guess()

## 4 Analyse

### Histogramme de la distance entre le bâtiment trouvé et les coordonnées fournies en entrée

In [None]:
import json
import pandas as pd
# Summarise the distance between each input point and the result found by the
# "closest" approach: row count, exact hits, near hits, misses and a histogram.
# The parsed JSON has its own name so that it does not overwrite the
# module-level `data` list that upload()/download() iterate over.
with open('results/results-closest.json', 'r') as f:
    closest_by_id = json.load(f)

# A result that is not a dict carries no distance and is recorded as None.
distances = [
    row['result'].get('distance') if isinstance(row['result'], dict) else None
    for row in closest_by_id.values()
]

# Build the column explicitly as floats so the comparisons below also work
# when the list is empty or contains only None values.
df = pd.DataFrame({0: distances}, dtype="float64")

total = len(distances)
d_zero = (df[0] == 0).sum()
d_max_one = (df[0] <= 1).sum()
with_result = (df[0].notnull()).sum()

print('-- Rows')
print(total)

for label, count in (('-- Distance = 0', d_zero), ('-- Distance <= 1', d_max_one)):
    print(label)
    print(count)
    # No percentage can be computed when there are no rows at all.
    if total:
        print(f"{round(count / total * 100, 2)}%")

print('-- No results')
print(total - with_result)

print('-- Repartition')
df.hist(bins=10)
    
    

### Quelques stats sur les bâtiments identifiés par closest

In [None]:
import pandas as pd
from urllib.parse import urlparse, parse_qs

def get_lat_lon_from_query(query):
    """Return ``(lat, lon)`` as floats from the ``point`` parameter of a URL.

    The first ``point`` value of the query string is expected to be two
    comma-separated numbers, e.g. ``...?point=48.85,2.35``.

    Raises:
        KeyError: the URL has no ``point`` parameter.
        ValueError: the value is not exactly two comma-separated numbers.
    """
    point_values = parse_qs(urlparse(query).query)['point']
    lat_text, lon_text = point_values[0].split(",")
    return float(lat_text), float(lon_text)


def get_data(item):
    """Return the item's result fields together with the queried coordinates.

    Args:
        item: one entry of the closest results, holding a ``'query'`` URL and
            optionally a ``'result'``.

    Returns:
        A new dict made of the ``'result'`` fields (when ``'result'`` is a
        dict) plus ``'lat_input'`` and ``'lon_input'`` parsed from the query
        URL. A missing, ``None`` or non-dict result contributes no fields.
    """
    lat, lon = get_lat_lon_from_query(item['query'])

    result = item.get('result')
    # Anything that is not a dict is treated as "no result". The fields are
    # copied into a fresh dict so the caller's item is left untouched.
    fields = result if isinstance(result, dict) else {}
    return {**fields, 'lat_input': lat, 'lon_input': lon}

# The parsed JSON has its own name so that it does not overwrite the
# module-level `data` list that upload()/download() iterate over.
with open('results/results-closest.json', 'r') as f:
    closest_by_id = json.load(f)

# One record per input row: its id and the flattened result + input coordinates
result_values = [
    {'id': ext_id, 'data': get_data(item)}
    for ext_id, item in closest_by_id.items()
]

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(result_values)
# Expand the single 'data' column into one column per field
df_closest = pd.json_normalize(df['data'])

# Add a column giving how many times each rnb_id was identified, which can
# hint at "bad" building splits.
df_closest["count"] = df_closest.groupby('rnb_id')['rnb_id'].transform('count')

# Kept as the last top-level expression so the notebook actually renders the
# frame (an expression nested inside the `with` block is not displayed).
df_closest

### occurrences du même rnb_id dans les résultats

In [None]:
 # For each value of "count", the number of rows whose rnb_id has that count
 print(df_closest.groupby("count")[["rnb_id"]].count())

### nombre de bâtiments identifiés avec distance à 0 et unique résultat

In [None]:
# Rows whose rnb_id was identified exactly once and whose distance is 0.
# Both conditions are built from df_closest; `expanded_df` is not defined
# anywhere in this notebook.
filtered_df = df_closest[(df_closest["count"] == 1.0) & (df_closest["distance"] == 0.0)]
print(filtered_df.count())

### Comparaison des résultats exacts et fuzzy

In [None]:
def first_result(item):
    """Return the first entry of ``item['result']``, or ``{}`` when there is none.

    A missing, ``None`` or empty ``'result'`` all yield an empty dict.
    """
    candidates = item['result'] if 'result' in item else None
    # Truthiness covers both None (a null in the JSON) and an empty list,
    # where calling len() on None would raise.
    if not candidates:
        return {}
    return candidates[0]

# The parsed JSON has its own name so that it does not overwrite the
# module-level `data` list that upload()/download() iterate over.
# NOTE(review): download() fetches 'results/results-guess.json' (hyphen)
# whereas this cell reads 'results/results_guess.json' (underscore), the name
# written by guess(); confirm which name is intended.
with open("results/results_guess.json") as f_guess:
    guess_by_id = json.load(f_guess)

# Keep only the first guess of each entry, then expand it into columns
result_values = [
    {'id': ext_id, 'result': first_result(item)}
    for ext_id, item in guess_by_id.items()
]
df = pd.DataFrame(result_values)
df_guess = pd.json_normalize(df['result'])
    

In [None]:
# Combine the closest and guess results side by side in a single DataFrame.
# Rows are paired on their index, and columns present in both frames get the
# '_closest' / '_guess' suffixes.
# NOTE(review): index pairing presumably relies on both result files listing
# the same ids in the same order; confirm.
df_merge = pd.merge(
    df_closest,
    df_guess,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_closest', '_guess'),
)
df_merge

In [None]:
# Number of rows for which both approaches returned the same rnb_id
same_building = df_merge['rnb_id_closest'] == df_merge['rnb_id_guess']
df_merge.loc[same_building, 'rnb_id_closest'].count()

#### Cas où l'input tombe sur un bâtiment

In [None]:
# Number of rows where the input point is on the building (distance of 0)
on_building = df_merge['distance'] == 0.0
df_merge.loc[on_building, 'rnb_id_closest'].count()

In [None]:
# Number of rows where the input point is on the building (distance of 0)
# and both approaches returned the same rnb_id
on_building = df_merge['distance'] == 0.0
same_building = df_merge['rnb_id_closest'] == df_merge['rnb_id_guess']
df_merge.loc[on_building & same_building, 'rnb_id_closest'].count()

In [None]:
# Rows where the input point is on the building (distance of 0) but the guess
# approach returned a different rnb_id
pd.set_option('display.max_columns', None)
on_building = df_merge['distance'] == 0.0
different_building = df_merge['rnb_id_closest'] != df_merge['rnb_id_guess']
df_why_guess_different = df_merge[on_building & different_building]
df_why_guess_different.count()

In [None]:
def diff_geojson(df):
    """Serialize the closest/guess comparison rows of *df* as a GeoJSON string.

    For every row, five features are emitted in this order: the input point
    (carrying all of the row's columns as properties), the "closest" point,
    the "guess" point, and one LineString from the input point to each of
    the two.

    Args:
        df: DataFrame with the columns ``lon_input``, ``lat_input``,
            ``point.coordinates_closest``, ``rnb_id_closest``,
            ``point.coordinates_guess`` and ``rnb_id_guess``.

    Returns:
        A JSON string holding a GeoJSON ``FeatureCollection``. Float NaN
        values are written as the string ``"NaN"``; string values are left
        untouched, even when they contain the text "NaN".
    """

    def json_safe(value):
        # json.dumps would emit a bare NaN token, which is not valid JSON.
        # NaN is the only float that is not equal to itself. Replacing the
        # values (instead of patching the serialized text) keeps strings that
        # merely contain "NaN" intact.
        if isinstance(value, float) and value != value:
            return "NaN"
        if isinstance(value, dict):
            return {key: json_safe(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [json_safe(item) for item in value]
        return value

    def feature(geometry_type, coordinates, properties):
        # Single place where the GeoJSON Feature layout is defined.
        return {
            "type": "Feature",
            "geometry": {"type": geometry_type, "coordinates": coordinates},
            "properties": properties,
        }

    features = []
    for i, row in df.iterrows():
        input_coords = [row["lon_input"], row["lat_input"]]
        closest_coords = row["point.coordinates_closest"]
        guess_coords = row["point.coordinates_guess"]

        features.extend([
            feature("Point", input_coords, {"type": "input", "id": i, **row}),
            feature("Point", closest_coords, {"type": "closest", "rnb_id": row["rnb_id_closest"]}),
            feature("Point", guess_coords, {"type": "guess", "rnb_id": row["rnb_id_guess"]}),
            feature("LineString", [input_coords, closest_coords], {}),
            feature("LineString", [input_coords, guess_coords], {}),
        ])

    geojson = {"type": "FeatureCollection", "features": features}
    return json.dumps(json_safe(geojson))

# GeoJSON string for the rows where the input is on a building but guess disagrees
diff_geojson(df_why_guess_different)

#### différence guess / closest cas général (l'input tombe sur un bâtiment ou pas)

In [None]:
# Number of rows where closest gave no result (and so differs from guess)
closest_missing = df_merge['rnb_id_closest'].isna()
different_building = df_merge['rnb_id_closest'] != df_merge['rnb_id_guess']
df_merge.loc[different_building & closest_missing, 'lat_input'].count()

In [None]:
# Rows where both approaches returned an rnb_id but the two ids differ,
# whether or not the input point is on a building
both_found = df_merge['rnb_id_closest'].notna() & df_merge['rnb_id_guess'].notna()
different_building = df_merge['rnb_id_closest'] != df_merge['rnb_id_guess']
df_why_guess_different_all = df_merge[different_building & both_found]
df_why_guess_different_all

diff_geojson(df_why_guess_different_all)