## Intro

### Scoring Logic Description

1. We start by importing the odis dataframe from a CSV that includes all the relevant datapoint to score and display data
2. We compute the scores for each criteria specific to the commune (independant from subject)
3. We compute the scores for each criteria specific to the subject (dependand from both subject and commune) 
4. We identify all commune<->neighbour pairs (binômes) for each commune within search radius
5. We compute category scores (emploi, logement, education etc...) as an average of the all the scores for a given category
6. For each commune we compare the commune and neighbour category scores and weighted the highest one with category weights defined by subject and then keep the best weighted score for each commune
8. We display result in on a map

In [1]:
# THIS SHOULD BE THE END OF JUPYTER NOTEBOOK EXPORT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from scipy import stats
import folium as flm #required for gdf.explore()
import shapely as shp
from shapely.wkt import loads
from shapely.geometry import Polygon
from sklearn import preprocessing

## 1. Fetching key indicators from ODIS source file

In [2]:
def init_loading_datasets(odis_file, scores_cat_file, metiers_file, formations_file, ecoles_file):
    odis = gpd.GeoDataFrame(gpd.read_parquet(odis_file))
    odis.set_geometry(odis.polygon, inplace=True)
    odis = odis[~odis.polygon.isna()]

    # Index of all scores and their explanations
    scores_cat = pd.read_csv(scores_cat_file)

    #Later we need the code FAP <-> FAP Name used to classify jobs
    codfap_index = pd.read_csv(metiers_file, delimiter=';')

    # Later we need the code formation <-> Formation Name used to classify trainings
    # source: https://www.data.gouv.fr/fr/datasets/liste-publique-des-organismes-de-formation-l-6351-7-1-du-code-du-travail/
    codformations_index = pd.read_csv(formations_file).set_index('codformation')

    # Etablissements scolaires
    annuaire_ecoles = pd.read_parquet(ecoles_file)
    annuaire_ecoles.geometry = annuaire_ecoles.geometry.apply(shp.from_wkb)

    return odis, scores_cat, codfap_index, codformations_index, annuaire_ecoles

## 2. Distance filter + Gathering nearby Communes Scores

In [3]:
# Filtering dataframe based on subject distance preference (to save on compute time later on)
def filter_loc_by_distance(df, distance):
    return df[df.dist_current_loc < distance * 1000]

# Put None as a score in the monome case
def monome_cleanup(df):
    mask = ~df['binome']
    for col in df.columns:
        if col.endswith('_binome'):
            df.loc[mask, col] = None
    return df

In [4]:
def adding_score_voisins(df_search, scores_cat):
    #df_search is the dataframe pre-filtered by location
    #df_source is the dataframe with all the communes
    binome_columns = ['codgeo','libgeo','polygon','epci_code','epci_nom']+scores_cat[scores_cat.incl_binome]['score'].to_list()+scores_cat[scores_cat.incl_binome]['metric'].to_list()
    df_binomes = df_search[binome_columns].copy()

    # Adds itself to list of voisins = monome case
    # Note: this code triggers the SettingWithCopyWarning but I don't know how to fix it...
    df_search.codgeo_voisins = df_search.apply(lambda x: np.append(x.codgeo_voisins, x.codgeo), axis=1)

    # Explodes the dataframe to have a row for each voisins + itself
    df_search['codgeo_voisins_copy'] = df_search['codgeo_voisins']
    df_search_exploded = df_search.explode('codgeo_voisins_copy')
    
    # For each commune (codgeo) in search area (df_search) we add all its voisin's scores
    odis_search_exploded = pd.merge(df_search_exploded, df_binomes.add_suffix('_binome'), left_on='codgeo_voisins_copy', right_on='codgeo_binome', how='left')
    
    # Adds a column to identify binomes vs monomes + cleanup
    odis_search_exploded['binome'] = odis_search_exploded.apply(lambda x: False if x.codgeo == x.codgeo_binome else True, axis=1)
    odis_search_exploded.drop(columns={'codgeo_voisins_copy'}, inplace=True)

    #We remove all values for the monome case to avoid accounting for them in the category score calculation
    odis_search_exploded = monome_cleanup(odis_search_exploded)

    return odis_search_exploded

## 3. Criterias Scoring

In [5]:
#Computing distance from current commune 
#Using a crs that allows to compute distance in meters for metropolitan France

def distance_calc(df, ref_point):
    return int(df.distance(ref_point))

def add_distance_to_current_loc(df, current_codgeo):
    projected_crs = "EPSG:2154"
    zone_recherche = gpd.GeoDataFrame(df[df.codgeo == current_codgeo]['polygon'])
    zone_recherche.set_geometry('polygon', inplace=True)
    zone_recherche.to_crs(projected_crs, inplace=True)

    df.to_crs(projected_crs, inplace=True)
    df['dist_current_loc'] = df['polygon'].apply(distance_calc, ref_point=zone_recherche.iloc[0].polygon)
    return df

In [6]:
#Adding score specific to subject looking for a job identified as en besoin
def codes_match(df, codes_list):
    #returns a list of codfaps that matches
    if df is None:
        return []
    return list(set(df.tolist()).intersection(set(codes_list)))

def fap_names_lookup(df):
    return list(codfap_index[codfap_index['Code FAP 341'].isin(df)]['Intitulé FAP 341'])

In [7]:
def compute_criteria_scores(df, prefs): 
    df = df.copy()
    df['met_ratio']= 1000 * df.met/df.pop_be
    #met_tension_ratio est le ratio d'offres population de la zone (pour 1000 habitants)
    df['met_tension_ratio'] = 1000 * df.met_tension/df.pop_be

    #svc_ratio est le ratio de services d'inclusion de la commune (pour 1000 habitants)
    df['svc_incl_ratio'] = 1000 * df.svc_incl_count/df.pop_be

    #log_vac_ratio est le ratio de logements vacants de la commune % total logements
    df['log_vac_ratio'] = df.log_vac/df.log_total

    #log_5p+_ratio est le ratio de residences principales de 5 pièces ou plus % total residences principales

    df['log_5p_ratio'] = df['rp_5+pieces']/df.log_rp

    # Risque de fermeture école: ratio de classe à risque de fermeture % nombre d'écoles
    df['risque_fermeture_ratio'] = df.risque_fermeture/df.ecoles_ct

    #Scaling with PowerTransformer so that 
    # 1. outliers don't impact too much the end result
    # 2. all scores are normaly distributed and centered around 0
    #pt = preprocessing.PowerTransformer()
    t = preprocessing.QuantileTransformer(output_distribution="uniform")
    df['met_scaled'] = t.fit_transform(df[['met_ratio']].fillna(0))
    df['met_tension_scaled'] = t.fit_transform(df[['met_tension_ratio']].fillna(0))
    df['svc_incl_scaled'] = t.fit_transform(df[['svc_incl_ratio']].fillna(0))
    df['log_vac_scaled'] = t.fit_transform(df[['log_vac_ratio']].fillna(0))
    df['log_5p_scaled'] = t.fit_transform(df[['log_5p_ratio']].fillna(0))
    df['classes_ferm_scaled'] = t.fit_transform(df[['risque_fermeture_ratio']].fillna(0))
    df['pol_scaled'] = df[['pol_num']].astype('float')
    # Let's create subject-specific scores
    t = preprocessing.QuantileTransformer(output_distribution="uniform")
    #For each adult we look for jobs categories that match what is needed
    i=1
    for adult in prefs['codes_metiers']:
        df['met_match_codes_adult'+str(i)] = df.be_codfap_top.apply(codes_match, codes_list=prefs['codes_metiers'][adult])
        df['met_match_adult'+str(i)] = df['met_match_codes_adult'+str(i)].apply(len)
        df['met_match_adult'+str(i)+'_scaled'] = t.fit_transform(df[['met_match_adult'+str(i)]].fillna(0))
        i+=1

    j=1
    for adult in prefs['codes_formations']:
        df['form_match_codes_adult'+str(j)] = df.codes_formations.apply(codes_match, codes_list=prefs['codes_formations'][adult])
        df['form_match_adult'+str(j)] = df['form_match_codes_adult'+str(j)].apply(len)
        df['form_match_adult'+str(j)+'_scaled'] = t.fit_transform(df[['form_match_adult'+str(j)]].fillna(0))
        j+=1

    # We compute the distance from the current location 
    df['reloc_dist_scaled'] = (1-df['dist_current_loc']/(prefs['loc_distance_km']*1000))
    df['reloc_epci_scaled'] = np.where(df['epci_code'] == df[df.codgeo == prefs['commune_actuelle']]['epci_code'].iloc[0],1,0)
    
    return df

## 4. Category Scoring

In [8]:
def compute_cat_scores(df, scores_cat, penalty):
    df = df.copy()
    df_binome = pd.DataFrame()
    columns_in_use = set(df.columns) & set(scores_cat.score)
    columns_in_use_binome = set(df.columns) & set([score+'_binome' for score in scores_cat.score])
    for cat in set(scores_cat.cat):
        cat_scores_indices = [score for score in scores_cat[scores_cat['cat'] == cat]['score'] if score in columns_in_use]
        cat_scores_indices_binome = [score+'_binome' for score in scores_cat[scores_cat['cat'] == cat]['score'] if score+'_binome' in columns_in_use_binome]

        # Efficiently select all relevant rows at once
        cat_scores_df = df[cat_scores_indices]
        for col in cat_scores_indices_binome:
            mask = df[col].notna()
            df_binome[col] = pd.to_numeric(df[col], errors='coerce')
            df_binome.loc[mask, col] = df.loc[mask, col] * (1-penalty) 
            cat_scores_df = pd.concat([cat_scores_df, df_binome[col]], axis=1)
        df[cat + '_cat_score'] = 100 * cat_scores_df.astype(float).mean(axis=1)

    return df

## 5. Final Binome Score Weighted

In [9]:
def compute_binome_score_old(df, binome_penalty, prefs):
    scores_col = [col for col in df.columns if col.endswith('_cat_score')]
    max_scores = pd.DataFrame()
    
    for col in scores_col:
        cat_weight = prefs[col.split('_')[0]]
        max_scores[col] = cat_weight * np.where(
            df[col] >= (1-binome_penalty)*df[col+'_binome'],
            df[col],
            (1-binome_penalty)*df[col+'_binome']
            )
    
    return max_scores.mean(axis=1).round(1)


In [10]:
def compute_binome_score(df, prefs):
    scores_cat_col = [col for col in df.columns if col.endswith('_cat_score')]
    weighted_scores = pd.DataFrame()
    
    for col in scores_cat_col:
        cat_weight = prefs[col.split('_')[0]]
        weighted_scores[col] = cat_weight * df[col]
    
    return weighted_scores.astype(float).mean(axis=1)

In [11]:
def best_score_compute(df):
    #Keeping the best (top #1) monome or binome result for each commune
    best = df.sort_values('weighted_score', ascending=False).groupby('codgeo').head(1)
    return best

In [12]:
#Main function that aggregates most of the above in one sequence
def compute_odis_score(df, scores_cat, prefs):
    df = add_distance_to_current_loc(df, current_codgeo=prefs['commune_actuelle'])

    # We filter by distance to reduce the compute cost on a smaller odis_search dataframe
    odis_search = filter_loc_by_distance(df, distance=prefs['loc_distance_km'])

    # We compute the subject specific scores
    odis_scored = compute_criteria_scores(odis_search, prefs=prefs)

    # We add the criteria scores for all neighbor communes forming monomes and binomes
    odis_exploded = adding_score_voisins(odis_scored, scores_cat)

    # We compute the category scores for both the target and the binome
    odis_exploded = compute_cat_scores(odis_exploded, scores_cat=scores_cat, penalty=prefs['binome_penalty'])

    # We computing the final weighted score for all commune<->voisin combinations
    odis_exploded['weighted_score'] = compute_binome_score(odis_exploded, prefs=prefs)

    # We keep best monome or binome for each commune 
    odis_search_best = best_score_compute(odis_exploded)

    return odis_search_best


## 6. Generating Narrative

Here we want to generate a 'human readable' explanation about why scored high a given location.
Things to show:
- Target commune name and EPCI
- Weighted Score
- If Binome, show the binome and EPCI if different from target
- Show top 3 criterias target (weighted ?) 
- Show top 3 criterias for binome (weighted ?)

In [13]:
def produce_pitch(df, prefs, scores_cat, codfap_index, codformations_index):
    pitch_lines = []
    pitch_lines += [df.loc['libgeo'] +' dans l\'EPCI: '+ df.loc['epci_nom']]
    pitch_lines += ['Le score est de: '+str(df.loc['weighted_score'])]
    if df.loc['binome']:
        pitch_lines += ['Ce score est obtenu en binome avec la commune '+df.loc['libgeo_binome']]
        if df.loc['epci_code'] != df.loc['epci_code_binome']:
            pitch_lines += ['Cette commune est située dans l\'EPCI: '+df.loc['epci_nom_binome']]
    else:
        pitch_lines += ['Ce score est obtenu sans commune binôme']

    
    #Adding the top contributin criterias
    crit_scores_col = [col for col in df.index if '_scaled' in col]#col.endswith('_scaled')]
    
    df_sorted=df[crit_scores_col].dropna().sort_values(ascending=False)
    for i in range(0, 5):
        score = df_sorted.index[i][:-7] if df_sorted.index[i].endswith('_binome') else df_sorted.index[i]
        score_name = scores_cat[scores_cat.score == score]['score_name'].item()
        pitch_lines += ['Le critère #'+str(i+1)+' est: '+score_name+' avec un score de: '+str(df_sorted.iloc[i])]

    
    #Adding the matching job families if any
    metiers_col = [col for col in df.index if col.startswith('met_match_codes')]
    for metiers_adultx in metiers_col:
        matched_codfap_names = list(codfap_index[codfap_index['Code FAP 341'].isin(df.loc[metiers_adultx])]['Intitulé FAP 341'])
        if len(matched_codfap_names) == 1:
            pitch_lines += ['La famille de métiers '+ matched_codfap_names[0] +' est rechechée'] 
        elif len(matched_codfap_names) >= 1:
            list_jobs = ''
            for job in matched_codfap_names:
                list_jobs += job+', '
            pitch_lines += ['Les familles de métiers '+ list_jobs[:-2] +' sont rechechées']
    
    return pitch_lines

In [17]:
def produce_pitch_markdown(df, prefs, scores_cat, codfap_index, codformations_index):
    pitch_md = []
    pitch_md.append(f'**{df.loc['libgeo']}** dans l\'EPCI: {df.loc['epci_nom']}\n')
    pitch_md.append(f'Le score est de: **{str(df.loc['weighted_score'])}**\n')
    if df.loc['binome']:
        pitch_md.append(f'Ce score est obtenu en binome avec la commune {df.loc['libgeo_binome']}')
        if df.loc['epci_code'] != df.loc['epci_code_binome']:
            pitch_md.append(f' située dans l\'EPCI: {df.loc['epci_nom_binome']}\n')
    else:
        pitch_md += f'Ce score est obtenu sans commune binôme'

    
    #Adding the top contributing criterias
    crit_scores_col = [col for col in df.index if '_scaled' in col]#col.endswith('_scaled')]
    
    df_sorted=df[crit_scores_col].dropna().sort_values(ascending=False)
    for i in range(0, 5):
        score = df_sorted.index[i][:-7] if df_sorted.index[i].endswith('_binome') else df_sorted.index[i]
        score_name = scores_cat[scores_cat.score == score]['score_name'].item()
        pitch_md.append(f'- Le critère #{str(i+1)} est: **{score_name}** avec un score de: **{str(df_sorted.iloc[i])}**\n')

    
    #Adding the matching job families if any
    pitch_md.append('**Emploi**\n') 
    metiers_col = [col for col in df.index if col.startswith('met_match_codes')]
    matched_codfap_names = []
    for metiers_adultx in metiers_col:
        matched_codfap_names += list(codfap_index[codfap_index['Code FAP 341'].isin(df.loc[metiers_adultx])]['Intitulé FAP 341'])
    matched_codfap_names = set(matched_codfap_names)
    if len(matched_codfap_names) == 0:
        pitch_md.append(f'Aucun des métiers recherchés ne figurent dans le Top 10 des métiers à pourvoir sur cette zone.\n')
    if len(matched_codfap_names) == 1:
        pitch_md.append(f'- La famille de métiers ** {matched_codfap_names[0]} ** est rechechée \n')
    elif len(matched_codfap_names) >= 1:
        list_jobs = ''
        for job in matched_codfap_names:
            list_jobs += job+', '
        pitch_md.append(f'- Les familles de métiers **{list_jobs[:-2]}** sont rechechées \n')
        
    return pitch_md

SyntaxError: f-string: unmatched '[' (1287997909.py, line 3)

In [15]:
# THIS SHOULD BE THE END OF JUPYTER NOTEBOOK EXPORT

## 8. Export to Python file for streamlit

In [16]:
%save -f -r ../streamlit/odisscoring.py 1-15
# This saves the cells 0 to 22 (and their execution history unfortunately) to a python file that I can use in Streamlit
# Make sure to restart before running this cell
# Don't forget to restart streamlit after this

The following commands were written to file `../streamlit/odisscoring.py`:
# THIS SHOULD BE THE END OF JUPYTER NOTEBOOK EXPORT
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from scipy import stats
import folium as flm #required for gdf.explore()
import shapely as shp
from shapely.wkt import loads
from shapely.geometry import Polygon
from sklearn import preprocessing
def init_loading_datasets(odis_file, scores_cat_file, metiers_file, formations_file, ecoles_file):
    odis = gpd.GeoDataFrame(gpd.read_parquet(odis_file))
    odis.set_geometry(odis.polygon, inplace=True)
    odis = odis[~odis.polygon.isna()]

    # Index of all scores and their explanations
    scores_cat = pd.read_csv(scores_cat_file)

    #Later we need the code FAP <-> FAP Name used to classify jobs
    codfap_index = pd.read_csv(metiers_file, delimiter=';')

    # Later we need the code formation <-> Formation Name used to classify trainings
    # source: https://www.da

In [60]:
# Restart and run all the cells above this one

## Notebook explorations

In [None]:
# Init
#init_libraries()

ODIS_FILE = '../csv/odis_april_2025_jacques.parquet'
SCORES_CAT_FILE = '../csv/odis_scores_cat.csv'
METIERS_FILE = '../csv/dares_nomenclature_fap2021.csv'
FORMATIONS_FILE = '../csv/index_formations.csv'
ECOLES_FILE = '../csv/annuaire_ecoles_france_mini.parquet'

odis, scores_cat, codfap_index, codformations_index, annuaire_ecoles = init_loading_datasets(
    odis_file=ODIS_FILE,
    scores_cat_file=SCORES_CAT_FILE,
    metiers_file=METIERS_FILE,
    formations_file=FORMATIONS_FILE,
    ecoles_file=ECOLES_FILE
    )


In [None]:
# Subject preferences weighted score computation
prefs = {
    'emploi':2,
    'logement':1,
    'education':1,
    'soutien':1,
    'mobilité':0,
    'commune_actuelle':'33281',
    'loc_distance_km':500,
    'codes_metiers':{
        'codes_metiers_adulte1':['S1X40','J0X33','A1X41'],
        'codes_metiers_adulte2':['T4X60','T2A60']
    },
    'codes_formations':{
        'codes_formations_adulte1':[423],
        'codes_formations_adulte2':[315,100]
    },
    'age_enfants':{
        'age_enfant1':4,
        'age_enfant2':10,
        'age_enfant3':None,
        'age_enfant4':None,
        'age_enfant5':None
    },
    'binome_penalty':0.1
}

In [19]:
# Step by Step Execution
from time import time

def performance_tracker(t, text, timer_mode):
    if timer_mode:
        print(str(round(time()-t,2))+'|'+text)
        return time()
t = time()
timer_mode = True

df = odis
score_cat = scores_cat
prefs = prefs
#
df = add_distance_to_current_loc(df, current_codgeo=prefs['commune_actuelle'])
t = performance_tracker(t, 'Add Distance End', timer_mode)

# We filter by distance to reduce the compute cost on a smaller odis_search dataframe
odis_search = filter_loc_by_distance(df, distance=prefs['loc_distance_km'])
t = performance_tracker(t, 'Filter Loc by Distance', timer_mode)

# We compute the subject specific scores
odis_scored = compute_criteria_scores(odis_search, prefs=prefs)
t = performance_tracker(t, 'Compute Subject Score End', timer_mode)

# We add the criteria scores for all neighbor communes forming monomes and binomes
odis_exploded = adding_score_voisins(odis_scored, scores_cat)
t = performance_tracker(t, 'Adding Score Voisin', timer_mode)

# We compute the category scores for both the target and the binome
odis_exploded = compute_cat_scores(odis_exploded, scores_cat=scores_cat, penalty=prefs['binome_penalty'])
t = performance_tracker(t, 'Compute Cat Score End', timer_mode)

# We computing the final weighted score for all commune<->voisin combinations
odis_exploded['weighted_score'] = compute_binome_score(odis_exploded, prefs=prefs)
t = performance_tracker(t, 'Compute Weighted Score End', timer_mode)

# We keep best monome or binome for each commune 
odis_search_best = best_score_compute(odis_exploded)
t = performance_tracker(t, 'Compute Best Score End', timer_mode)


1.6|Add Distance End
0.0|Filter Loc by Distance
0.11|Compute Subject Score End
0.06|Adding Score Voisin




0.1|Compute Cat Score End
0.02|Compute Weighted Score End
0.01|Compute Best Score End


In [260]:
odis_search_best[~odis_search_best.binome].head()


Unnamed: 0,codgeo,libgeo,typecom,reg_code,reg_nom,dep_code,dep_nom,epci_code,epci_nom,niveau_equipements_services,...,risque_fermeture_ratio_binome,met_match_adult1_binome,met_match_adult2_binome,binome,logement_cat_score,mobilité_cat_score,education_cat_score,soutien_cat_score,emploi_cat_score,weighted_score
166,16057,Bouteville,COM,75,Nouvelle-Aquitaine,16,Charente,200070514,CA du Grand Cognac,0.0,...,,,,False,98.453762,4.9855,100.0,44.069069,41.449783,65.084479
6349,33396,Saint-Étienne-de-Lisse,COM,75,Nouvelle-Aquitaine,33,Gironde,200035533,CC du Grand Saint-Émilionnais,0.0,...,,,,False,95.595596,30.418,100.0,0.0,59.834835,63.053053
5885,33328,Pomerol,COM,75,Nouvelle-Aquitaine,33,Gironde,200070092,CA du Libournais,0.0,...,,,,False,56.561564,33.9945,78.678679,37.912913,59.834835,58.564565
7800,40215,Ousse-Suzan,COM,75,Nouvelle-Aquitaine,40,Landes,244000691,CC du Pays Morcenais,0.0,...,,,,False,60.518136,3.974,100.0,39.83984,46.062729,58.496687
1410,17152,Épargnes,COM,75,Nouvelle-Aquitaine,17,Charente-Maritime,241700640,CA Royan Atlantique,0.0,...,,,,False,73.093162,12.8415,78.678679,46.121121,45.328662,57.710057


In [222]:
odis_exploded['weighted_score'] = compute_binome_score(odis_exploded, prefs=prefs)
odis_exploded['weighted_score']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
257   NaN
258   NaN
259   NaN
260   NaN
261   NaN
Name: weighted_score, Length: 262, dtype: float64

In [217]:
odis_exploded

Unnamed: 0,codgeo,libgeo,typecom,reg_code,reg_nom,dep_code,dep_nom,epci_code,epci_nom,niveau_equipements_services,...,log_5p_ratio_binome,risque_fermeture_ratio_binome,met_match_adult1_binome,met_match_adult2_binome,binome,emploi_cat_score,logement_cat_score,mobilité_cat_score,education_cat_score,soutien_cat_score
0,33013,Artigues-près-Bordeaux,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,2.0,...,0.541127,3.000000,1,2,True,59.857774,22.045486,56.795,,51.443415
1,33013,Artigues-près-Bordeaux,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,2.0,...,0.488789,3.000000,1,2,True,59.857774,13.600667,56.795,,52.711918
2,33013,Artigues-près-Bordeaux,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,2.0,...,0.237933,2.571429,1,2,True,59.396063,17.342812,56.795,,55.589616
3,33013,Artigues-près-Bordeaux,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,2.0,...,0.162542,2.888889,1,2,True,59.501168,15.070191,56.795,,57.916730
4,33013,Artigues-près-Bordeaux,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,2.0,...,0.154327,3.000000,1,2,True,59.857774,10.901214,56.795,,58.096502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,33550,Villenave-d'Ornon,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,3.0,...,0.377520,3.000000,1,2,True,58.845000,10.099740,82.405,,53.894568
258,33550,Villenave-d'Ornon,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,3.0,...,0.512268,2.400000,1,2,True,58.278183,15.418924,82.405,,54.882438
259,33550,Villenave-d'Ornon,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,3.0,...,0.355415,2.636364,1,2,True,58.400351,8.780973,82.405,,59.439321
260,33550,Villenave-d'Ornon,COM,75,Nouvelle-Aquitaine,33,Gironde,243300316,Bordeaux Métropole,3.0,...,0.187113,2.764706,1,2,True,58.456597,10.379816,82.405,,59.184391


In [None]:

penalty=0.1
df = odis_exploded.copy()
df_binome = pd.DataFrame()
columns_in_use = set(df.columns) & set(scores_cat.score)
columns_in_use_binome = set(df.columns) & set([score+'_binome' for score in scores_cat.score])
for cat in set(scores_cat.cat):
    cat_scores_indices = [score for score in scores_cat[scores_cat['cat'] == cat]['score'] if score in columns_in_use]
    cat_scores_indices_binome = [score+'_binome' for score in scores_cat[scores_cat['cat'] == cat]['score'] if score+'_binome' in columns_in_use_binome]

    # Efficiently select all relevant rows at once
    cat_scores_df = df[cat_scores_indices]
    for col in cat_scores_indices_binome:
        mask = df[col].notna()
        df_binome[col] = pd.to_numeric(df[col], errors='coerce')
        df_binome.loc[mask, col] = df.loc[mask, col] * (1-penalty) 
        cat_scores_df = pd.concat([cat_scores_df, df_binome[col]], axis=1)
    df[cat + '_cat_score'] = 100 * cat_scores_df.astype(float).mean(axis=1)


In [214]:
df.head(1).to_csv('../csv/test.csv')

In [None]:




# We provide the scores columns as a parameter to compute faster
#scores_col = [col for col in odis_exploded.columns if col.endswith('_cat_score')]

# We computing the final weighted score for all commune<->voisin combinations
# odis_exploded['weighted_score'] = compute_binome_score(
#     odis_exploded,
#     binome_penalty=prefs['binome_penalty'],
#     weights=prefs
#     )

# We keep best monome or binome for each commune 
odis_search_best = best_score_compute(odis_exploded)

In [25]:
# Showing results on an interactive map
cols_to_show = (
        ['codgeo','libgeo','weighted_score','binome','libgeo_binome','dist_current_loc','polygon']
        +[col for col in odis_search_best.columns if '_codes_' in col]
        +[col for col in odis_search_best.columns if '_cat_score' in col]
        )
#odis_search_best[cols_to_show].explore('weighted_score', popup=True)
#odis_search_best.plot('weighted_score')
#odis_search_best[cols_to_show].explore('weighted_score', tooltip=True)

In [62]:
scores_cat

Unnamed: 0,score,score_name,metric,unit,tooltip,cat,incl_binome,high_value_adj
0,met_scaled,Taux Besoin Emploi,met_ratio,besoin/1000 hab,Emplois non pourvus pour 1000 habitants (Sourc...,emploi,True,élevé
1,met_tension_scaled,Taux Besoin Emploi en Tension,met_tension_ratio,besoin/1000 hab,Emplois en tension non pourvus pour 1000 habit...,emploi,True,élevé
2,svc_incl_scaled,Taux Services Inclusion,svc_incl_ratio,services/1000 hab,Services d'inclusions pour 1000 habitants,soutien,True,élevé
3,log_vac_scaled,Taux Logements Vacants,log_vac_ratio,/Total Logements,,logement,True,élevé
4,log_5p_scaled,Taux Grandes Résidences Principales,log_5p_ratio,/Total Résidences Principales,,logement,True,élevé
5,classes_ferm_scaled,Taux Classes à Risque de Fermeture,risque_fermeture_ratio,/Total Ecoles,Ratio du nombre de classes à riques de fermetu...,emploi,True,élevé
6,pol_scaled,Couleur Politique de la commmue,pol_num,,Affiliation à un parti politique en faveur de ...,soutien,False,favorable
7,met_match_adult1_scaled,Match compétences et Besoin Emploi Adult 1,met_match_adult1,familles d'emplois,Nombre de familles d'emplois dans le top 10,emploi,True,présent
8,met_match_adult2_scaled,Match compétences et Besoin Emploi Adult 2,met_match_adult2,familles d'emplois,Nombre de familles d'emplois dans le top 10,emploi,True,présent
9,form_match_adult1_scaled,Match compétences et Centres de formation,form_match_adult1,sites de formations,Nombre centres de formations dans la commune,emploi,False,présent


## 9. Export to SuperSet

In [27]:
# def concatenate_strings(row):
#   return '{"type": "Feature","geometry":' + shp.to_geojson(row['polygon']) + '}'


# odis_search_best_export = gpd.GeoDataFrame(odis_search_best.copy())
# odis_search_best_export.set_geometry(odis_search_best_export.polygon, crs='EPSG:2154', inplace=True)
# odis_search_best_export.to_crs(epsg=4326, inplace=True)
# odis_search_best_export["polygon_as_json"] = odis_search_best_export.apply(concatenate_strings, axis=1)
# odis_search_best_export.drop(['polygon','polygon_binome'], axis=1, inplace=True)

# cols = ['met_match_codes','met_match_codes_binome','be_codfap_top','be_libfap_top','codgeo_voisins_binome','pitch']
# for col in cols:
#     odis_search_best_export[col] = odis_search_best_export[col].apply(lambda x: x.tolist() if type(x) == np.ndarray else x)

In [28]:
# from sqlalchemy import create_engine, text

# db_host = "localhost"  # Replace with the actual host (e.g., 'superset_db' if in the same Docker network, or 'localhost' if exposed)
# db_port = "5433"  # Replace with the actual port (usually 5432)
# db_user = "superset"  # Replace with the database user (often 'superset')
# db_password = "superset"  # Replace with the database password
# db_name = "examples"  # Replace with the database name (often 'superset')

# engine = create_engine(f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

In [29]:
# table_name = "odis_stream2_result"  # Choose a name for the table in PostgreSQL
# odis_search_best_export.to_sql(table_name, engine, if_exists='replace', schema='public', index=False)
# sql = text("GRANT SELECT ON odis_stream2_result TO examples")

# with engine.begin() as connection:
#     connection.execute(sql)

# print(f"DataFrame successfully written to table '{table_name}' in the Superset database.")

Note to myself:
Après avoir importé les données dans Postgres il faut donner les droits au user 'examples' sur la table
> docker exec -it superset_db psql -h superset_db -p 5432 -U superset -d examples

> GRANT SELECT ON odis_stream2_result TO examples;

> GRANT USAGE ON SCHEMA public TO examples;
