In [2]:
import pandas as pd

# Path to the CSV file
file_path = './MAP/data/parcelles/parcelles_distances_nearest.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows
df.head()

Unnamed: 0,id,nom_com,idu,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,


In [3]:
# Report how many rows the DataFrame holds
num_rows = len(df.index)
print(f"Nombre de lignes : {num_rows}")

Nombre de lignes : 50000


In [None]:
# Rename the 'idu' column to 'info_parc'.
# rename() returns a new frame that is bound back to `df`; re-running the cell
# is harmless because a mapping key that matches no column is ignored by default.
df = df.rename(columns={"idu": "info_parc"})

# Preview the result (this line must sit at column 0: the stray indentation it
# previously carried made the whole cell fail with an IndentationError).
df.head()

Unnamed: 0,id,nom_com,info_parc,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,


In [5]:
# Count the missing values in each column and print the per-column totals
missing_values = df.isna().sum()
print(missing_values)

id                                0
nom_com                           0
info_parc                         0
contenance                        0
min_distance_meters_enedis        0
centroid_x                        0
centroid_y                        0
min_distance_meters_route         0
nearest_route_id                  0
highway                           0
name                          24410
dtype: int64


In [6]:
# Scoring function for the Enedis distance
def score_distance(distance_ENEDIS):
    """Map an Enedis distance to a score between 0 (far) and 5 (close).

    The brackets are contiguous half-open intervals, so every non-negative
    value receives a score (no gap between e.g. 999.999999 and 1000):

        [0, 100)    -> 5
        [100, 300)  -> 4
        [300, 500)  -> 3
        [500, 750)  -> 2
        [750, 1000) -> 1
        >= 1000     -> 0

    Args:
        distance_ENEDIS: Distance value (presumably meters, going by the
            column name it is applied to).

    Returns:
        The integer score, or None when the value is negative or NaN
        (every comparison below is False for NaN).
    """
    if distance_ENEDIS >= 1000:
        return 0
    if distance_ENEDIS >= 750:
        return 1
    if distance_ENEDIS >= 500:
        return 2
    if distance_ENEDIS >= 300:
        return 3
    if distance_ENEDIS >= 100:
        return 4
    if distance_ENEDIS >= 0:
        return 5
    return None  # Negative or NaN input: outside the scored range

# Score every value of 'min_distance_meters_enedis' with score_distance
enedis_scores = df['min_distance_meters_enedis'].apply(score_distance)
df['score_min_distance'] = enedis_scores

# Preview the DataFrame with the new score column
df.head()

Unnamed: 0,id,nom_com,info_parc,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name,score_min_distance
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,,5
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet,5
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,,5
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai,5
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,,5


In [7]:
# Select the rows whose 'score_min_distance' is NaN, keeping only the distance
# and its score
enedis_nan_mask = df['score_min_distance'].isna()
nan_rows = df.loc[enedis_nan_mask, ['min_distance_meters_enedis', 'score_min_distance']]

# Print the selected rows
print(nan_rows)

Empty DataFrame
Columns: [min_distance_meters_enedis, score_min_distance]
Index: []


In [8]:
# Scoring function for 'min_distance_meters_route'
def score_distance_route(distance_route):
    """Map a route distance to a score between 0 (far) and 5 (close).

    Uses the same contiguous half-open brackets as the Enedis scoring:

        [0, 100)    -> 5
        [100, 300)  -> 4
        [300, 500)  -> 3
        [500, 750)  -> 2
        [750, 1000) -> 1
        >= 1000     -> 0

    The score-2 bracket starts at 500. It previously started at 400, which
    overlapped the 300-499.999999 bracket and, being tested first, gave
    distances in [400, 500) a score of 2 instead of 3.

    Args:
        distance_route: Distance value (presumably meters, going by the
            column name it is applied to).

    Returns:
        The integer score, or None when the value is negative or NaN
        (every comparison below is False for NaN).
    """
    if distance_route >= 1000:
        return 0
    if distance_route >= 750:
        return 1
    if distance_route >= 500:
        return 2
    if distance_route >= 300:
        return 3
    if distance_route >= 100:
        return 4
    if distance_route >= 0:
        return 5
    return None  # Negative or NaN input: outside the scored range

# Score every value of 'min_distance_meters_route' with score_distance_route
route_scores = df['min_distance_meters_route'].apply(score_distance_route)
df['score_min_distance_route'] = route_scores

In [9]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,id,nom_com,info_parc,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name,score_min_distance,score_min_distance_route
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,,5,5
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet,5,5
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,,5,5
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai,5,5
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,,5,5


In [10]:
# Count the NaN values in the 'score_min_distance_route' column
route_nan_mask = df['score_min_distance_route'].isna()
nan_count_route = route_nan_mask.sum()
print(f"Nombre de valeurs NaN dans la colonne 'score_min_distance_route' : {nan_count_route}")

Nombre de valeurs NaN dans la colonne 'score_min_distance_route' : 0


In [11]:
# Scoring function for the 'contenance' value
def score_contenance(contenance):
    """Map a 'contenance' value to a score between 0 and 5.

    Values inside [4000, 6000] score higher the larger they are; anything
    below 4000 or above 6000 scores 0:

        < 4000 or > 6000 -> 0
        [4000, 4300]     -> 1
        (4300, 4700]     -> 2
        (4700, 5200]     -> 3
        (5200, 5600]     -> 4
        (5600, 6000]     -> 5

    The brackets are contiguous, so fractional values such as 3999.5 or
    4300.5 get a score instead of falling between two integer bounds.
    Integer inputs score exactly as before.

    Args:
        contenance: Numeric value to score.

    Returns:
        The integer score, or None when the value is NaN (every comparison
        below is False for NaN).
    """
    if contenance < 4000 or contenance > 6000:
        return 0
    if contenance <= 4300:
        return 1
    if contenance <= 4700:
        return 2
    if contenance <= 5200:
        return 3
    if contenance <= 5600:
        return 4
    if contenance <= 6000:
        return 5
    return None  # Only reachable for NaN

# Score every value of 'contenance' with score_contenance
contenance_scores = df['contenance'].apply(score_contenance)
df['score_contenance'] = contenance_scores

In [12]:
# Count the NaN values in the 'score_contenance' column
contenance_nan_mask = df['score_contenance'].isna()
nan_count_contenance = contenance_nan_mask.sum()
print(f"Nombre de valeurs NaN dans la colonne 'score_contenance' : {nan_count_contenance}")

# Show a few example rows whose 'score_contenance' is NaN
nan_examples_contenance = df.loc[contenance_nan_mask, ['contenance', 'score_contenance']]
print(nan_examples_contenance.head())

Nombre de valeurs NaN dans la colonne 'score_contenance' : 0
Empty DataFrame
Columns: [contenance, score_contenance]
Index: []


In [13]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,id,nom_com,info_parc,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name,score_min_distance,score_min_distance_route,score_contenance
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,,5,5,4
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet,5,5,0
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,,5,5,0
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai,5,5,4
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,,5,5,0


In [14]:
import os

# Print the column names so they can be checked by eye
print("Colonnes disponibles dans le dataset :")
print(df.columns)

# Keep only the score columns that are actually present in the DataFrame
score_columns = ['score_min_distance', 'score_min_distance_route', 'score_contenance']
available_score_columns = []
for col in score_columns:
    if col in df.columns:
        available_score_columns.append(col)

# Report any expected score column that is absent
if len(available_score_columns) != len(score_columns):
    missing_columns = set(score_columns) - set(available_score_columns)
    print(f"Les colonnes suivantes sont manquantes dans le dataset : {missing_columns}")

if not available_score_columns:
    print("Aucune des colonnes de score spécifiées n'est disponible dans le dataset.")
else:
    # Final score = row-wise mean of the available score columns
    row_means = df[available_score_columns].mean(axis=1)
    df['score_final'] = row_means
    # Make sure the target directory exists before writing
    target_dir = os.path.dirname(file_path)
    os.makedirs(target_dir, exist_ok=True)
    # NOTE(review): file_path is the same path the data was loaded from, so
    # this overwrites the input CSV with the renamed and scored columns.
    df.to_csv(file_path, index=False)
    print('Scores recalculés et fichier mis à jour.')

Colonnes disponibles dans le dataset :
Index(['id', 'nom_com', 'info_parc', 'contenance',
       'min_distance_meters_enedis', 'centroid_x', 'centroid_y',
       'min_distance_meters_route', 'nearest_route_id', 'highway', 'name',
       'score_min_distance', 'score_min_distance_route', 'score_contenance'],
      dtype='object')
Scores recalculés et fichier mis à jour.


In [15]:
# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,id,nom_com,info_parc,contenance,min_distance_meters_enedis,centroid_x,centroid_y,min_distance_meters_route,nearest_route_id,highway,name,score_min_distance,score_min_distance_route,score_contenance,score_final
0,parcelle.40079477,Marseille,132169100C0180,5567,16.771689,43.350899,5.349609,0.0,445307145,service,,5,5,4,4.666667
1,parcelle.40083492,Marseille,132058190A0110,10588,0.0,43.292014,5.399027,0.0,260500778,service,Allée Fraissinet,5,5,0,3.333333
2,parcelle.40083501,Marseille,132098460C0196,18520,0.0,43.236016,5.412616,0.0,1193176178,service,,5,5,0,3.333333
3,parcelle.40083533,Marseille,13212877AE0121,5596,0.0,43.306014,5.441691,0.0,27773399,residential,Traverse de Courtrai,5,5,4,4.666667
4,parcelle.40083539,Marseille,132118700H0156,96129,0.0,43.28313,5.459582,0.0,157678655,service,,5,5,0,3.333333


In [16]:
# Path of the JSON file: same location and stem as the CSV, '.json' extension.
# Only the trailing extension is swapped; a plain str.replace('.csv', '.json')
# would also rewrite any '.csv' appearing earlier in the path, and would leave
# the path unchanged (so the JSON would overwrite the CSV) if the suffix were
# spelled differently.
json_file_path = os.path.splitext(file_path)[0] + '.json'

# Write the DataFrame as JSON, one record per line
df.to_json(json_file_path, orient='records', lines=True)

print(f"Fichier JSON enregistré à : {json_file_path}")

Fichier JSON enregistré à : ./MAP/data/parcelles/parcelles_distances_nearest.json


In [17]:
# Count the rows whose score_final is strictly greater than 3
above_3_mask = df['score_final'] > 3
count_above_3 = int(above_3_mask.sum())
print(f"Nombre de lignes avec un score_final supérieur à 3 : {count_above_3}")

Nombre de lignes avec un score_final supérieur à 3 : 34997
