# Imports

In [1]:
import pandas as pd
import reverse_geocoder as rg
import io
import datetime

# Traitement des données

In [2]:
france: pd.DataFrame = pd.read_csv("datas/caracteristics.csv", dtype={"long": str})

france.dropna(subset=["lat", "long"], inplace=True)

# Restrict our datas to metropolitan France
france = france[france.lat != 0]
france = france[france.long != 0]
france = france[france.long != "-"]
france = france[france.gps == "M"]

# Convert the coordinates into readable ones
france.long = pd.to_numeric(france.long)
france.long = france.long / 100000
france.lat = france.lat / 100000

france.an += 2000

# Create a tuple with the coordinates lat | long => (lat, long)
france["coordinates"] = france[["lat", "long"]].apply(tuple, axis=1)

def extract_time(x):
    """
    Convert the string date from the caracteristics dataset into a datetime instance
    """
    year = x[0]
    month = x[1]
    day = x[2]
    time = str(x[3])
    time = time.zfill(4)

    hour = str(time)[:1]
    minutes = str(time)[2:]

    return datetime.datetime(year, month, day, int(hour), int(minutes))
france["datetime"] = france[["an", "mois", "jour", "hrmn"]].apply(extract_time, axis=1)

def parse_lighting_conditions(x):
    """
    Convert the int lighting conditions from the caracteristics dataset into a nominal variable
    """
    if x == 1:
        return "Full day"
    elif x == 2:
        return "Twilight or dawn"
    elif x == 3:
        return "Night without public lighting"
    elif x == 4:
        return "Night with public lighting not lit"
    elif x == 5:
        return "Night with public lighting on"
france["lum_str"] = france.lum.apply(parse_lighting_conditions)

# Sort the dataframe by dates
france = france.sort_values(["datetime"])
france.reset_index(inplace=True)

france

Unnamed: 0,index,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,com,adr,gps,lat,long,dep,coordinates,datetime,lum_str
0,753526,200500000568,2005,1,1,700,3,1,9,1.0,1.0,451.0,,M,43.26700,-0.07300,640,"(43.267, -0.073)",2005-01-01 00:00:00,Night without public lighting
1,754594,200500001636,2005,1,1,200,3,1,1,1.0,6.0,211.0,,M,49.09000,2.10700,950,"(49.09, 2.107)",2005-01-01 00:00:00,Night without public lighting
2,754479,200500001521,2005,1,1,200,5,1,9,1.0,6.0,77.0,,M,43.23600,2.67600,110,"(43.236, 2.676)",2005-01-01 00:00:00,Night with public lighting on
3,753652,200500000694,2005,1,1,500,5,2,1,1.0,6.0,128.0,CD28,M,43.30000,1.46000,310,"(43.3, 1.46)",2005-01-01 00:00:00,Night with public lighting on
4,753799,200500000841,2005,1,1,600,3,1,1,1.0,7.0,268.0,,M,44.31600,4.55800,70,"(44.316, 4.558)",2005-01-01 00:00:00,Night without public lighting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240356,16212,201600016213,2016,12,31,1850,4,2,1,1.0,3.0,55.0,"198, RABATAU (BOULEVARD)",M,43.28224,5.39944,130,"(43.28224, 5.39944)",2016-12-31 01:50:00,Night with public lighting not lit
240357,45247,201600045248,2016,12,31,2000,4,1,1,1.0,2.0,22.0,A86 EXT,M,48.77640,2.42026,940,"(48.7764, 2.42026)",2016-12-31 02:00:00,Night with public lighting not lit
240358,52732,201600052733,2016,12,31,2110,5,2,1,1.0,6.0,110.0,"53, RUE LOUIS BLANC",M,48.88310,2.36138,750,"(48.8831, 2.36138)",2016-12-31 02:10:00,Night with public lighting on
240359,345,201600000346,2016,12,31,2030,3,1,1,5.0,6.0,547.0,D91,M,49.99964,2.59458,800,"(49.99964, 2.59458)",2016-12-31 02:30:00,Night without public lighting


# Validation de la position des accidents

In [3]:
# Verify that all positions are in the France territory
france_informations = pd.read_csv("datas/rg_france.csv")
france_informations[["lat", "lon", "name", "admin1", "admin2", "cc"]].to_csv("datas/rg_france_min.csv", index=False)
france_informations

Unnamed: 0,geonameid,name,asciiname,alternatenames,lat,lon,feature class,feature code,cc,cc2,admin1,admin2,admin3,admin4,population,elevation,dem,timezone,modification date
0,2659086,Col de Recon,Col de Recon,Rapenaz Col de;Recon Col de,46.30352,6.82838,T,PASS,FR,CH,84,74,744,74058,0,,1733,Europe/Paris,2019-02-15
1,2659815,Lucelle,Lucelle,La Lucelle Riviere;La Lucelle Rivière;Lucelle;...,47.41667,7.50000,H,STM,FR,,00,,,,0,,353,Europe/Paris,2014-08-05
2,2659933,Les Cornettes de Bise,Les Cornettes de Bise,Cornettes de Bise;Les Cornettes de Bise,46.33263,6.78458,T,PK,FR,CH,84,74,744,74058,0,2432.0,2355,Europe/Paris,2019-02-16
3,2659943,Lertzbach,Lertzbach,Le Lertzbach Ruisseau;Lertzbach;Ruisseau le Le...,47.60479,7.54665,H,STM,FR,CH,00,,,,0,,242,Europe/Paris,2018-11-06
4,2659973,Le Cheval Blanc,Le Cheval Blanc,Le Cheval Blanc,46.05193,6.87276,T,MT,FR,CH,84,74,742,74273,0,2831.0,2814,Europe/Paris,2019-03-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167871,12358519,Église de La Trinité,Eglise de La Trinite,,47.79111,1.06834,S,RLG,FR,,24,41,412,41269,0,,109,Europe/Paris,2021-10-15
167872,12358520,Prieuré Saint-Gilles-du-Verger,Prieure Saint-Gilles-du-Verger,,47.46580,-0.55480,S,RLG,FR,,52,49,491,49007,0,,41,Europe/Paris,2021-10-15
167873,12358521,Église Saint-Pierre,Eglise Saint-Pierre,,47.21927,-0.72674,S,RLG,FR,,52,49,492,49092,0,,88,Europe/Paris,2021-10-15
167874,12358522,Abbaye de Saint-Florent-lès-Saumur,Abbaye de Saint-Florent-les-Saumur,,47.26541,-0.10271,S,RLG,FR,,52,49,493,49328,0,,28,Europe/Paris,2021-10-15


## Quelles sont les lignes du dataset n'étant pas sur le terrain français
L'output de cette opération renvoi un dataframe vide, cela signifie que tous nos points, même ceux qui ont des coordonnées invalides (accidents s'étant passés dans la mer/océan) sont considérés comme s'étant produit en France. Après avoir utilisé plusieurs méthodes afin de retirer ces points, il n’est manifestement pas possible de le faire simplement. On a alors laissé ce code de validation dans le notebook afin d'avoir une certaine validation des points. Cependant ces accidents ne représentent qu'un minuscule pourcentage du total, il n'est donc pas réellement dérangeant de les considérés pour nos statistiques, car ils n'auront pratiquement aucun impact.

In [4]:
# Retreive all the rows that are not in France

geo = rg.RGeocoder(
    mode=2,
    verbose=True,
    stream=io.StringIO(open("datas/rg_france_min.csv", encoding="utf-8").read())
)

countries = pd.DataFrame(geo.query(france.coordinates.to_list()))
countries.drop(["lat", "lon"], axis=1, inplace=True)

france = pd.concat([france, countries], axis=1)
france[france.cc != "FR"]

Unnamed: 0,index,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,...,lat,long,dep,coordinates,datetime,lum_str,name,admin1,admin2,cc


# Sauvegarde des caractéristiques post-process

In [5]:
france.to_csv("datas/caracteristics_complete.csv")