In [88]:
import pandas as pd
import glob
import csv
import numpy as np
import os
from collections import Counter
import math

def valeur_plus_frequente_STR(liste):
    # Filtrer les valeurs qui sont des chaînes de caractères et différentes de None
    valeurs_str = [valeur for valeur in liste if (isinstance(valeur, str) or (isinstance(valeur, float) and not math.isnan(valeur))) and valeur is not None and valeur != "nan" and valeur != 0.0]
    # Vérifier si la liste filtrée est vide
    if not valeurs_str:
        return None
    # Utiliser Counter pour compter le nombre d'occurrences de chaque élément
    compteur = Counter(valeurs_str)
    # Utiliser max() avec une fonction lambda pour obtenir l'élément avec le plus grand nombre d'occurrences
    valeur_plus_frequente = max(compteur, key=compteur.get)
    return valeur_plus_frequente

# Définir une fonction pour nettoyer chaque valeur de la colonne
def clean_string(encoded_string):
    if isinstance(encoded_string, str):  # Vérifiez si la valeur est une chaîne de caractères
        # Décoder la chaîne depuis l'UTF-8 avec 'replace' pour gérer les caractères non décodables
        decoded_string = encoded_string.encode('latin1').decode('utf-8', errors='replace')
        # Réencoder la chaîne en latin1
        cleaned_string = decoded_string.encode('latin1')
    else:
        return encoded_string  # Retourner la valeur telle quelle si elle n'est pas une chaîne de caractères

def detect_csv_separator(file_path):
    with open(file_path, 'r', newline='') as csvfile:
        dialect = csv.Sniffer().sniff(csvfile.read(1024))  # Reads the first 1024 bytes to detect the delimiter
        return dialect.delimiter

In [96]:
#caracteristique
# Get a list of all CSV files in the directory

files = glob.glob('../Downloads/car*.csv')

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()
# Loop through each file and append its data to the combined DataFrame
for file in files:
    print(file)
    # Read the first row of the file to infer data types
    separator = detect_csv_separator(file)
    #dtypes = pd.read_csv(file, nrows=1,encoding='ansi').dtypes.to_dict()
    print(separator)
    # Read the entire CSV file using inferred data types
    df = pd.read_csv(file, encoding='ansi', sep=separator, quotechar='"')

    combined_data = pd.concat([combined_data, df], ignore_index=True)
    
# Now 'combined_data' contains data from all CSV files
combined_data['Num_Acc'] = combined_data['Num_Acc'].fillna(combined_data['Accident_Id'])
combined_data = combined_data.drop(columns=['Accident_Id'])

# Supprimer les colonnes "gps" et "dep" et "com"
combined_data.drop(columns=['gps', 'dep', 'com'], inplace=True)
column_types = df.dtypes
print(column_types)
float_columns = combined_data.select_dtypes(include=['float']).columns
print('float_columns '+float_columns)
combined_data[float_columns] = combined_data[float_columns].fillna(0)
combined_data[float_columns] = combined_data[float_columns].astype(np.int64)

string_columns = combined_data.select_dtypes(include=['object']).columns
print('string_columns '+string_columns)

adr_freq = str(valeur_plus_frequente_STR(combined_data['adr']))
print('valeur_plus_frequente adr '+ adr_freq)
combined_data['adr'].fillna(value=adr_freq, inplace=True)

lat_freq = str(valeur_plus_frequente_STR(combined_data['lat']))
print('valeur_plus_frequente lat '+ lat_freq)
combined_data['lat'].fillna(value=lat_freq, inplace=True)

long_freq = str(valeur_plus_frequente_STR(combined_data['long']))
print('valeur_plus_frequente long '+ long_freq)
combined_data['long'].fillna(value=long_freq, inplace=True)

combined_data.to_csv('../csv/caracteristiques.csv', sep='|', index=False)
print('end caracteristiques')

../Downloads\caracteristiques-2017.csv
,
../Downloads\caracteristiques-2018.csv
,
../Downloads\caracteristiques-2019.csv
;
../Downloads\caracteristiques-2020.csv
;
../Downloads\carcteristiques-2021.csv
;
../Downloads\carcteristiques-2022.csv
;
Accident_Id     int64
jour            int64
mois            int64
an              int64
hrmn           object
lum             int64
dep            object
com            object
agg             int64
int             int64
atm             int64
col             int64
adr            object
lat            object
long           object
dtype: object
Index(['float_columns Num_Acc', 'float_columns atm', 'float_columns col'], dtype='object')
Index(['string_columns hrmn', 'string_columns adr', 'string_columns lat',
       'string_columns long'],
      dtype='object')
valeur_plus_frequente adr AUTOROUTE A86
valeur_plus_frequente lat 4800000.0
valeur_plus_frequente long 228547.0
end caracteristiques


In [106]:

#usagers
# Get a list of all CSV files in the directory
files = glob.glob('../Downloads/usagers*.csv')

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()

# Loop through each file and append its data to the combined DataFrame
for file in files:
    print(file)
    # Read the first row of the file to infer data types
    separator = detect_csv_separator(file)
    #dtypes = pd.read_csv(file, nrows=1).dtypes.to_dict()
    print(separator)
    # Read the entire CSV file using inferred data types
    df = pd.read_csv(file, encoding='ansi', sep=separator, quotechar='"')

    combined_data = pd.concat([combined_data, df], ignore_index=True)
    
# Now 'combined_data' contains data from all CSV files

float_columns = combined_data.select_dtypes(include=['float']).columns
print('float_columns '+float_columns)
combined_data[float_columns] = combined_data[float_columns].fillna(0)
combined_data[float_columns] = combined_data[float_columns].astype(np.int64)
combined_data['actp']
string_columns = combined_data.select_dtypes(include=['object']).columns
print('string_columns '+string_columns)

combined_data[string_columns] = combined_data[string_columns].fillna(value='na')

combined_data.to_csv('../csv/usagers.csv', sep='|', index=False)

print('end Usager')



../Downloads\usagers-2017.csv
,
../Downloads\usagers-2018.csv
,
../Downloads\usagers-2019.csv
;
../Downloads\usagers-2020.csv
;
../Downloads\usagers-2021.csv
;
../Downloads\usagers-2022.csv
;
Index(['float_columns place', 'float_columns trajet', 'float_columns secu',
       'float_columns locp', 'float_columns etatp', 'float_columns an_nais',
       'float_columns secu1', 'float_columns secu2', 'float_columns secu3'],
      dtype='object')
Index(['string_columns actp', 'string_columns num_veh',
       'string_columns id_vehicule', 'string_columns id_usager'],
      dtype='object')
end Usager


In [102]:
#vehicules
# Get a list of all CSV files in the directory
files = glob.glob('../Downloads/vehicules*.csv')

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()

# Loop through each file and append its data to the combined DataFrame
for file in files:
    print(file)
    # Read the first row of the file to infer data types
    separator = detect_csv_separator(file)
    #dtypes = pd.read_csv(file, nrows=1).dtypes.to_dict()
    print(separator)
    # Read the entire CSV file using inferred data types
    df = pd.read_csv(file, encoding='ansi', sep=separator, quotechar='"')
    combined_data = pd.concat([combined_data, df], ignore_index=True)

# Now 'combined_data' contains data from all CSV files
combined_data['num_veh'] = combined_data['num_veh'].fillna(combined_data['id_vehicule'])
combined_data = combined_data.drop(columns=['id_vehicule'])

float_columns = combined_data.select_dtypes(include=['float']).columns
combined_data[float_columns] = combined_data[float_columns].fillna(0)
combined_data[float_columns] = combined_data[float_columns].astype(np.int64)
string_columns = combined_data.select_dtypes(include=['object']).columns
combined_data[string_columns] = combined_data[string_columns].fillna(value='na')

combined_data.to_csv('../csv/vehicules.csv', sep='|',  index=False)
print('end vehicules')

../Downloads\vehicules-2017.csv
,
../Downloads\vehicules-2018.csv
,
../Downloads\vehicules-2019.csv
;
../Downloads\vehicules-2020.csv
;
../Downloads\vehicules-2021.csv
;
../Downloads\vehicules-2022.csv
;
end vehicules


In [103]:

#lieux
# Get a list of all CSV files in the directory
files = glob.glob('../Downloads/lieux*.csv')

# Initialize an empty DataFrame to store the combined data
combined_data = pd.DataFrame()

# Loop through each file and append its data to the combined DataFrame
for file in files:
    print(file)
    # Read the first row of the file to infer data types
    separator = detect_csv_separator(file)
    #dtypes = pd.read_csv(file, nrows=1).dtypes.to_dict()
    print(separator)
    # Read the entire CSV file using inferred data types
    df = pd.read_csv(file, encoding='ansi', sep=separator, quotechar='"',low_memory=False)
    df['voie'] = df['voie'].astype(str)
    # Fill missing values with a specific value, e.g., 'Unknown'
    df['voie'].fillna('Unknown', inplace=True)
    
    combined_data = pd.concat([combined_data, df], ignore_index=True)

# Now 'combined_data' contains data from all CSV files
# Supprimer les colonnes "v1" et "v2" et "env1"
combined_data.drop(columns=['v1', 'v2', 'env1', 'pr' , 'pr1', 'nbv'], inplace=True)

float_columns = combined_data.select_dtypes(include=['float']).columns
print(float_columns)
combined_data[float_columns] = combined_data[float_columns].fillna(0)
combined_data[float_columns] = combined_data[float_columns].astype(np.int64)
string_columns = combined_data.select_dtypes(include=['object']).columns
print(string_columns)

voie_freq = str(valeur_plus_frequente_STR(combined_data['voie']))
print('valeur_plus_frequente voie '+ voie_freq)
combined_data['voie'].fillna(value=voie_freq, inplace=True)

lartpc_freq = str(valeur_plus_frequente_STR(combined_data['lartpc']))
print('valeur_plus_frequente lartpc '+ lartpc_freq)
combined_data['lartpc'].fillna(value=lartpc_freq, inplace=True)

larrout_freq = str(valeur_plus_frequente_STR(combined_data['larrout']))
print('valeur_plus_frequente larrout '+ larrout_freq)
combined_data['larrout'].fillna(value=larrout_freq, inplace=True)

combined_data.to_csv('../csv/lieux.csv', sep='|',  index=False)
print('end lieux')

../Downloads\lieux-2017.csv
,
../Downloads\lieux-2018.csv
,
../Downloads\lieux-2019.csv
;
../Downloads\lieux-2020.csv
;
../Downloads\lieux-2021.csv
;
../Downloads\lieux-2022.csv
;
Index(['circ', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'vma'], dtype='object')
Index(['voie', 'lartpc', 'larrout'], dtype='object')
valeur_plus_frequente voie 0
valeur_plus_frequente lartpc 15.0
valeur_plus_frequente larrout  -1
end lieux
