In [4]:
import os
import pandas as pd
import requests
import json
from pathlib import Path
from datetime import datetime
import time
import chardet

In [5]:
# Project root: $WORKING_DIR when it is set and non-empty, otherwise the
# absolute path of the current working directory.
current_path = Path(os.getenv('WORKING_DIR') or Path().absolute())
current_path

PosixPath('/Users/geoffreyaldebert/etalab/projects/analysis/irve-analysis')

In [6]:
# Dataset ids that must be left out of the inventory.
own_datasets = ['54231d4a88ee38334b5b9e1d', '5448d3e0c751df01f85d0572']

# Columns of the inventory frame; used to give an empty result a usable schema.
_RESOURCE_COLUMNS = [
    'dataset_id', 'dataset_slug', 'dataset_page',
    'resource_id', 'resource_title', 'resource_last_modified',
    'organization_or_owner', 'error_type',
]


def _describe_resource(dataset, res):
    """Build the inventory record (a dict) for one resource of one dataset."""
    # URLs containing 'format=csv' do not end with a file name: the third
    # segment from the end is used as base name and '.csv' is appended.
    if 'format=csv' in res['url']:
        filename = res['url'].split('/')[-3] + '.csv'
    else:
        filename = res['url'].split('/')[-1]
    ext = filename.split('.')[-1]

    obj = {
        'dataset_id': dataset['id'],
        'dataset_slug': dataset['slug'],
        'dataset_page': dataset['page'],
        'resource_id': res['id'],
        'resource_title': res['title'],
        'resource_last_modified': res['last_modified'],
    }
    if ext != 'csv':
        obj['error_type'] = "wrong-file-format"
    elif not dataset['organization'] and not dataset['owner']:
        obj['error_type'] = "orphan-dataset"
    else:
        publisher = dataset['organization'] or dataset['owner']
        obj['organization_or_owner'] = publisher['slug']
        obj['error_type'] = None
    return obj


def parse_api(url):
    """List every resource of the datasets returned by a paginated API query.

    Parameters
    ----------
    url : str
        Listing URL. The response JSON must expose 'total', 'page_size' and
        'data' (a list of datasets, each with a 'resources' list).

    Returns
    -------
    pandas.DataFrame
        One row per resource with the dataset/resource identifiers and an
        'error_type' column: "wrong-file-format", "orphan-dataset" or None.
        Datasets whose id is in `own_datasets` are skipped. When nothing is
        found, an empty frame carrying the expected columns is returned.

    Raises
    ------
    requests.HTTPError
        If a page request returns an HTTP error status.
    """
    def fetch_page(page):
        # `params` lets requests merge the page number into the URL whether or
        # not it already has a query string.
        r = requests.get(url, params={'page': page}, timeout=60)
        r.raise_for_status()
        return r.json()

    data = fetch_page(1)
    # Ceiling division: no extra (empty) page when total is a multiple of page_size.
    nb_pages = (data['total'] + data['page_size'] - 1) // data['page_size']

    arr = []
    page = 1
    while True:
        for dataset in data['data']:
            if dataset['id'] in own_datasets:
                continue
            for res in dataset['resources']:
                arr.append(_describe_resource(dataset, res))
        page += 1
        if page > nb_pages:
            break
        data = fetch_page(page)

    if not arr:
        return pd.DataFrame(columns=_RESOURCE_COLUMNS)
    return pd.DataFrame(arr)

## Récupération via tags

In [7]:
# Dataset listing filtered on the "irve" tag.
url = "https://www.data.gouv.fr/api/1/datasets/?tag=irve"

In [8]:
# Resource inventory for the query currently stored in `url`.
df = parse_api(url)

## Récupération via metadata schema

In [9]:
# Dataset listing filtered on the declared schema "etalab/schema-irve".
url = "https://www.data.gouv.fr/api/1/datasets/?schema=etalab/schema-irve"

In [10]:
# Resource inventory for the query currently stored in `url`.
df2 = parse_api(url)

## Récupération via search

In [11]:
# Full-text search, URL-encoded form of "recharge véhicules électriques".
url = "https://www.data.gouv.fr/api/1/datasets/?q=recharge+v%C3%A9hicules+%C3%A9lectriques"
df3 = parse_api(url)
# Full-text search on "irve".
url = "https://www.data.gouv.fr/api/1/datasets/?q=irve"
df4 = parse_api(url)

## Concaténation et dédoublonnage

In [12]:
# Stack the four inventories and renumber the index.
# NOTE: `df` is both an input and the output, so re-running this cell alone
# stacks the already-combined frame again.
df = pd.concat([df, df2, df3, df4], ignore_index=True)

In [13]:
# Keep a single row per resource_id: the first occurrence in concatenation order.
df = df.drop_duplicates(subset=['resource_id'], keep='first')

In [14]:
# (rows, columns) of the deduplicated inventory.
df.shape

(491, 8)

In [15]:
# (rows, columns) of the rows that have no error_type set.
df[df['error_type'].isna()].shape

(291, 8)

## Téléchargement des ressources

In [19]:
# Download folder for this run: <current_path>/data/<today as YYYYMMDD>.
data_path = current_path / 'data' / datetime.now().strftime('%Y%m%d')
# parents/exist_ok: create intermediate folders and tolerate an existing one.
data_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Download every resource that has no error so far into
# <data_path>/<dataset_slug>/<resource_id>.csv.
total_resources = 0   # resources attempted
dl_resources = 0      # resources actually written to disk

downloaded = []

for index, row in df[df['error_type'].isna()].iterrows():
    total_resources += 1
    rurl = "https://www.data.gouv.fr/fr/datasets/r/" + row['resource_id']
    try:
        r = requests.get(rurl, allow_redirects=True, timeout=60)
        # Without this check an HTTP error page would be saved as if it were the CSV.
        r.raise_for_status()
    except requests.RequestException as exc:
        # Record the failure in the inventory so the resource is not reported
        # as error-free while having no local file.
        df.loc[index, 'error_type'] = 'download'
        print('❌ download failed [%s] %s (%s)' % (row['resource_title'], rurl, exc))
        continue

    p = Path(data_path) / row['dataset_slug']
    p.mkdir(parents=True, exist_ok=True)
    written_filename = '%s.csv' % (row['resource_id'])

    with open(p / written_filename, 'wb') as f:
        f.write(r.content)
    # Counted only once the content has been written.
    dl_resources += 1
    downloaded.append(row)
    print('✅ downloaded file [%s] %s' % (row['resource_title'], rurl))

## Analyse via Validata

In [20]:
# Flat list of every file found under data_path, sub-folders included.
listOfFiles = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(data_path)
    for name in names
]

In [21]:
# Number of files found under data_path.
len(listOfFiles)

292

In [22]:
def recalibrating_v1(path):
    """Rename known column-name variants to their v1 names and rewrite the CSV in place.

    The file is read with a sniffed separator and every value as a string,
    then always written back to `path` (comma-separated, no index column),
    whether or not a column was renamed.

    A variant is left untouched when the target name is already a column of
    the file (or was already claimed by another variant), so the rewritten
    file never gets two columns with the same header.

    Parameters
    ----------
    path : str or path-like
        CSV file to normalise; it is overwritten.

    Returns
    -------
    bool
        True if at least one column was renamed, False otherwise.
    """
    df = pd.read_csv(path,sep=None, engine='python',dtype=str)
    # v1 column name -> variants that get renamed to it
    columns_mapping = {
        'n_amenageur': ['nom_amenageur', 'n_amenageu'],
        'n_operateur': ['n_operateu','nom_operateur'],
        'n_enseigne': ['nom_enseigne'],
        'id_station': ['id'],
        'n_station': ['nom_station','station'],
        'ad_station': ['adresse_station'],
        'code_insee': ['insee'],
        'Xlongitude': ['longitude_wsg84'],
        'Ylatitude': ['latitude_wsg84'],
        'nbre_pdc': ['nbre_borne'],
        'id_pdc': ['n° borne'],
        'puiss_max': ['puissance_max','puissance_maximale'],
        'type_prise': ['type_connecteur','typ_charge'],
        'acces_recharge': ['modalité d\'accès à la borne','acces_rech'],
        'accessibilité': ['accessibilitã©','accessibilite','accessibilit�','accessibilit','accessibilit'],
        'observations': ['obs'],
        'date_maj': ['date_mise_a_jour']
    }
    # Reverse lookup (variant -> v1 name) so each column needs a single dict access.
    alias_to_target = {
        alias: target
        for target, aliases in columns_mapping.items()
        for alias in aliases
    }

    renames = {}
    taken = set(df.columns)
    for col in df.columns:
        target = alias_to_target.get(col)
        if target is None or target in taken:
            continue
        renames[col] = target
        taken.add(target)

    # One rename call instead of rebuilding the frame once per renamed column.
    df = df.rename(columns=renames)
    df.to_csv(path,index=False)
    return bool(renames)

In [23]:
# JSON schema of etalab/schema-irve, version 2.0.0.
schema_v2_url = 'https://schema.data.gouv.fr/schemas/etalab/schema-irve/2.0.0/schema.json'
# JSON schema of etalab/schema-irve, version 1.0.3.
schema_v1_url = 'https://schema.data.gouv.fr/schemas/etalab/schema-irve/1.0.3/schema.json'
# Validata endpoint the files are posted to for validation.
validata_api = 'https://api.validata.etalab.studio/validate'

In [24]:
def _is_valid(path, schema_url):
    """Post the file at `path` to Validata with `schema_url` and return the report's 'valid' flag."""
    # The context manager closes the handle once the upload is done.
    with open(path, 'rb') as fh:
        r = requests.post(validata_api, files={'file': fh}, data={'schema': schema_url})
    return r.json()['report']['valid']


# Validate every downloaded file: schema 2.0.0 first, then schema 1.0.3 after
# renaming known column variants. One record per file is appended to `arr`.
arr = []
cpt = 0
for f in listOfFiles:
    cpt += 1
    # The file name without its extension is the resource id.
    resource_id = Path(f).stem
    print("Processing "+str(cpt)+" : "+resource_id)
    obj = {'resource_id': resource_id, 'local_file': f}
    try:
        time.sleep(1)  # pause before each call to the API
        if _is_valid(f, schema_v2_url):
            obj['version'] = '2.0.0'
        else:
            recalibrating_v1(f)
            time.sleep(1)
            if _is_valid(f, schema_v1_url):
                obj['version'] = '1.0.3'
            else:
                obj['error_type'] = 'validation'
    except Exception as exc:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        # Any failure to read, post or parse the answer is logged as 'read'.
        obj = {'resource_id': resource_id, 'local_file': f, 'error_type': 'read'}
        print("  -> read error: %s: %s" % (type(exc).__name__, exc))
    arr.append(obj)


Processing 1 : 0d971ee6-7dff-497e-a950-0d2b0662b96c
Processing 2 : 198eaabe-5f9c-4f91-998a-b3c3cb615e52
Processing 3 : c8d32a3e-d3ec-490e-904f-e48c9aa3eaa1
Processing 4 : 76fdb0c1-1d8c-4a43-a709-eb7eeb1d674e
Processing 5 : 88e0d165-57a9-4762-8b89-86321757c943
Processing 6 : 4b07173d-9e6c-4895-ad62-c28fc1276778
Processing 7 : f267e0f2-cd70-41c3-80cd-8f3bf1d8c881
Processing 8 : 85d4e170-dd22-4592-bc0e-870e9ec9835b
Processing 9 : 005c0524-4403-4d30-bdc4-c7a13e81c9a7
Processing 10 : a7171c9f-eb3b-4b96-9709-993522810144
Processing 11 : 9e75a85e-255e-4f4d-8bf5-713587c1ba5e
Processing 12 : a214acff-191d-4ddb-8ff5-0c9bfa5a68d0
Processing 13 : 30c1dae9-1976-4061-93e5-5059cacea5bd
Processing 14 : abef512f-da52-4178-bc10-4cd06ef07c35
Processing 15 : 96874829-54c3-4d16-b2f3-5302e4eed962
Processing 16 : 6dab1ebe-68f4-4014-9a30-0978489f7571
Processing 17 : a091124b-4c6a-4a72-8e00-908b00fb5ad9
Processing 18 : 9d7c683f-fe5f-4432-890b-163499558415
Processing 19 : 69312c82-94f9-4d81-9a32-8fc50deff010
Pr

Processing 155 : 0ea3d6d8-04e6-42d7-a2a3-48dd35ba4a9e
Processing 156 : 9884dfaf-af80-4987-8353-99b920b1606d
Processing 157 : f5b1a158-97cc-44a7-975f-b59e17fa8eba
Processing 158 : 662953da-7317-4c9a-8b40-debf3133bbca
Processing 159 : b801cbfe-bc67-42d0-a5e8-b6b96d12d494
Processing 160 : eacde123-5251-41d0-a481-4e47db65b936
Processing 161 : 35f9c9dd-bfbf-4682-b40f-666576365cc8
Processing 162 : d7326edf-9943-4c41-803a-739008e08434
Processing 163 : 0e857a6c-a1f0-4fc2-a82a-7e3a7cda29f2
Processing 164 : 32e8fda8-96e8-401c-a3e3-99fa8a5977d8
Processing 165 : 4e9608c9-442a-4b39-8ed3-3eb92491a9bb
Processing 166 : 1f8d4e60-8e93-43ba-8c90-7e984a1648d4
Processing 167 : e81aaab7-c573-40f2-b1b3-7ed9c411ccc9
Processing 168 : fde557ec-b96e-49a5-9282-31407296282c
Processing 169 : 696b4a10-6181-4dcc-b096-c0c824362091
Processing 170 : ef0f3505-b7a5-416e-aa70-2a24a835933e
Processing 171 : 04d40b0d-dc78-4a60-a12e-53db2d7d3389
Processing 172 : ef3b67f9-915c-4252-b67e-b2a1f70e7831
Processing 173 : 7cddde99-94

In [25]:
# One row per record collected in `arr`.
proc = pd.DataFrame(arr)

## Dédoublonnage

In [26]:
# Left join on resource_id: every row of `df` is kept, and rows with no match
# in `proc` get missing values in the columns coming from `proc`.
# Columns present on both sides get the default _x (df) / _y (proc) suffixes.
df = pd.merge(df,proc,on='resource_id',how='left')

In [27]:
# Overwrite the inventory status (error_type_x) with the validation status
# (error_type_y) wherever the latter is set. Assigning through df.loc writes
# into `df` itself; an in-place update on the Series returned by
# `df.error_type_x` is a chained assignment and is not guaranteed to.
has_validation_status = df['error_type_y'].notna()
df.loc[has_validation_status, 'error_type_x'] = df.loc[has_validation_status, 'error_type_y']

In [28]:
# Keep a single status column: drop the validation-side copy, then give the
# remaining one its plain name.
df = (
    df
    .drop(columns=['error_type_y'])
    .rename(columns={'error_type_x': 'error_type'})
)

In [188]:
def dedoublonnage(df, col):
    """Drop rows that repeat a value of `col`, keeping the most recently modified one.

    Rows are sorted by 'last_modified' in descending order; for every value of
    `col` that appears more than once, the first row in that order is kept and
    the others are removed. Missing values and the placeholder "Non concerné"
    are never treated as duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `col`, 'last_modified' and 'resource_id'.
    col : str
        Column whose repeated values identify duplicates.

    Returns
    -------
    (pandas.DataFrame, list of dict)
        The sorted frame without the removed rows, and one log entry per
        removed row with the keys 'resource_id_keep', 'last_modified_keep',
        'resource_id_delete', 'last_modified_delete' and `col`.
    """
    arr = []
    # mergesort is stable: rows sharing the same last_modified keep their
    # incoming order, so the kept row is deterministic.
    df = df.sort_values('last_modified', ascending=False, kind='mergesort')

    # Count on the frame and column received as arguments; value_counts
    # ignores missing values.
    counts = df[col].value_counts()
    doublons = [val for val in counts[counts > 1].index if val != "Non concerné"]
    in_doublons = df[col].isin(doublons)

    # Log every removed row against the row kept for the same value.
    for _, group in df[in_doublons].groupby(col, sort=False):
        kept = group.iloc[0]
        for _, dropped in group.iloc[1:].iterrows():
            arr.append({
                'resource_id_keep': kept['resource_id'],
                'last_modified_keep': kept['last_modified'],
                'resource_id_delete': dropped['resource_id'],
                'last_modified_delete': dropped['last_modified'],
                col: dropped[col],
            })

    # Remove all but the first occurrence, in a single pass and only after
    # logging, so values present three times or more are handled.
    superseded = in_doublons & df.duplicated(subset=[col], keep='first')
    return df[~superseded], arr

In [189]:
# Build one consolidated CSV per schema version from the validated files.
versions = df['version'].dropna().unique().tolist()
# Only version 2.0.0 is consolidated here; `versions` is computed but not iterated.
for v in ["2.0.0"]:
    print(v)
    arr = []
    # Reset for every version so that an empty selection cannot reuse a frame
    # left over from a previous run.
    dfConso = None
    for index,row in df[df['version'] == v].iterrows():
        with open(row['local_file'],'rb') as f:
            encoding = chardet.detect(f.read()).get('encoding')
        # Files detected as Windows-1254 are read as iso-8859-1 instead.
        if encoding == 'Windows-1254':
            encoding = 'iso-8859-1'

        dfinter = pd.read_csv(row['local_file'], sep=None, engine="python",dtype=str,encoding=encoding)
        dfinter['last_modified'] = row['resource_last_modified']
        dfinter['resource_id'] = row['resource_id']
        if dfConso is None:
            dfConso = dfinter
        else:
            dfConso = pd.concat([dfConso, dfinter], ignore_index=True)

        # Deduplicate after each file so every removal is logged.
        dfConso, arrinter = dedoublonnage(dfConso, 'id_pdc_itinerance')
        arr.extend(arrinter)

    if dfConso is None:
        print('No file validated against schema version %s, nothing to consolidate' % v)
        continue

    dfConso = dfConso.drop_duplicates()
    dfConso.to_csv('irve-'+v+'.csv',index=False, encoding="utf-8",na_rep='null')

2.0.0


In [190]:
# One row per record currently held in `arr`.
log2 = pd.DataFrame(arr)

In [192]:
# to_csv does not create missing folders: make sure logs/ exists first.
Path("logs").mkdir(parents=True, exist_ok=True)
# Log file named after today's date (YYYYMMDD).
log2.to_csv("logs/log-doublons-"+datetime.now().strftime('%Y%m%d')+".csv",index=False)

In [None]:
# Label every row without an error as 'no-error'. fillna covers both None and
# NaN, whereas comparing with `== None` leaves NaN values untouched.
df['error_type'] = df['error_type'].fillna('no-error')

In [None]:
# to_csv does not create missing folders: make sure logs/ exists first.
Path("logs").mkdir(parents=True, exist_ok=True)
# General log: the whole inventory without the local file path, named after
# today's date (YYYYMMDD).
df.drop(columns=['local_file']).to_csv("logs/log-general-"+datetime.now().strftime('%Y%m%d')+".csv",index=False)