# Déclaration de profils d'acheteur - DECP

In [None]:
import requests
import agate

from pathlib import Path
import cchardet as chardet

In [None]:
# Get the list of datasets tagged DECP on data.gouv.fr.
# NOTE(review): a single page of up to 1000 results is requested; confirm the
# tag never has more datasets than that, otherwise pagination is needed.

url = 'https://www.data.gouv.fr/api/1/datasets/?tag=decp&page_size=1000'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of a confusing error on 'data' below.
r.raise_for_status()
datasets = r.json()['data']

len(datasets)

In [None]:
# Download all tabular files in data/ directory, as best as we can.
# Each CSV resource is stored as data/<dataset slug>/<resource id>.csv.

downloaded = []
for d in datasets:
    if not d['organization'] and not d['owner']:
        print('======= no owner or orga !!! ==========', d['slug'])
        continue
    # not used below; kept as a notebook variable for inspection
    orga = d['organization']['slug'] if d['organization'] else d['owner']['slug']
    slug = d['slug']
    for resource in d['resources']:
        rurl = resource['url']
        rid = resource['id']
        # ODS style NB: won't work more than once for CKAN
        if 'format=csv' in rurl:
            filename = rurl.split('/')[-3] + '.csv'
        else:
            filename = rurl.split('/')[-1]
        if filename in downloaded:
            print('x existing file %s' % rurl)
            continue
        ext = filename.split('.')[-1]
        if ext != 'csv':
            print('x ignored file %s' % rurl)
            continue
        # Best effort: a failing or erroring resource is reported and skipped,
        # so that an HTML error page is never stored as a CSV file and one
        # broken link does not abort the whole run.
        try:
            response = requests.get(rurl, allow_redirects=True, timeout=60)
            response.raise_for_status()
        except requests.RequestException as e:
            print('x failed download %s (%s)' % (rurl, e))
            continue
        p = Path('data/%s' % slug)
        p.mkdir(exist_ok=True, parents=True)
        written_filename = '%s.%s' % (rid, ext)
        with open('%s/%s' % (p, written_filename), 'wb') as f:
            f.write(response.content)
        downloaded.append(filename)
        print('- downloaded file [%s] %s' % (filename, rurl))
print('Done')

In [None]:
def parse_csv(file_path):
    """Parse a ';'-delimited CSV file into an agate table of text columns.

    The file encoding is guessed by chardet from the raw bytes of the file.

    Args:
        file_path: path object of the CSV file; it must provide ``.open()``
            (e.g. ``pathlib.Path``).

    Returns:
        The ``agate.Table`` on success, or ``None`` when parsing fails (the
        error is printed, not raised).
    """
    # deactivate type testing, this puts too much constraint on parsing
    # especially for lat/lon columns with commas
    tester = agate.TypeTester(types=(agate.Text, ), limit=0)
    with file_path.open('rb') as f:
        # detection may report no encoding at all (e.g. for an empty file);
        # fall back to UTF-8 rather than a platform-dependent default
        encoding = chardet.detect(f.read()).get('encoding') or 'utf-8'
    try:
        # /!\ force delimiter
        # TODO maybe open with ; and if only one column, try with ',' or delim
        # we're working with small files here which can explain sniffing is not so good
        return agate.Table.from_csv(
            file_path,
            encoding=encoding,
            sniff_limit=None,
            column_types=tester,
            delimiter=';',
        )
    except Exception as e:
        # deliberate best effort: report the broken file and let callers skip it
        print('[x] CSV parse error for %s (%s)' % (file_path, e))
        return None

In [None]:
# Each entry lists the accepted spellings of one column: the standard name
# first, followed by a known variant of that name.
# NOTE(review): 'coordonnnées' is spelled with three 'n' here — confirm this
# matches the published schema before changing it.
columns_mapping = [
    ('siretAcheteur', 'siret'),
    ('urlProfilAcheteur', 'url'),
    ('coordonnnées', 'coordonnnees'),
]

# Standard column names, and the same names lowercased for
# case-insensitive comparisons.
columns = [names[0] for names in columns_mapping]
columns_low = [name.lower() for name in columns]

In [None]:
# parse CSV files: analyze difference between standard schema and real data
# infer a mapping of column names variation to match a maximum of data later

# /!\ this is only an analysis step that helped build `columns_mapping` above
# you do not need to run this when doing a consolidation

p = Path('data')
for child in (x for x in p.iterdir() if x.is_dir()):
    # `csv_path` (not `csv`) so the loop never shadows the stdlib csv module
    for csv_path in child.glob('*.csv'):
        table = parse_csv(csv_path)
        if not table:
            continue
        cols = [x.lower() for x in table.column_names]
        # TODO also match `siret`
        missing_pivot = [pivot for pivot in ['siretacheteur'] if pivot not in cols]
        if missing_pivot:
            print('Skipping %s for missing pivot %s, cols were %s' % (csv_path, missing_pivot, cols))
            # skip this file only: the other CSVs of the same dataset
            # directory must still be analyzed
            continue
        # columns present in the file but unknown to the standard schema
        diff = list(set(cols) - set(columns_low))
        if diff:
            print('DIFF for %s: %s' % (csv_path, diff))
print('Done.')

In [None]:
# Use columns_mapping to build a consolidated list of buyer-profile records:
# one dict per CSV row that has a siretAcheteur value, keyed by standard column
# names. No de-duplication is done here.

profils = []

def find_by_pivot(row, lines):
    """Placeholder: not implemented, always returns ``None``."""
    return None

def ifind_in_row_by_col(col, row):
    """Return the value of ``row`` stored under ``col``, ignoring key case.

    Keys are scanned in ``row.keys()`` order and the first one equal to
    ``col`` once both are lowercased wins. Returns ``None`` when no key
    matches.
    """
    matches = (row[name] for name in row.keys() if name.lower() == col.lower())
    return next(matches, None)

# Walk every downloaded CSV (data/<dataset slug>/<resource id>.csv) and append
# one normalized record per usable row to `profils`.
p = Path('data')

# TODO assign a weight based on number of lines in CSV file
# if duplicate, keep the one from the file w/ highest weight

upatt = 'https://www.data.gouv.fr/fr/datasets/%s/#resource-%s'

for child in (x for x in p.iterdir() if x.is_dir()):
    # `csv_path` (not `csv`) so the loop never shadows the stdlib csv module
    for csv_path in child.glob('*.csv'):
        table = parse_csv(csv_path)
        if not table:
            continue
        table_cols = [x.lower() for x in table.column_names]
        # TODO also match `siret`
        missing_pivot = [pivot for pivot in ['siretacheteur'] if pivot not in table_cols]
        if missing_pivot:
            # skip this file only: the other CSVs of the same dataset
            # directory must still be consolidated
            continue
        for row in table.rows:
            profil = {}
            for col in columns_mapping:
                # try each accepted spelling, first match wins
                for c in col:
                    if c.lower() in table_cols:
                        # use the standard name for column
                        profil[col[0]] = ifind_in_row_by_col(c, row)
                        break
            # normalize siretAcheteur: drop every whitespace character
            # (including non-breaking spaces) before testing for emptiness,
            # so that blank-only values are discarded too
            normalized_siret = ''.join((profil.get('siretAcheteur') or '').split())
            # remove empty lines
            if not normalized_siret:
                continue
            profil['siretAcheteur'] = normalized_siret
            # directory name is the dataset slug, file stem is the resource id;
            # `.name` keeps the slug intact even if it contains a dot
            profil['source'] = upatt % (csv_path.parent.name, csv_path.stem)
            profils.append(profil)

len(profils)

In [None]:
# enjoy the results!
# Write the consolidated records to decp-dpa.csv (';' separated), one column
# per standard name plus the 'source' URL.

import csv

# newline='' is required by the csv module so that it controls line endings
# itself (otherwise extra blank lines can appear on some platforms); the
# explicit UTF-8 encoding keeps the accented header independent of the
# platform default encoding.
with open('decp-dpa.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=columns + ['source'], delimiter=';')
    writer.writeheader()
    writer.writerows(profils)

In [None]:
import pandas as pd

# Reload the consolidated file and count how many rows share their
# siretAcheteur with at least one other row.
df = pd.read_csv('decp-dpa.csv', sep=';')
# one row per siretAcheteur; the single column (named 0) holds the row count
siret = df.groupby('siretAcheteur').size().to_frame()
is_duplicated = siret[0] > 1
siret.loc[is_duplicated, 0].sum()

## debug / test

In [None]:
# Load one downloaded file directly (latin-1, ';' separated) and display its
# column names.
source = 'data/declaration-du-profil-acheteur/e99030a3-b258-42a3-9b20-13f0632bb473.csv'
table = agate.Table.from_csv(
    source,
    delimiter=';',
    encoding="latin-1",
)
table.column_names