In [3]:
import json
from collections import defaultdict, Counter
import pickle
import re
import glob
import os

from unidecode import unidecode
import urllib.parse
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Parse actions to dataset ids

In [4]:
# URL patterns (regular-expression strings) for the www.data.gouv.fr pages and
# API endpoints that parse_visits recognises. regex_datasets and
# regex_api_datasets capture the path segment that follows `datasets/`.
regex_home = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/$'
regex_datasets_page = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/datasets/$'
regex_datasets = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/datasets/([a-z\-0-9]+)/$'
regex_datasets_search = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/datasets/\?'
regex_organizations = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/organizations/'
regex_reuses = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/reuses/'
regex_territory = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/territory/'
regex_topics = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/topics/'
regex_town = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/town/'
regex_users = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/users/'
regex_search = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/search/'
regex_posts = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/posts/'
regex_faq = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/faq/'
regex_dashboard = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/dashboard/'
regex_apidoc = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/apidoc/'
regex_groups = r'^https?://www\.data\.gouv\.fr/groups/'
regex_reference = r'^https?://www\.data\.gouv\.fr/(?:fr|en|es)/reference$'
regex_id = r'^https?://id\.data\.gouv\.fr/'
regex_api_spatial = r'^https?://www\.data\.gouv\.fr/api/1/spatial/'
regex_api_discussions = r'^https?://www\.data\.gouv\.fr/api/1/discussions/\?'
regex_api_issues = r'^https?://www\.data\.gouv\.fr/api/1/issues/\?'
regex_api_swagger = r'^https?://www\.data\.gouv\.fr/api/1/swagger\.json'
regex_api_datasets_frequencies = r'^https?://www\.data\.gouv\.fr/api/1/datasets/frequencies/'
regex_api_reuses = r'^https?://www\.data\.gouv\.fr/api/1/reuses/'
regex_api_metrics = r'^https?://www\.data\.gouv\.fr/api/1/metrics/'
regex_api_activity = r'^https?://www\.data\.gouv\.fr/api/1/activity'
regex_api_organizations = r'^https?://www\.data\.gouv\.fr/api/1/organizations/'
regex_api_datasets = r'^https?://www\.data\.gouv\.fr/api/1/datasets/([a-z\-0-9]+)/'
regex_api_datasets_search = r'^https?://www\.data\.gouv\.fr/api/1/datasets/\?'
regex_api_community = r'^https?://www\.data\.gouv\.fr/api/1/datasets/community_resources/\?'
regex_api_oembeds = r'^https?://www\.data\.gouv\.fr/api/1/oembeds/'
regex_api_me = r'^https?://www\.data\.gouv\.fr/api/1/me/'
regex_api_site = r'^https?://www\.data\.gouv\.fr/api/1/site/'
regex_api_users = r'^https?://www\.data\.gouv\.fr/api/1/users/'
regex_api_transfer = r'^https?://www\.data\.gouv\.fr/api/1/transfer/'
regex_api_harvest = r'^https?://www\.data\.gouv\.fr/api/1/harvest/'

# What to do with a URL once a pattern matches it.
_IGNORE = 'ignore'                # recognised URL, nothing is recorded
_SLUG_OR_ID = 'slug_or_id'        # record the first capture group as ('slug_or_id', ...)
_ID_FROM_QUERY = 'id_from_query'  # record the `for` query parameter as ('id', ...)


def _compile_rules(rules):
    """Compile the pattern of every (pattern, handling) pair, preserving order."""
    return [(re.compile(pattern), handling) for pattern, handling in rules]


# Rules are tried in order and the first matching pattern wins, so the order
# below is significant: e.g. the `frequencies/` endpoint must be tested before
# the generic api/1/datasets/<slug>/ pattern, which would also match it.
_ACTION_RULES = _compile_rules([
    (regex_home, _IGNORE),
    (regex_datasets_page, _IGNORE),
    (regex_datasets, _SLUG_OR_ID),
    (regex_datasets_search, _IGNORE),
    (regex_organizations, _IGNORE),
    (regex_reuses, _IGNORE),
    (regex_territory, _IGNORE),
    (regex_topics, _IGNORE),
    (regex_town, _IGNORE),
    (regex_users, _IGNORE),
    (regex_search, _IGNORE),
    (regex_posts, _IGNORE),
    (regex_faq, _IGNORE),
    (regex_dashboard, _IGNORE),
    (regex_apidoc, _IGNORE),
    (regex_groups, _IGNORE),
    (regex_reference, _IGNORE),
    (regex_id, _IGNORE),
    (regex_api_spatial, _IGNORE),
    (regex_api_discussions, _ID_FROM_QUERY),
    (regex_api_issues, _ID_FROM_QUERY),
    (regex_api_swagger, _IGNORE),
    (regex_api_datasets_frequencies, _IGNORE),
    (regex_api_reuses, _IGNORE),
    (regex_api_metrics, _IGNORE),
    (regex_api_activity, _IGNORE),
    (regex_api_organizations, _IGNORE),
    (regex_api_datasets, _SLUG_OR_ID),
    (regex_api_datasets_search, _IGNORE),
    (regex_api_community, _ID_FROM_QUERY),
    (regex_api_oembeds, _IGNORE),
    (regex_api_me, _IGNORE),
    (regex_api_site, _IGNORE),
    (regex_api_users, _IGNORE),
    (regex_api_transfer, _IGNORE),
    (regex_api_harvest, _IGNORE),
])

# 'goal' actions are only recognised on this smaller set of URLs.
_GOAL_RULES = _compile_rules([
    (regex_reuses, _IGNORE),
    (regex_datasets, _SLUG_OR_ID),
    (regex_api_reuses, _IGNORE),
    (regex_id, _IGNORE),
    (regex_api_organizations, _IGNORE),
    (regex_api_transfer, _IGNORE),
])


def _apply_rules(url, rules, visit_parsed):
    """Apply the first rule whose pattern matches ``url``.

    Appends at most one item to ``visit_parsed`` and returns True when a rule
    matched, False when the URL is not recognised by any rule.
    """
    for pattern, handling in rules:
        match = pattern.match(url)
        if match is None:
            continue
        if handling == _SLUG_OR_ID:
            visit_parsed.append(('slug_or_id', match.group(1)))
        elif handling == _ID_FROM_QUERY:
            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
            # The endpoint may be called without a `for` parameter; record nothing then.
            if 'for' in params:
                visit_parsed.append(('id', params['for'][0]))
        return True
    return False


def parse_visits(visits):
    """Turn raw visits into lists of ``(kind, value)`` items.

    Parameters
    ----------
    visits : iterable of dict
        Each visit has an ``'actions'`` list; each action has a ``'type'``,
        plus a ``'keyword'`` for ``'search'`` actions and a ``'url'`` for
        ``'action'`` and ``'goal'`` actions.

    Returns
    -------
    visits_parsed : list of list of tuple
        One list per visit, in order, containing ``('keyword', kw)``,
        ``('slug_or_id', value)`` or ``('id', value)`` items.
    unhandled_actions : list of dict
        Actions whose type is unknown, or whose URL matched none of the
        patterns registered for their type.

    The URL is only matched for ``'action'`` and ``'goal'`` actions, and
    matching stops at the first pattern that applies.
    """
    unhandled_actions = []
    visits_parsed = []
    for visit in visits:
        visit_parsed = []
        for action in visit['actions']:
            action_type = action['type']
            if action_type == 'search':
                visit_parsed.append(('keyword', action['keyword']))
            elif action_type == 'action':
                if not _apply_rules(action['url'], _ACTION_RULES, visit_parsed):
                    unhandled_actions.append(action)
            elif action_type == 'goal':
                if not _apply_rules(action['url'], _GOAL_RULES, visit_parsed):
                    unhandled_actions.append(action)
            elif action_type == 'outlink' or action_type == 'download':
                # External links and file downloads carry no dataset reference.
                pass
            else:
                unhandled_actions.append(action)
        visits_parsed.append(visit_parsed)

    return visits_parsed, unhandled_actions

In [7]:
# Parse every deduplicated log file and write the parsed visits, under the same
# file name, into target_dir. Unrecognised actions are pooled across all files.
source_dir = 'deduplicated_logs'
target_dir = 'parsed_logs'

# Make sure the output directory exists so a fresh run does not fail on open().
os.makedirs(target_dir, exist_ok=True)

# glob returns paths in arbitrary order; sort them so runs are reproducible.
filenames = sorted(glob.glob(source_dir + '/*'))

unhandled_actions = []
for filename in filenames:
    with open(filename, 'r') as f:
        visits = json.load(f)
    visits_parsed, unhandled_actions_file = parse_visits(visits)
    unhandled_actions += unhandled_actions_file
    with open(os.path.join(target_dir, os.path.basename(filename)), 'w') as f:
        json.dump(visits_parsed, f)

In [8]:
# Total number of actions, over all parsed files, that parse_visits reported as unhandled.
len(unhandled_actions)

15534

In [9]:
# Tally the unhandled actions by the first 43 characters of their URL and
# show the 30 most frequent prefixes.
c = Counter(action['url'][:43] for action in unhandled_actions)
c.most_common(30)


[('https://www.data.gouv.fr/api/1/datasets/', 1630),
 ('https://www.data.gouv.fr/fr/terms/', 895),
 ('https://www.data.gouv.fr/api/1/discussions/', 819),
 ('https://www.data.gouv.fr/fr/dataconnexions-', 765),
 ('https://www.data.gouv.fr/datasets/extractio', 400),
 ('http://www.data.gouv.fr/donnees/view/Table-', 345),
 ('https://www.data.gouv.fr/fr/oauth/authorize', 332),
 ('https://www.data.gouv.fr/api/1/datasets/com', 310),
 ('https://www.data.gouv.fr/fr/credits/', 301),
 ('http://www.data.gouv.fr/fr/Redevances', 274),
 ('http://www.data.gouv.fr/fr/openfield16', 260),
 ('http://www.data.gouv.fr/DataSet/573376?xtmc', 235),
 ('https://www.data.gouv.fr/api/1/posts/58f9bb', 234),
 ('http://www.data.gouv.fr/content/search?Sear', 222),
 ('http://www.data.gouv.fr/DataSet/30382152', 217),
 ('https://www.data.gouv.fr/wiki/Outillage_pou', 195),
 ('http://www.data.gouv.fr/var/download/SSA4_A', 188),
 ('https://www.data.gouv.fr/frmetrics', 185),
 ('http://www.data.gouv.fr/api/1/discussions/', 175

## Deduplication + slug to id

In [12]:
# Dataset export used to recognise ids and to translate slugs into ids.
dataset_info = pd.read_csv('datasets-2017-06-27-15-14.csv', sep=';')

# itertuples() puts the index at position 0, so positions 1 and 3 are the first
# and third CSV columns (presumably id and slug; confirm against the CSV header).
id2slug = {}
slug2id = {}
for row in dataset_info.itertuples():
    id2slug[row[1]] = row[3]
    slug2id[row[3]] = row[1]

datasets = dataset_info['id'].tolist()

# Position of each dataset id in `datasets`, plus sets for fast membership tests.
datasets_index = dict(zip(datasets, range(len(datasets))))
datasets_set = set(datasets)
slugs_set = set(slug2id)

In [13]:
def deduplicate(visits):
    """Drop repeated items inside each visit, keeping first occurrences in order.

    Each visit is an iterable of ``(kind, value)`` pairs whose kind is
    ``'keyword'``, ``'id'`` or ``'slug_or_id'``; any other kind raises
    ``KeyError``. The same value may appear once per kind. Returns one list of
    ``(kind, value)`` tuples per input visit.
    """
    visits_dedup = []
    for visit in visits:
        # Values already seen in this visit, tracked separately for each kind.
        seen = {'keyword': set(), 'id': set(), 'slug_or_id': set()}
        kept = []
        for kind, value in visit:
            seen_values = seen[kind]
            if value in seen_values:
                continue
            seen_values.add(value)
            kept.append((kind, value))
        visits_dedup.append(kept)
    return visits_dedup

def resolve_slugs(visits, known_ids=None, slug_to_id=None):
    """Replace slugs by dataset ids and drop references to unknown datasets.

    Parameters
    ----------
    visits : iterable of iterables of ``(kind, value)`` pairs
        ``kind`` is ``'keyword'``, ``'id'`` or ``'slug_or_id'``.
    known_ids : container, optional
        Known dataset ids. Defaults to the notebook-level ``datasets_set``.
    slug_to_id : mapping, optional
        Maps a dataset slug to its id. Defaults to the notebook-level
        ``slug2id`` (whose keys are what ``slugs_set`` is built from).

    Returns
    -------
    visits_clean : list of list of tuple
        Keywords are kept as is; every resolved dataset becomes ``('id', id)``.
    id_ok : int
        Number of items that were a known id.
    id_unknown : list
        Unknown values given as ``'id'``, or as ``'slug_or_id'`` and looking
        like an id (24 lowercase hexadecimal characters).
    slug_ok : int
        Number of slugs translated into an id.
    slug_unknown : list
        Remaining unknown ``'slug_or_id'`` values.

    Raises
    ------
    ValueError
        If an item has an unexpected kind.
    """
    if known_ids is None:
        known_ids = datasets_set
    if slug_to_id is None:
        slug_to_id = slug2id

    visits_clean = []
    id_ok = 0
    id_unknown = []
    slug_ok = 0
    slug_unknown = []
    for visit in visits:
        visit_clean = []
        for kind, value in visit:
            if kind == 'keyword':
                visit_clean.append((kind, value))

            elif kind == 'id':
                if value in known_ids:
                    visit_clean.append((kind, value))
                    id_ok += 1
                else:
                    id_unknown.append(value)

            elif kind == 'slug_or_id':
                # An id takes precedence over a slug with the same text.
                if value in known_ids:
                    visit_clean.append(('id', value))
                    id_ok += 1
                elif value in slug_to_id:
                    visit_clean.append(('id', slug_to_id[value]))
                    slug_ok += 1
                elif re.match(r'^[0-9a-f]{24}$', value):
                    # Shaped like an id, so report it with the unknown ids.
                    id_unknown.append(value)
                else:
                    slug_unknown.append(value)

            else:
                raise ValueError('unknown item kind: {!r}'.format(kind))
        visits_clean.append(visit_clean)

    return visits_clean, id_ok, id_unknown, slug_ok, slug_unknown

In [16]:
# Deduplicate every parsed log file, resolve its slugs into dataset ids and
# write the result, under the same file name, into target_dir. The resolution
# counters are accumulated over all files.
source_dir = 'parsed_logs'
target_dir = 'clean_logs'

# Make sure the output directory exists so a fresh run does not fail on open().
os.makedirs(target_dir, exist_ok=True)

# glob returns paths in arbitrary order; sort them so runs are reproducible.
filenames = sorted(glob.glob(source_dir + '/*'))

id_ok = 0
id_unknown = []
slug_ok = 0
slug_unknown = []
for filename in filenames:
    with open(filename, 'r') as f:
        visits_parsed = json.load(f)

    visits_dedup = deduplicate(visits_parsed)
    visits_clean, id_ok_file, id_unknown_file, slug_ok_file, slug_unknown_file = resolve_slugs(visits_dedup)

    id_ok += id_ok_file
    id_unknown += id_unknown_file
    slug_ok += slug_ok_file
    slug_unknown += slug_unknown_file

    with open(os.path.join(target_dir, os.path.basename(filename)), 'w') as f:
        json.dump(visits_clean, f)


In [17]:
# Resolution counters over all files: known ids, unknown ids, resolved slugs, unknown slugs.
id_ok, len(id_unknown), slug_ok, len(slug_unknown)

(331082, 55968, 1216745, 52196)

In [18]:
# Resolved vs. unknown dataset references, as counts and as shares of the total.
sum_ok = slug_ok + id_ok
sum_unknown = len(slug_unknown) + len(id_unknown)
sum_total = sum_ok + sum_unknown
sum_ok, sum_unknown, sum_unknown / sum_total, sum_ok / sum_total

(1547827, 108164, 0.0653167801032735, 0.9346832198967265)

$$\int_0^1{t_a^{x/2}dx}=0.8886215632082586$$

$$u = \frac{\ln(t_a)}{2}, \quad e^u-1 = 0.8886215632082586\,u$$

$$u = -0.2410056126332082, \quad t_a = e^{2u} = 0.6175401296732944$$

Soit environ 38 % de turnover annuel ($1 - t_a \approx 0.38$).

In [19]:
# First ten id-like values that were not found among the known dataset ids.
id_unknown[:10]

['57e1017b88ee38414d5ff491',
 '574da090c751df32e7535dd6',
 '574da090c751df32e7535dd6',
 '57e1017b88ee38414d5ff491',
 '574da090c751df32e7535dd6',
 '57e1017b88ee38414d5ff491',
 '57d16d62c751df60f597bae5',
 '5888e0ae88ee3808539b81a4',
 '5882a14ac751df3f9cae0a68',
 '5882a14ac751df3f9cae0a68']

In [20]:
# First ten slugs that could not be translated into a dataset id.
slug_unknown[:10]

['chiffres-du-registre-des-francais-etablis-hors-de-france-pour-lannee-2015',
 'montants-d-aides-pour-les-200-titres-de-presse-les-plus-aides',
 'petites-regions-fourrageres-du-limousin',
 'table-de-correspondance-des-communes-et-des-cantons-avec-les-circonscriptions-legislat-551418',
 'niveau-des-debits-sur-les-reseaux-dacces-a-internet-adsl-cable-fibre-ftth-3',
 'ppr-orthez-64ddtm20000008-zone-dinformations-dun-plan-de-prevention-du-risque-inondation-de-orthez-64430-departement-des-pyrenees-atlantiques',
 'ppr-de-orthez-64ddtm20000008-plan-de-prevention-des-risques-naturels-pprn-de-la-commune-de-orthez-64430-departement-des-pyrenees-atlantiques',
 'education-effectifs-des-ecoles-publiques-maternelles',
 'badges',
 'extraction-du-fichier-national-des-etablissements-sanitaires-et-sociaux-finess-par-etablissements']

In [21]:
# Ten most frequent unresolved slugs, with their number of occurrences.
c = Counter(slug_unknown)
c.most_common(10)

[('extraction-du-fichier-national-des-etablissements-sanitaires-et-sociaux-finess-par-etablissements',
  4016),
 ('chiffres-du-registre-des-francais-etablis-hors-de-france-pour-lannee-2015',
  1794),
 ('table-de-correspondance-des-communes-et-des-cantons-avec-les-circonscriptions-legislat-551418',
  1666),
 ('parrainages', 1409),
 ('resultats-du-vote-des-francais-residant-a-letranger-au-premier-tour-de-lelection-presidentielle-2017',
  1406),
 ('badges', 1405),
 ('licenses', 1141),
 ('resultats-des-elections-presidentielles-par-commune', 987),
 ('plan-cadastral-informatise', 817),
 ('liste-des-bureaux-de-vote-prs', 801)]

# Merge

In [22]:
# Merge the per-file clean logs into a single {day: visits} mapping and save it.
source_dir = 'clean_logs'

# glob returns paths in arbitrary order; sort them so the key order of the
# written JSON file is the same on every run.
filenames = sorted(glob.glob(source_dir + '/*'))

visits_by_day = {}
for filename in filenames:
    # The first 10 characters of the file name must be a YYYY-MM-DD date.
    day = os.path.basename(filename)[:10]
    assert re.match(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', day), filename

    with open(filename, 'r') as f:
        visits = json.load(f)

    # NOTE(review): two files sharing the same date prefix would overwrite
    # each other here; confirm there is exactly one file per day.
    visits_by_day[day] = visits

with open('visits_by_day.json', 'w') as f:
    json.dump(visits_by_day, f)

# Clean keyword

In [23]:
def clean_keyword(keyword):
    """Return ``keyword`` lower-cased and then passed through ``unidecode``."""
    lowered = keyword.lower()
    return unidecode(lowered)

clean_keyword('Eléctions')

'elections'

In [24]:
# Reload the merged visits, normalise every search keyword with clean_keyword
# and save the result to a second file. Other items are copied unchanged.
with open('visits_by_day.json', 'r') as f:
    visits_by_day = json.load(f)

visits_by_day2 = {}
for day, visits in visits_by_day.items():
    # NOTE(review): keywords were deduplicated before this normalisation, so
    # two keywords that become equal here stay as separate items.
    visits_by_day2[day] = [
        [
            (kind, clean_keyword(value) if kind == 'keyword' else value)
            for kind, value in visit
        ]
        for visit in visits
    ]

with open('visits_by_day2.json', 'w') as f:
    json.dump(visits_by_day2, f)