In [2]:
import json
from collections import defaultdict, Counter
import pickle
import re

import urllib.parse
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Slug <-> id

In [2]:
# Read the semicolon-separated dataset export into a DataFrame.
dataset_info = pd.read_csv('datasets-2017-06-21-19-43.csv', sep=';')

In [3]:
# Number of rows in the dataset export.
len(dataset_info)

25794

In [4]:
# Build both lookup tables in a single pass over the rows.
# itertuples() puts the row index at position 0, so positions 1 and 3 are the
# first and third CSV columns (presumably the dataset id and its slug —
# TODO confirm against the CSV header).
id2slug = {}
slug2id = {}
for row in dataset_info.itertuples():
    row_id, row_slug = row[1], row[3]
    id2slug[row_id] = row_slug
    slug2id[row_slug] = row_id

In [5]:
# All values of the `id` column, in row order.
datasets = list(dataset_info.id)

In [6]:
# Position of every dataset id inside `datasets`, plus set versions of the
# known ids and slugs for fast membership tests.
datasets_index = dict(zip(datasets, range(len(datasets))))
datasets_set = set(datasets)
slugs_set = set(slug2id)

# Load piwik logs (deduplicated)

Group by visit

In [3]:
# Load the search records. The cells below read the 'idVisit', 'keyword'
# and 'actions' keys of each record.
with open('keywords.json', 'r') as f:
    data = json.load(f)

In [4]:
# Number of search records loaded.
len(data)

631065

In [10]:
# Bucket every search record under the visit it belongs to.
searches_by_visit = defaultdict(list)
for record in data:
    searches_by_visit[record['idVisit']].append(record)

In [11]:
# Number of distinct visits.
len(searches_by_visit)

154253

In [12]:
# Peek at the searches of a single visit (the first one in iteration order).
for searches in searches_by_visit.values():
    print(searches)
    break

[{'idVisit': '4778378', 'url': '', 'date': '2017-01-05', 'keyword': 'sirene', 'actions': [{'idVisit': '4778378', 'date': '2017-01-05', 'url': 'https://www.data.gouv.fr/fr/datasets/base-sirene-des-entreprises-et-de-leurs-etablissements-siren-siret/', 'type': 'action'}, {'idVisit': '4778378', 'date': '2017-01-05', 'url': 'http://files.data.gouv.fr/sirene/sirene_201612_L_M.zip', 'type': 'download'}], 'type': 'search'}]


# Keyword list

In [None]:
# Tally how many search records carry each keyword.
keyword_count = defaultdict(int)
for record in data:
    keyword_count[record['keyword']] += 1

In [None]:
# Number of distinct keywords.
len(keyword_count)

In [None]:
# Histogram of keyword frequencies: count_histogram[c] is the number of
# distinct keywords that occur exactly c times (index 0 is always empty).
counts = list(keyword_count.values())
# np.bincount returns an array of length max(counts) + 1; the explicit integer
# cast keeps it well-defined for an empty input, and the float cast keeps the
# dtype the original np.zeros-based histogram had.
count_histogram = np.bincount(np.asarray(counts, dtype=np.intp)).astype(float)

In [None]:
# Plot the first 10 bins: x is the number of occurrences of a keyword,
# y is the number of keywords with that many occurrences.
plt.plot(count_histogram[:10])

In [None]:
# Number of distinct keywords that occur at least 4 times.
sum(count_histogram[4:])

# Keyword–keyword co-occurrence

In [None]:
# Execute this only once
#keywords = [keyword for keyword, count in keyword_count.items() if count >= 4]
#pickle.dump(keywords, open('keywords_list.pickle', 'wb'))

In [None]:
# Load the keyword list saved by the "execute only once" cell.
# NOTE: pickle can run arbitrary code while loading; only load files that were
# produced by this notebook.
with open('keywords_list.pickle', 'rb') as f:  # context manager closes the handle
    keywords = pickle.load(f)
# Set for membership tests, and keyword -> position in `keywords`.
keywords_set = set(keywords)
keywords_index = {keyword: i for i, keyword in enumerate(keywords)}

In [None]:
# Number of keywords kept; this is the dimension of the keyword-keyword structures below.
n_keywords = len(keywords)
n_keywords

In [None]:
# Matrix representation: kk[i, j] counts the visits in which keywords i and j
# were both searched (the diagonal counts the visits containing keyword i).
kk = np.zeros((n_keywords, n_keywords), dtype=np.uint16)
for searches in searches_by_visit.values():
    distinct_keywords = {search['keyword'] for search in searches}
    kept_indices = [keywords_index[k] for k in distinct_keywords if k in keywords_set]
    for row in kept_indices:
        for col in kept_indices:
            # A cell already at the uint16 maximum cannot hold a larger count; flag it.
            if kk[row, col] == 65535:
                print("overflow")
            kk[row, col] += 1
# Largest count, and the fraction of non-zero cells.
kk.max(), float((kk>0).sum())/(n_keywords*n_keywords)

In [None]:
# Adjacency list representation: kk[i][j] counts the visits in which keywords
# i and j were both searched; only pairs that actually occur are stored.
kk = [defaultdict(int) for _ in range(n_keywords)]
for searches in searches_by_visit.values():
    distinct_keywords = {search['keyword'] for search in searches}
    kept_indices = [keywords_index[k] for k in distinct_keywords if k in keywords_set]
    for row in kept_indices:
        neighbours = kk[row]
        for col in kept_indices:
            neighbours[col] += 1

In [None]:
# Replace each defaultdict by a plain dict (presumably so the saved structure
# no longer auto-creates entries on lookup — TODO confirm intent).
kk = [dict(adjacency_list) for adjacency_list in kk]

In [None]:
# Persist the keyword-keyword adjacency list. The context manager guarantees
# the file is flushed and closed once the dump is complete.
with open('keyword_keyword_adjacency_list.pickle', 'wb') as f:
    pickle.dump(kk, f)

# Parse actions to dataset ids

In [31]:
# Pages that carry no dataset information and are matched by exact URL.
other_pages = ['http://www.data.gouv.fr/fr/', 'http://www.data.gouv.fr/fr/datasets/']

# URL patterns. Only regex_datasets and regex_api_datasets capture a group
# (the dataset slug or id found in the path).
regex_datasets = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/datasets/([a-z\-0-9]+)/$'
regex_datasets_search = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/datasets/\?'
regex_organizations = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/organizations/'
regex_reuses = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/reuses/'
regex_territory = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/territory/'
regex_topics = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/topics/'
regex_town = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/town/'
regex_users = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/users/'
regex_search = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/search/'
regex_posts = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/posts/'
regex_faq = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/faq/'
regex_dashboard = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/dashboard/'
regex_apidoc = r'^https?://www\.data\.gouv\.fr/(?:fr|en)/apidoc/'
# NOTE(review): the next two patterns have no (fr|en) language segment, and the
# recorded output lists 'http://www.data.gouv.fr/fr/reference' among the
# unhandled URLs — confirm whether that is intended.
regex_groups = r'^https?://www\.data\.gouv\.fr/groups/'
regex_reference = r'^https?://www\.data\.gouv\.fr/reference$'
regex_login = r'^https?://id\.data\.gouv\.fr/login/'
regex_logout = r'^https?://id\.data\.gouv\.fr/logout/'
regex_register = r'^https?://id\.data\.gouv\.fr/register/'
regex_api_spatial = r'^https?://www\.data\.gouv\.fr/api/1/spatial/'
regex_api_discussions = r'^https?://www\.data\.gouv\.fr/api/1/discussions/\?'
regex_api_issues = r'^https?://www\.data\.gouv\.fr/api/1/issues/\?'
regex_api_swagger = r'^https?://www\.data\.gouv\.fr/api/1/swagger\.json'
regex_api_datasets_frequencies = r'^https?://www\.data\.gouv\.fr/api/1/datasets/frequencies/'
regex_api_reuses = r'^https?://www\.data\.gouv\.fr/api/1/reuses/'
regex_api_metrics = r'^https?://www\.data\.gouv\.fr/api/1/metrics/'
regex_api_activity = r'^https?://www\.data\.gouv\.fr/api/1/activity'
regex_api_organizations = r'^https?://www\.data\.gouv\.fr/api/1/organizations/'
regex_api_datasets = r'^https?://www\.data\.gouv\.fr/api/1/datasets/([a-z\-0-9]+)/'
regex_api_datasets_search = r'^https?://www\.data\.gouv\.fr/api/1/datasets/\?'
regex_api_community = r'^https?://www\.data\.gouv\.fr/api/1/datasets/community_resources/\?'
regex_api_oembeds = r'^https?://www\.data\.gouv\.fr/api/1/oembeds/'
regex_api_me = r'^https?://www\.data\.gouv\.fr/api/1/me/'
regex_api_site = r'^https?://www\.data\.gouv\.fr/api/1/site/'
regex_api_users = r'^https?://www\.data\.gouv\.fr/api/1/users/'
regex_api_transfer = r'^https?://www\.data\.gouv\.fr/api/1/transfer/'
regex_api_harvest = r'^https?://www\.data\.gouv\.fr/api/1/harvest/'


def _dataset_from_path(match, url):
    """Return ('slug_or_id', <captured path segment>) for a dataset page/API URL."""
    return ('slug_or_id', match.group(1))


def _dataset_from_for_param(match, url):
    """Return ('id', <first 'for' query value>) or None when the URL has no 'for' parameter."""
    params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    if 'for' in params:
        return ('id', params['for'][0])
    return None


def _compile_rules(rules):
    """Compile the pattern of each (pattern, extractor) rule, keeping the order."""
    return [(re.compile(pattern), extractor) for pattern, extractor in rules]


def _classify_url(url, rules):
    """Apply the first rule whose pattern matches `url`.

    Returns (handled, item): `handled` is False when no rule matches; `item` is
    the (kind, value) pair to record for the visit, or None when the URL is
    recognised but carries no dataset reference.
    """
    for pattern, extractor in rules:
        match = pattern.match(url)
        if match:
            if extractor is None:
                return True, None
            return True, extractor(match, url)
    return False, None


# Rules for actions of type 'action'. ORDER MATTERS: the first matching rule
# wins (e.g. the frequencies endpoint must be tested before the generic
# api/1/datasets/<id>/ pattern). An extractor of None means "recognised, ignore".
_ACTION_RULES = _compile_rules([
    (regex_datasets, _dataset_from_path),
    (regex_datasets_search, None),
    (regex_organizations, None),
    (regex_reuses, None),
    (regex_territory, None),
    (regex_topics, None),
    (regex_town, None),
    (regex_users, None),
    (regex_search, None),
    (regex_posts, None),
    (regex_faq, None),
    (regex_dashboard, None),
    (regex_apidoc, None),
    (regex_groups, None),
    (regex_reference, None),
    (regex_login, None),
    (regex_logout, None),
    (regex_register, None),
    (regex_api_spatial, None),
    (regex_api_discussions, _dataset_from_for_param),
    (regex_api_issues, _dataset_from_for_param),
    (regex_api_swagger, None),
    (regex_api_datasets_frequencies, None),
    (regex_api_reuses, None),
    (regex_api_metrics, None),
    (regex_api_activity, None),
    (regex_api_organizations, None),
    (regex_api_datasets, _dataset_from_path),
    (regex_api_datasets_search, None),
    (regex_api_community, _dataset_from_for_param),
    (regex_api_oembeds, None),
    (regex_api_me, None),
    (regex_api_site, None),
    (regex_api_users, None),
    (regex_api_transfer, None),
    (regex_api_harvest, None),
])

# Rules for actions of type 'goal' (same first-match-wins semantics).
_GOAL_RULES = _compile_rules([
    (regex_reuses, None),
    (regex_datasets, _dataset_from_path),
    (regex_api_reuses, None),
    (regex_login, None),
    (regex_register, None),
    (regex_api_organizations, None),
    (regex_api_transfer, None),
])

# visits[i] is the ordered list of (kind, value) items of one visit, where kind
# is 'keyword', 'slug_or_id' or 'id'. Actions that no rule recognises are
# collected in unhandled_actions for inspection.
unhandled_actions = []
visits = []
for idVisit, searches in searches_by_visit.items():
    visit = []
    for search in searches:
        visit.append(('keyword', search['keyword']))
        for action in search['actions']:
            action_type = action['type']
            if action_type == 'action':
                if action['url'] in other_pages:
                    continue
                rules = _ACTION_RULES
            elif action_type == 'goal':
                rules = _GOAL_RULES
            elif action_type in ('outlink', 'download'):
                # Outlinks and downloads are deliberately ignored.
                continue
            else:
                unhandled_actions.append(action)
                continue

            handled, item = _classify_url(action['url'], rules)
            if not handled:
                unhandled_actions.append(action)
            elif item is not None:
                visit.append(item)
    visits.append(visit)

In [32]:
# Number of actions that no URL pattern recognised.
len(unhandled_actions)

5428

In [33]:
# Inspect the first 100 unrecognised actions.
unhandled_actions[:100]

[{'date': '2017-03-28',
  'idVisit': '6624854',
  'type': 'action',
  'url': 'https://www.data.gouv.fr/api/1/discussions/58c916eb88ee382efe311032/?lang=fr&_=1490688951639'},
 {'date': '2017-03-28',
  'idVisit': '6624854',
  'type': 'action',
  'url': 'https://www.data.gouv.fr/api/1/discussions/58c916eb88ee382efe311032/'},
 {'date': '2017-04-26',
  'idVisit': '7194679',
  'type': 'action',
  'url': 'https://www.data.gouv.fr/datasets/extraction-finess-des-autorisations-d-activites-de-soins/'},
 {'date': '2017-04-14',
  'idVisit': '6958837',
  'type': 'action',
  'url': 'https://id.data.gouv.fr/password/reset/'},
 {'date': '2017-04-14',
  'idVisit': '6958837',
  'type': 'action',
  'url': 'https://id.data.gouv.fr/password/reset/done/'},
 {'date': '2017-04-14',
  'idVisit': '6958837',
  'type': 'action',
  'url': 'https://id.data.gouv.fr/password/reset/'},
 {'date': '2017-04-14',
  'idVisit': '6958837',
  'type': 'action',
  'url': 'https://id.data.gouv.fr/password/reset/done/'},
 {'date':

In [34]:
# Most frequent 40-character URL prefixes among the unrecognised actions.
url_prefixes = (entry['url'][:40] for entry in unhandled_actions)
c = Counter(url_prefixes)
c.most_common(10)


[('https://www.data.gouv.fr/api/1/datasets/', 1568),
 ('https://www.data.gouv.fr/api/1/discussio', 402),
 ('http://www.data.gouv.fr/fr/reference', 300),
 ('https://www.data.gouv.fr/api/1/tags/sugg', 176),
 ('http://www.data.gouv.fr/en/', 166),
 ('https://www.data.gouv.fr/en/datasets/', 165),
 ('https://id.data.gouv.fr/password/reset/', 141),
 ('https://www.data.gouv.fr/api/1/posts/58f', 138),
 ('https://www.data.gouv.fr/fr/terms/', 128),
 ('https://id.data.gouv.fr/password/reset/d', 120)]

In [35]:
# First 20 unrecognised URLs that contain the datasets API prefix.
[s['url'] for s in unhandled_actions if 'https://www.data.gouv.fr/api/1/datasets/' in s['url']][:20]

['https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/?lang=fr&_=1490465607190',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/1fd91594-f698-4c93-ad9d-f51a97722c9b/?lang=fr&_=1490465607208',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/?lang=fr&_=1490465607230',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/?lang=fr&_=1490465607239',
 'https://www.data.gouv.fr/api/1/datasets/community_resources/45a0fdca-b8fe-4b87-b19a-5b7776658ada/?lang=fr&_=1490465607253',
 'https://www.data.gouv.fr/api/1/datasets/559390f9c751df0139a453ba/followers/',
 'https://www.data.gouv.fr/api/1/datasets/58d13b9d88ee3801a925912d/followers/',
 'http

In [36]:
# Total number of (kind, value) items over all visits.
sum(map(len, visits))

1337203

## Deduplication + slug to id

In [13]:
# Within each visit, keep only the first occurrence of every (kind, value)
# pair, preserving the original order. Values are tracked per kind.
visits_dedup = []
for visit in visits:
    seen_by_kind = {'keyword': set(), 'id': set(), 'slug_or_id': set()}
    unique_items = []
    for kind, value in visit:
        seen_values = seen_by_kind[kind]
        if value in seen_values:
            continue
        seen_values.add(value)
        unique_items.append((kind, value))
    visits_dedup.append(unique_items)

In [40]:
# Resolve every dataset reference to a known dataset id.
#   visits_clean  - per visit, the keywords plus ('id', <dataset id>) items
#   id_ok         - references that were already a known id
#   slug_ok       - references resolved through slug2id
#   id_unknown    - unknown values given as ids, or shaped like 24 hex digits
#   slug_unknown  - every other unknown value
# NOTE(review): a dataset reached both by id and by slug within one visit can
# appear twice in visit_clean, since nothing here de-duplicates after the slug
# has been resolved — confirm whether that is acceptable downstream.
visits_clean = []
id_ok = 0
id_unknown = []
slug_ok = 0
slug_unknown = []
for visit_dedup in visits_dedup:
    visit_clean = []
    for kind, value in visit_dedup:
        if kind not in ('keyword', 'id', 'slug_or_id'):
            raise ValueError()

        if kind == 'keyword':
            visit_clean.append((kind, value))
            continue

        if value in datasets_set:
            # Known id, whichever way it was referenced.
            visit_clean.append(('id', value))
            id_ok += 1
        elif kind == 'id':
            id_unknown.append(value)
        elif value in slugs_set:
            dataset_id = slug2id[value]
            visit_clean.append(('id', dataset_id))
            slug_ok += 1
        elif re.match(r'^[0-9a-f]{24}$', value):
            id_unknown.append(value)
        else:
            slug_unknown.append(value)
    visits_clean.append(visit_clean)

In [42]:
# Save the cleaned visits as JSON (the (kind, value) tuples are written as JSON arrays).
with open('visits_clean.json', 'w') as f:
    json.dump(visits_clean, f)

In [16]:
# Resolved ids, unknown ids, resolved slugs, unknown slugs.
id_ok, len(id_unknown), slug_ok, len(slug_unknown)

(256543, 44895, 182979, 10194)

In [19]:
# Totals of resolved and unresolved dataset references, followed by the
# unresolved and resolved fractions of all references.
sum_ok = id_ok + slug_ok
sum_unknown = len(id_unknown) + len(slug_unknown)
sum_ok, sum_unknown, sum_unknown / (sum_ok + sum_unknown), sum_ok / (sum_ok + sum_unknown)

(439522, 55089, 0.11137843679174139, 0.8886215632082586)

$$\int_0^1{t_a^{x/2}dx}=0.8886215632082586$$

$$u = \frac{\ln(t_a)}{2}, \quad e^u-1 = 0.8886215632082586\,u$$

$$u = -0.2410056126332082, \quad t_a = e^{2u} = 0.6175401296732944$$

≈ 38% annual turnover ($1 - t_a \approx 0.38$)

In [None]:
# Inspect the first 100 unresolved ids.
id_unknown[:100]

In [None]:
# Inspect the first 100 unresolved slugs.
slug_unknown[:100]

In [None]:
# The 100 most frequent unresolved slugs.
c = Counter(slug_unknown)
c.most_common(100)

In [22]:
# Collect the URL of every action, of any type, that mentions 'datasets/badges'.
badges = [
    action['url']
    for searches in searches_by_visit.values()
    for search in searches
    for action in search['actions']
    if 'datasets/badges' in action['url']
]


In [23]:
# Number of badge URLs found, and the first 100 of them.
len(badges), badges[:100]

(1793,
 ['https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1490688825261',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1490688951628',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1486474702413',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1492069886967',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1492100991143',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1492101743871',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1484829591869',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1484829698456',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1486387066078',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1488648816488',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1488648858947',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=1490465607205',
  'https://www.data.gouv.fr/api/1/datasets/badges/?lang=fr&_=14938148