In [34]:
import json
import glob
from collections import Counter, defaultdict
import urllib.parse

# Example: load a single file

In [4]:
# Sort the matches: glob.glob() returns paths in arbitrary order, so without
# sorting, `filenames[0]` and the order in which files are processed could
# differ between runs or machines.
filenames = sorted(glob.glob('201*-*-*.json'))

In [5]:
# Use the first matching file as the sample for the examples in this notebook.
# Raises IndexError if the glob pattern matched no file.
filename = filenames[0]

In [7]:
# Load the sample file into `data`.
# The encoding is passed explicitly: open() otherwise falls back to the
# platform's locale encoding, and the data contains non-ASCII text (accented
# search keywords). NOTE(review): assumes the files are UTF-8, as is standard
# for JSON -- confirm against how they were exported.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)


# Example: search keywords per record, in timestamp order

In [11]:
# For every record in `data`, keep only its 'search' actions, order them by
# timestamp and retain the keyword of each one. The result has one list of
# keywords per record (empty when the record contains no search).
searches_by_action = [
    [
        hit['siteSearchKeyword']
        for hit in sorted(
            (entry for entry in record['actionDetails'] if entry['type'] == 'search'),
            key=lambda entry: entry['timestamp'],
        )
    ]
    for record in data
]

In [12]:
# Display the keyword lists (one inner list per record of `data`).
searches_by_action

[[],
 [],
 ['CA LYON RG : 2003-05004',
  'CA LYON RG : 2003-05004',
  'CA LYON RG : 2003-05004',
  'CA LYON RG : 2003-05004',
  'ch',
  'ch',
  'ch',
  'ch',
  'chr',
  'chr',
  'chr',
  'chr',
  'chris',
  'chris',
  'chris',
  'chris',
  'christ',
  'christ',
  'christ',
  'christ',
  'christop',
  'christop',
  'christop',
  'christop',
  'christophe',
  'christophe',
  'christophe',
  'christophe',
  'elec',
  'elec',
  'elec',
  'elec',
  'electio',
  'electio',
  'electio',
  'electio',
  'election',
  'election',
  'election',
  'election',
  'election p',
  'election p',
  'election p',
  'election p',
  'election pre',
  'election pre',
  'election pre',
  'election pre',
  'election presidentielle',
  'election presidentielle',
  'election presidentielle',
  'election presidentielle',
  'ele',
  'ele',
  'ele',
  'ele',
  'elec',
  'elec',
  'elec',
  'elec',
  'election p',
  'election p',
  'election p',
  'election p',
  'election pré',
  'election pré',
  'election pré',


# Example: fields available for each action type

In [15]:
# Map each action type to the union of the keys seen on actions of that type.
fields_by_type = defaultdict(set)
for record in data:
    for entry in record['actionDetails']:
        # Updating a set from a dict adds the dict's keys.
        fields_by_type[entry['type']].update(entry)
fields_by_type

defaultdict(set,
            {'action': {'generationTime',
              'icon',
              'pageId',
              'pageIdAction',
              'pageTitle',
              'serverTimePretty',
              'timeSpent',
              'timeSpentPretty',
              'timestamp',
              'type',
              'url'},
             'download': {'icon',
              'pageId',
              'pageIdAction',
              'pageTitle',
              'serverTimePretty',
              'timeSpent',
              'timeSpentPretty',
              'timestamp',
              'type',
              'url'},
             'goal': {'goalId',
              'goalName',
              'goalPageId',
              'icon',
              'revenue',
              'serverTimePretty',
              'timestamp',
              'type',
              'url'},
             'outlink': {'icon',
              'pageId',
              'pageIdAction',
              'pageTitle',
              'serverTimePretty',
       

# Features used for search

In [54]:
# Marker identifying URLs of the dataset listing page that carry a query string.
substring = 'www.data.gouv.fr/fr/datasets/?'

# Collect, across every file, the URL of each 'action'-type action whose URL
# contains `substring`.
searches = []
for filename in filenames:
    # Explicit encoding: open() otherwise uses the platform's locale encoding,
    # which can fail or corrupt the non-ASCII text present in the data.
    # NOTE(review): assumes the files are UTF-8 -- confirm against the export.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print('.', end='')  # one progress dot per file loaded

    # `filename` and `data` stay bound at notebook level: after the loop they
    # refer to the last file processed, which later cells may rely on.
    for record in data:
        searches.extend(
            action['url']
            for action in record['actionDetails']
            if action['type'] == 'action' and substring in action['url']
        )

.......................................................................................................................................................................................

In [55]:
# Total number of matching URLs collected across all files.
len(searches)

137885

In [59]:
# Peek at the first ten collected URLs.
searches[:10]

['http://www.data.gouv.fr/fr/datasets/?page=2',
 'http://www.data.gouv.fr/fr/datasets/?page=3',
 'http://www.data.gouv.fr/fr/datasets/?page=4',
 'http://www.data.gouv.fr/fr/datasets/?page=5',
 'http://www.data.gouv.fr/fr/datasets/?page=6',
 'http://www.data.gouv.fr/fr/datasets/?page=7',
 'http://www.data.gouv.fr/fr/datasets/?page=8',
 'http://www.data.gouv.fr/fr/datasets/?page=3',
 'http://www.data.gouv.fr/fr/datasets/?page=2',
 'https://www.data.gouv.fr/fr/datasets/?sort=-created&organization=534fff8ba3a7292c64a77ed4']

In [56]:
# Count, for every query-string parameter name, the number of collected URLs
# in which it appears. parse_qs returns a dict keyed by parameter name, so a
# parameter repeated within one URL is counted once for that URL.
feature_count = defaultdict(int)
for url in searches:
    query_string = urllib.parse.urlparse(url).query
    for parameter in urllib.parse.parse_qs(query_string):
        feature_count[parameter] += 1

In [57]:
def print_feature_usage(feature_count):
    """Print one ``<count><TAB><feature name>`` line per feature.

    Lines are ordered by decreasing count; features with equal counts keep
    the order in which they appear in ``feature_count``.

    Args:
        feature_count: mapping from feature name to its numeric count.
    """
    # reverse=True keeps the sort stable, so ties stay in their original order.
    ranking = sorted(feature_count.items(), key=lambda pair: pair[1], reverse=True)
    for name, occurrences in ranking:
        print(occurrences, name, sep='\t')

In [58]:
# Show how many collected URLs use each query-string parameter, most used first.
print_feature_usage(feature_count)

66751	page
47193	sort
44803	organization
23953	tag
18371	format
15840	geozone
7964	granularity
3682	license
2500	reuses
1300	badge
1073	temporal_coverage
198	next
82	utm_content
34	org
12	territory
10	supplier
3	q
3	id
1	sap
1	cmpid
1	slug
1	h
1	d
1	lang
1	graselogernularity
1	ds
1	page_size
1	coid
1	mid
1	pr
1	v
1	a
1	pid


In [61]:
# Inspect the first 50 raw records.
# NOTE(review): `data` is rebound by the loop over `filenames`, so this
# presumably shows the last file loaded there rather than the sample file
# from the start of the notebook -- confirm with a fresh Restart & Run All.
data[:50]

[{'actionDetails': [{'icon': None,
    'pageId': '43965477',
    'pageIdAction': '483889',
    'pageTitle': 'Me%20/%20Reuses',
    'serverTimePretty': 'Feb 16, 2017 20:21:04',
    'timeSpent': '0',
    'timeSpentPretty': '0s',
    'timestamp': 1487276464,
    'type': 'action',
    'url': 'https://www.data.gouv.fr/api/1/me/reuses/'},
   {'icon': None,
    'pageId': '43965478',
    'pageIdAction': '531718',
    'pageTitle': 'Spatial%20/%20Zones',
    'serverTimePretty': 'Feb 16, 2017 20:21:04',
    'timeSpent': '0',
    'timeSpentPretty': '0s',
    'timestamp': 1487276464,
    'type': 'action',
    'url': 'http://www.data.gouv.fr/api/1/spatial/zones/fr/county/74'},
   {'icon': None,
    'pageId': '43965479',
    'pageIdAction': '4524934',
    'pageTitle': 'Discussions',
    'serverTimePretty': 'Feb 16, 2017 20:21:04',
    'timeSpent': '11',
    'timeSpentPretty': '11s',
    'timestamp': 1487276464,
    'type': 'action',
    'url': 'https://www.data.gouv.fr/api/1/discussions/?for=583d971d