# Example: Using the Data Loading API

This API loads data from the latest [release](https://github.com/cliqz-oss/privacy-bot/releases) dump.

In [1]:
# Create a policies instance
from privacy_bot.analysis.policies_snapshot_api import Policies
from collections import Counter

# Running this cell printed the release details and the "Load ..." messages
# shown in the output below; presumably Policies() locates the latest release
# dump and loads it (reusing a cached copy when available) -- TODO confirm.
policies = Policies()

Get Latest Release
--------------------------------------------------------------------------------
Name: Raw privacy policies sample (Among top 1000)
Tag: 01-06-2017_v2_sample
Url: https://api.github.com/repos/cliqz-oss/privacy-bot/zipball/01-06-2017_v2_sample
--------------------------------------------------------------------------------
Load cached content
Load archive


### Policy object properties

In [2]:
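# policy.html            ---    policy content in HTML format
# policy.text            ---    policy content in text format
# policy.domain          ---    domain (e.g. 'google.de')
# policy.lang            ---    language code (e.g. 'de', 'en')
# policy.tld             ---    top-level domain (e.g. 'com', 'co.uk')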

### Iterate over all policies

In [3]:
# Tally the policies by language in a single pass over the whole collection
c = Counter(policy.lang for policy in policies)

print(c)

Counter({'de': 603, 'en': 124, 'bn': 10, '': 3, 'pl': 2, 'pt': 1, 'it': 1, 'ru': 1})


In [4]:
# The collection also exposes summary metadata about the loaded policies.
# Each attribute is read only when its line is printed, one section at a time.
separator = "-------------------------"
meta_sections = (
    ("DOMAINS: ", "domains"),
    ("TLDs: ", "tlds"),
    ("LANGUAGES: ", "languages"),
)

for position, (label, attribute) in enumerate(meta_sections):
    if position > 0:
        # divider between consecutive sections (none before the first)
        print(separator)
    print(label, getattr(policies, attribute))


DOMAINS:  {'barclaycard.de', 'serienjunkies.de', 'zentrum-der-gesundheit.de', 'fastenergy.de', 't-online.de', 'kn-online.de', 'xxxlshop.de', 'schuhcenter.de', 'blablacar.de', 'gamestar.de', 'tipp24.com', 'deutsche-bank.de', 'adidas.de', 'n24.de', 'kalaydo.de', 'whatsapp.com', 'youtube.com', 'eplus.de', 'momox.de', 'tvtoday.de/tv-programm', 'newtopia.de', 'klingel.de', 'de.sputniknews.com', 'toysrus.de', 'friendscout24.de', 'myfreefarm.de', 'easports.com', 'moviepilot.de', 'dhl.de/de.html', 'haz.de', 'pearl.de', 'de.silvergames.com', 'eurosport.de', 'pcwelt.de', 'bwin.com', 'preis24.de', 'nordbayern.de', 'ford.de', 'jochen-schweizer.de', 'kicker.de', 'csgolounge.com', 'hartziv.org', 'weltbild.de', 'clipfish.de', 'kika.de/index.html', 'ask.fm', 'office-discount.de', 'taz.de', 'giga.de', 'bravo.de', 'bbbank.de', 'ryanair.com', 'lolking.net', 'spotify.com/de', 'derstandard.at', 'uni-bonn.de', 'penny.de', 'gelbeseiten.de', 'google.at', 'stepstone.de', 'faz.net', 'pc-magazin.de', 'peek-clopp

### Iterate over policies in a specific language

In [5]:
# Tally top-level domains, restricted to the policies written in English
english_policies = policies.query(lang='en', domain=None, tld=None)
tlds = Counter(policy.tld for policy in english_policies)

print(tlds)

Counter({'com': 94, 'net': 10, 'org': 5, 'de': 3, 'eu': 3, 'tv': 2, 'am': 2, 'io': 1, 'fm': 1, 'co.uk': 1, 'ie': 1, 'me': 1})


The `query` method of `Policies` enables filtering by `domain`, `lang`, and `tld`.

### Policies of a company or a specific domain

In [6]:
# Querying by a company name lists every domain that has a matching policy
[policy.domain for policy in policies.query(domain='google')]

['google.at', 'google.ch', 'google.de', 'google.pl', 'google.ru']

In [7]:
# Pick one particular policy: take the first result the query yields
google_matches = policies.query(domain='google.de')
google = next(google_matches)



def fix_encoding(content):
    """Repair text whose UTF-8 bytes were mistakenly decoded as Latin-1.

    The string is re-encoded as Latin-1 to recover the raw bytes, which are
    then decoded as UTF-8.

    Args:
        content: The string to repair.

    Returns:
        The repaired string. If ``content`` cannot be round-tripped this way
        (it contains characters outside Latin-1, or the recovered bytes are
        not valid UTF-8), it is returned unchanged instead of raising.
    """
    try:
        return content.encode('latin-1').decode('utf-8')
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError: the text is
        # not Latin-1/UTF-8 mojibake (or was cut mid-character), so leave it.
        return content


# first 300 characters: repair the encoding first and slice afterwards, so the
# cut counts real characters and cannot land inside a multi-byte sequence
fix_encoding(google.text)[:300]

'Datenschutzerklärung – Datenschutzerklärung &amp; Nutzungsbedingungen –\nGoogle\n\n#\n\n##  Datenschutzerklärung &amp; Nutzungsbedingungen\n\nWeiter zum Inhalt\n\n  * Übersicht\n  * Datenschutzerklärung\n  * Nutzungsbedingungen\n  * Technologien und Prinzipien\n  * Häufig gestellte Fragen\n  * Mein Kont'