# Preprocessing privacy and opt-out policies

In [81]:
# general imports
import warnings
import nltk
import pandas as pd
import numpy as np
import scipy as sp
from time import time, sleep
import json
import requests
import random
import os
import matplotlib.pyplot as plt
import re
from xml.etree import cElementTree as ET
from sklearn.externals import joblib
# Clean and lemmatize text
from nltk.corpus import stopwords
from gensim.utils import smart_open, simple_preprocess
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
import html5lib
import re
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from pickle import dump,load
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

warnings.filterwarnings("ignore", category=DeprecationWarning)

random.seed(0)
%matplotlib inline

In [100]:
# Directory paths
pp_dir = '/Users/dbm/Downloads/OPP-115/sanitized_policies/'
oo_dir = '/Users/dbm/Downloads/OptOutChoice-2017_v1.0/SanitizedPrivacyPolicies/'
pp_xml = '/Users/dbm/Downloads/corpus/'

In [83]:
# Read in all story files
def ls_fullpath(directory):
    return [os.path.join(directory, f) for f in os.listdir(directory)]

In [110]:
def extract_co_name(file, pat='\d+_|\.com|\.edu|\.html|\.xml|www_|_co'):
    co_name = re.sub(pattern=pat, repl='', string=os.path.basename(file))
    return co_name

In [127]:
# Read in files
pp_files = ls_fullpath(directory=pp_dir)
oo_files = ls_fullpath(directory=oo_dir)
oo_files = [i for i in oo_files if '.DS_Store' not in i]
pp_xml_files = ls_fullpath(directory=pp_xml)
print(pp_xml_files)

['/Users/dbm/Downloads/corpus/www_google_co_nz.xml', '/Users/dbm/Downloads/corpus/ibnlive_in.xml', '/Users/dbm/Downloads/corpus/www_gocomics.xml', '/Users/dbm/Downloads/corpus/www_petsmart.xml', '/Users/dbm/Downloads/corpus/www_duolingo.xml', '/Users/dbm/Downloads/corpus/www_usda_gov.xml', '/Users/dbm/Downloads/corpus/www_fcbarcelona.xml', '/Users/dbm/Downloads/corpus/static_zara_net.xml', '/Users/dbm/Downloads/corpus/www_infowars.xml', '/Users/dbm/Downloads/corpus/www_change_org.xml', '/Users/dbm/Downloads/corpus/www_uefa.xml', '/Users/dbm/Downloads/corpus/www_washingtontimes.xml', '/Users/dbm/Downloads/corpus/english_alarabiya_net.xml', '/Users/dbm/Downloads/corpus/www_esquire.xml', '/Users/dbm/Downloads/corpus/www_netdoctor_co_uk.xml', '/Users/dbm/Downloads/corpus/gamespress.xml', '/Users/dbm/Downloads/corpus/www_flashscore.xml', '/Users/dbm/Downloads/corpus/www_abovetopsecret.xml', '/Users/dbm/Downloads/corpus/www_breitbart.xml', '/Users/dbm/Downloads/corpus/www_soccervista.xml', '

In [112]:
# Extract company names from filenames
co_names_pp = [extract_co_name(file=file) for file in pp_files]
co_names_oo = [extract_co_name(file=file) for file in oo_files]
co_names_ppxml = [extract_co_name(file=file) for file in pp_xml_files]
co_names = co_names_pp + co_names_oo
co_names_ppxml

['google_nz',
 'ibnlive_in',
 'gocomics',
 'petsmart',
 'duolingo',
 'usda_gov',
 'fcbarcelona',
 'static_zara_net',
 'infowars',
 'change_org',
 'uefa',
 'washingtontimes',
 'english_alarabiya_net',
 'esquire',
 'netdoctor_uk',
 'gamespress',
 'flashscore',
 'abovetopsecret',
 'breitbart',
 'soccervista',
 'chillingeffects_org',
 'accorhotels',
 'livingsocial',
 'cvs',
 'scout',
 'fotolia',
 'opentable',
 'king',
 'petmd',
 'about',
 'theverge',
 'microsoft',
 'drugstore',
 'saksfifthavenue',
 'oracle',
 'cars',
 'care',
 'aeropostale',
 'mozilla_org',
 'mayoclinic',
 'companieshouse_gov_uk',
 'chron',
 'dynamicdrive',
 'egotastic',
 'cambridge_org',
 'cms_gov',
 'curezone_org',
 'avg',
 'cbc_ca',
 'earthclinic',
 'weather',
 'pearsoned',
 'utm_utoronto_ca',
 'couchsurfing_org',
 'wikia',
 'travelzoo',
 'linkedin',
 'mtgox',
 'biblehub',
 'bounty',
 'staples',
 'edmunds',
 'nlm_nih_gov',
 'pro_bitcoincharts',
 'bitpay',
 'freelancer',
 'restaurant',
 'ford',
 'wwwepa_gov',
 'gm',
 'fa

In [132]:
# parse each xml file and generate a df
import xml.etree.ElementTree as etree
def xml_to_df(file, doc_type = 'pp'):
#     print(file)
    e = etree.parse(file)
    text = [e.findall('.//SUBTEXT')[i].text for i in range(len(e.findall('.//SECTION')))]
    sec_title = [e.findall('.//SUBTITLE')[i].text for i in range(len(e.findall('.//SECTION')))]
    policy_doc = {'file':file, 'text': text, 'title': sec_title}    
    df = pd.DataFrame({'file':file, 'section':sec_title, 'text': text, 'type':doc_type})
    return df


In [134]:
# Process xml privacy files
pp_xml_docs = [ ]
for file in pp_xml_files:
    parsed_file = xml_to_df(file, doc_type = '')
    pp_xml_docs.append(parsed_file)

pp_xml_df = pd.concat(pp_xml_docs, axis = 0)    
# pp_xml_docs = pd.concat(pp_xml_df)
pp_xml_df = pp_xml_df.reset_index()
pp_xml_df.head(15)
dump(df, open("/Users/dbm/Documents/Insight S19/data/acl_privacy_policy.pkl", "wb"))

In [90]:
# Read in files and do preliminary preprocessing
def read_html(file, pat = '[^a-zA-z0-9.?!/ ]+', filt_len=6, doc_type='pp'):
    html_file = open(file, 'r', errors='ignore')
    source_code = html_file.read()
    soup = BeautifulSoup(source_code, 'html.parser')
    tmp = sent_tokenize(''.join(soup.findAll(text=True)))
    tmp_sent = [re.sub(pat, '', i).rstrip() for i in tmp]
    tmp_sent = [
        re.sub(pattern='[ \t]{2,}', repl=' ', string=i) for i in tmp_sent
        if len(i) > filt_len
    ]
    txt = ' '.join(tmp_sent)
#     named_entities.append(get_named_entities(txt))
    try:
        year = re.search(string=txt, pattern='20\d{2}').group()
    except AttributeError:
        year = ''  # apply your error handling
        type(year)
    data = {'file': file, 'year': year, 'doc_type': doc_type, 'text': txt}
    return data

In [27]:
# load spacy pretrained model for Named Entity Recognition
# nlp = spacy.load('en_core_web_sm')
# named_entities = []
# def get_named_entities(text):
#     doc = nlp(text)
#     # Get organization name from text
#     org_names = np.unique([ent.text for ent in doc.ents if ent.label_ == 'ORG'])
#     return org_names

In [88]:
# tokenize, lower case, and lemmatize words
def tokenize(series, stop_words, frequent_words):
    return (
        series
        .apply(lambda x: simple_preprocess(x))
        .apply(lambda tokens: [token for token in tokens if token not in stop_words])
        .apply(lambda tokens: [token for token in tokens if token not in frequent_words])
#         .apply(lambda tokens: [token for token in tokens if token not in get_named_entities(tokens)])
        .apply(lambda tokens: [wnl.lemmatize(token) for token in tokens])
        .apply(lambda tokens: [token for token in tokens if len(token) > 3])
    )

In [91]:
%%time
pp = [read_html(file=file, doc_type='pp') for file in pp_files]
oo = [read_html(file=file, doc_type='oo') for file in oo_files]

print(
    f'No. of Privacy policy Documents: {len(pp)} \nNo. of Opt out policy Documents: {len(oo)}'
)

No. of Privacy policy Documents: 115 
No. of Opt out policy Documents: 114
CPU times: user 2.4 s, sys: 47.7 ms, total: 2.45 s
Wall time: 2.53 s


In [92]:
# Combine data into dataframe
df = pd.DataFrame(pp+oo)
df.head(1)

Unnamed: 0,doc_type,file,text,year
0,pp,/Users/dbm/Downloads/OPP-115/sanitized_policie...,Privacy Policy Last Modified March 25 2013 Thi...,2013


In [36]:
%%time
# named_entities = [get_named_entities(i) for i in df['text']]
# named_entities = [i.split()for i in named_entities]

CPU times: user 4min 30s, sys: 35.4 s, total: 5min 6s
Wall time: 1min 18s


In [93]:
# Save dataset
dump(df, open("/Users/dbm/Documents/Insight S19/data/privacy_optout_policy_1.pkl", "wb"))

In [94]:
# Words to filter
stop_words = set(stopwords.words('english'))
frequent_words = [
    'privacy', 'policy', 'andor', 'terms', 'service', 
    'please', 'valve','jibjab', 'steam', 'microsoft'
]
frequent_words.append(co_names)
# Initialize lemmatizer
wnl = WordNetLemmatizer()

In [95]:
%%time
# Clean, lemmatize, and tokenize text
data_words = tokenize(df['text'], stop_words=stop_words,
                      frequent_words=frequent_words)
data_words = [' '.join(word) for word in data_words]
print(f'type: {type(data_words)}, len: {len(data_words)}')

type: <class 'list'>, len: 229
CPU times: user 4.03 s, sys: 108 ms, total: 4.13 s
Wall time: 4.17 s


In [96]:
data_words[0]



In [97]:
## Save dataset
dump(data_words, open("/Users/dbm/Documents/Insight S19/data/privacy_optout_policy_cleaned_1.pkl", "wb"))

In [136]:
##### References
- Inproceedings (ramanath:2014,
    author = {Rohan Ramanath and Fei Liu and Norman Sadeh and Noah Smith},
    booktitle = {Proceedings of ACL},
    month = {June},
    publisher = {Association for Computational Linguistics},
    title = {Unsupervised Alignment of Privacy Policies using Hidden Markov Models},
    year = {2014})




SyntaxError: invalid syntax (<ipython-input-136-a678a33de3c7>, line 2)