# Preprocessing privacy and opt-out policies

In [81]:
# general imports
import warnings
import nltk
import pandas as pd
import numpy as np
import scipy as sp
from time import time, sleep
import json
import requests
import random
import os
import matplotlib.pyplot as plt
import re
from xml.etree import cElementTree as ET
from sklearn.externals import joblib
# Clean and lemmatize text
from nltk.corpus import stopwords
from gensim.utils import smart_open, simple_preprocess
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
import html5lib
import re
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from pickle import dump,load
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

warnings.filterwarnings("ignore", category=DeprecationWarning)

random.seed(0)
%matplotlib inline

In [100]:
# Directory paths
# NOTE: these are absolute paths on the original author's machine; adjust before running elsewhere.
# pp_dir -> OPP-115 sanitized privacy policies, oo_dir -> OptOutChoice-2017 sanitized policies
pp_dir = '/Users/dbm/Downloads/OPP-115/sanitized_policies/'
oo_dir = '/Users/dbm/Downloads/OptOutChoice-2017_v1.0/SanitizedPrivacyPolicies/'

In [83]:
# List every entry of a directory as a full path
def ls_fullpath(directory):
    """Return the entries of *directory*, each joined onto *directory*.

    The names come from ``os.listdir``, so every kind of entry is included
    (files, sub-directories, hidden files) and the order is whatever
    ``os.listdir`` reports.
    """
    full_paths = []
    for entry in os.listdir(directory):
        full_paths.append(os.path.join(directory, entry))
    return full_paths

In [198]:
def extract_co_name(file, pat=r'\d+_|\.com|\.edu|\.html|\.xml|www_|_co'):
    """Derive a company/site name from a policy file path.

    Only the base name of *file* is used; every match of the regular
    expression *pat* is deleted from it and the remainder is returned.

    Args:
        file: Path (or bare file name) of a policy document.
        pat: Regular expression whose matches are removed. The default strips
            a leading numeric id (``123_``), ``www_``, ``_co`` and the
            ``.com`` / ``.edu`` / ``.html`` / ``.xml`` suffixes.

    Returns:
        The base name with all matches of *pat* removed.
    """
    # The default pattern is a raw string so that ``\d`` and ``\.`` reach the
    # regex engine without relying on Python passing unknown escapes through.
    # NOTE(review): ``_co`` matches anywhere in the name, not only in
    # ``_co_<tld>`` (e.g. ``www_google_co_nz.xml`` -> ``google_nz``).
    return re.sub(pattern=pat, repl='', string=os.path.basename(file))

In [183]:
# Read in files: list the privacy-policy (pp) and opt-out (oo) documents
pp_files = ls_fullpath(directory=pp_dir)
oo_files = ls_fullpath(directory=oo_dir)
# Drop any path containing '.DS_Store' from the opt-out listing
# NOTE(review): pp_files is not filtered the same way — confirm pp_dir holds no .DS_Store
oo_files = [i for i in oo_files if '.DS_Store' not in i]
# print(pp_xml_files)

In [137]:
# Extract company names from filenames
# NOTE: pp_xml_files is defined in a later cell ("ACL policy documents");
# that cell must be run before this one.
co_names_pp = [extract_co_name(file=file) for file in pp_files]
co_names_oo = [extract_co_name(file=file) for file in oo_files]
co_names_ppxml = [extract_co_name(file=file) for file in pp_xml_files]
# Single combined list; later appended to frequent_words
co_names = co_names_pp + co_names_oo + co_names_ppxml

In [360]:
# Scratch cell: runs the same steps as xml_to_df_nosec() on the first XML file.
# NOTE: depends on names created in later cells — `etree` (imported in the
# xml_to_df cell) and `pp_xml_files` (ACL section) — so it only works after
# those cells have been run.
file = pp_xml_files[0]
doc_type = 'pp'
# df ={}
e = etree.parse(file)
text = [e.findall('.//SUBTEXT')[i].text for i in range(len(e.findall('.//SECTION')))]
text = [i for i in text if i is not None]
# Drop the first sentence of the first section, then glue all sections together
tmp = sent_tokenize(text[0])
tmp = ' '.join(tmp[1:])
tmp1 = [tmp] + text[1:]
' '.join(tmp1)

# sec_title = [e.findall('.//SUBTITLE')[i].text for i in range(len(e.findall('.//SECTION')))]
#     policy_doc = {'file':file, 'text': text, 'title': sec_title}    
# df = {'file':file, 'text': text, 'type':doc_type}
# df = pd.DataFrame(df)
# df = pd.DataFrame({'file':file, 'section':sec_title, 'text': text, 'type':doc_type})
# df
# text
# file
# pd.DataFrame.from_dict(df)

"When you share information with us, for example by creating a Google Account, we can make those services even better – to show you more relevant search results and ads, to help you connect with people or to make sharing with others quicker and easier. As you use our services, we want you to be clear how we're using information and the ways in which you can protect your privacy. Our Privacy Policy explains:\n\nWhat information we collect and why we collect it. How we use that information. The choices we offer, including how to access and update information. We've tried to keep it as simple as possible, but if you're not familiar with terms like cookies, IP addresses, pixel tags and browsers, then read about these key terms first. Your privacy matters to Google so whether you are new to Google or a long-time user, please do take the time to get to know our practices – and if you have any questions consult this page. We collect information to provide better services to all of our users –

In [361]:
# parse each xml file and generate a df
import xml.etree.ElementTree as etree
def xml_to_df(file, doc_type='pp'):
    """Parse one policy XML file into a DataFrame with one row per SECTION.

    Each ``SECTION`` element contributes the text of its first ``SUBTITLE``
    and first ``SUBTEXT`` descendant. Title and text are read from the same
    section, so a section that lacks one of the two elements yields ``None``
    for that cell instead of raising ``IndexError`` or shifting the
    remaining titles against the wrong text.

    Args:
        file: Path of the XML document; also stored in the ``file`` column.
        doc_type: Label stored in the ``type`` column of every row.

    Returns:
        ``pandas.DataFrame`` with columns ``file``, ``section``, ``text`` and
        ``type``. ``section`` / ``text`` are ``None`` where the element is
        missing or has no text.
    """
    tree = etree.parse(file)
    titles = []
    texts = []
    # Walk the sections once rather than re-running findall() per index.
    for section in tree.iterfind('.//SECTION'):
        title_el = section.find('.//SUBTITLE')
        text_el = section.find('.//SUBTEXT')
        titles.append(None if title_el is None else title_el.text)
        texts.append(None if text_el is None else text_el.text)
    # `file` and `doc_type` are scalars and get broadcast to every row.
    return pd.DataFrame(
        {'file': file, 'section': titles, 'text': texts, 'type': doc_type}
    )

def xml_to_df_nosec(file, doc_type='pp'):
    """Parse one policy XML file into a single whole-document record.

    The text of the first ``SUBTEXT`` of every ``SECTION`` is collected
    (sections without text are skipped), the first sentence of the first
    section is dropped, and everything is joined with single spaces.

    Args:
        file: Path of the XML document; also stored under ``'file'``.
        doc_type: Label stored under ``'type'``.

    Returns:
        ``dict`` with keys ``'file'``, ``'text'`` and ``'type'``. ``'text'``
        is an empty string when the document has no section text at all
        (previously this case raised ``IndexError``).
    """
    tree = etree.parse(file)
    texts = []
    # Walk the sections once rather than re-running findall() per index.
    for section in tree.iterfind('.//SECTION'):
        text_el = section.find('.//SUBTEXT')
        if text_el is not None and text_el.text is not None:
            texts.append(text_el.text)

    if not texts:
        return {'file': file, 'text': '', 'type': doc_type}

    # Filter out the first sentence of the first section, which (per the
    # original author's note) usually holds dates and other header info.
    first_remainder = ' '.join(sent_tokenize(texts[0])[1:])
    full_text = ' '.join([first_remainder] + texts[1:])
    return {'file': file, 'text': full_text, 'type': doc_type}

In [240]:
# Read in files and do preliminary preprocessing
def read_html(file, pat = '[^a-zA-z0-9.?!/ ]+', filt_len=6, doc_type='pp'):
    """Read one HTML policy file and return its cleaned text as a record.

    The visible text nodes are concatenated and split into sentences. Every
    match of *pat* is deleted from each sentence, sentences that are not
    longer than *filt_len* after cleaning are dropped, runs of spaces/tabs
    are collapsed, and the rest is joined with single spaces.

    Args:
        file: Path of the HTML document; also stored under ``'file'``.
        pat: Regular expression of characters to delete from each sentence.
        filt_len: Cleaned sentences must be longer than this many characters
            to be kept.
        doc_type: Label stored under ``'doc_type'``.

    Returns:
        ``dict`` with keys ``'file'``, ``'year'``, ``'doc_type'`` and
        ``'text'``. ``'year'`` is the first ``20xx`` substring found in the
        cleaned text, or ``''`` when there is none.
    """
    # NOTE(review): the ``A-z`` range in the default `pat` also admits the
    # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `); ``A-Z`` was probably
    # intended. Left unchanged because it is a caller-visible default.

    # errors='ignore' silently drops bytes that cannot be decoded; the context
    # manager makes sure the handle is closed once the content is read.
    with open(file, 'r', errors='ignore') as html_file:
        source_code = html_file.read()

    soup = BeautifulSoup(source_code, 'html.parser')
    sentences = sent_tokenize(''.join(soup.findAll(text=True)))
    cleaned = [re.sub(pat, '', sentence).rstrip() for sentence in sentences]
    kept = [
        re.sub(pattern=r'[ \t]{2,}', repl=' ', string=sentence)
        for sentence in cleaned
        if len(sentence) > filt_len
    ]
    txt = ' '.join(kept)

    # First 20xx-looking number in the text, used as the document year.
    year_match = re.search(pattern=r'20\d{2}', string=txt)
    year = year_match.group() if year_match else ''
    return {'file': file, 'year': year, 'doc_type': doc_type, 'text': txt}

In [27]:
# load spacy pretrained model for Named Entity Recognition
# nlp = spacy.load('en_core_web_sm')
# named_entities = []
# def get_named_entities(text):
#     doc = nlp(text)
#     # Get organization name from text
#     org_names = np.unique([ent.text for ent in doc.ents if ent.label_ == 'ORG'])
#     return org_names

In [88]:
# tokenize, lower case, and lemmatize words
def tokenize(series, stop_words, frequent_words):
    """Tokenize, filter and lemmatize every document in a pandas Series.

    Each document is tokenized with gensim's ``simple_preprocess``, tokens
    found in *stop_words* or *frequent_words* are removed, the survivors are
    lemmatized with the module-level ``wnl`` lemmatizer, and lemmas of three
    characters or fewer are dropped.

    Args:
        series: ``pandas.Series`` of raw document strings.
        stop_words: Iterable of tokens to remove.
        frequent_words: Iterable of additional tokens to remove.

    Returns:
        ``pandas.Series`` with the same index whose values are lists of
        lemmas.
    """
    # One set built up front: membership tests are O(1) per token instead of
    # scanning the `frequent_words` list for every token.
    excluded = set(stop_words).union(frequent_words)

    def _clean(document):
        lemmas = (
            wnl.lemmatize(token)
            for token in simple_preprocess(document)
            if token not in excluded
        )
        # The length filter is applied to the lemma, not the raw token.
        return [lemma for lemma in lemmas if len(lemma) > 3]

    return series.apply(_clean)

In [91]:
%%time
# Parse every HTML policy into a record dict (file, year, doc_type, text),
# tagging privacy policies as 'pp' and opt-out policies as 'oo'
pp = [read_html(file=file, doc_type='pp') for file in pp_files]
oo = [read_html(file=file, doc_type='oo') for file in oo_files]

print(
    f'No. of Privacy policy Documents: {len(pp)} \nNo. of Opt out policy Documents: {len(oo)}'
)

No. of Privacy policy Documents: 115 
No. of Opt out policy Documents: 114
CPU times: user 2.4 s, sys: 47.7 ms, total: 2.45 s
Wall time: 2.53 s


In [92]:
# Combine data into dataframe: one row per document, privacy policies first, then opt-out
df = pd.DataFrame(pp+oo)
df.head(1)

Unnamed: 0,doc_type,file,text,year
0,pp,/Users/dbm/Downloads/OPP-115/sanitized_policie...,Privacy Policy Last Modified March 25 2013 Thi...,2013


In [36]:
%%time
# named_entities = [get_named_entities(i) for i in df['text']]
# named_entities = [i.split()for i in named_entities]

CPU times: user 4min 30s, sys: 35.4 s, total: 5min 6s
Wall time: 1min 18s


In [93]:
# Save dataset
# The context manager flushes and closes the pickle file even if dump() raises
with open("/Users/dbm/Documents/Insight S19/data/privacy_optout_policy.pkl", "wb") as fh:
    dump(df, fh)
# dump(df, open("/Users/dbm/Documents/Insight S19/data/privacy_optout_policy_1.pkl", "wb"))

In [196]:
# Words to filter
# NLTK's English stop-word list, held as a set
stop_words = set(stopwords.words('english'))
# Hand-picked terms to drop in addition to the stop words
frequent_words = [
    'privacy', 'profile', 'policy', 'andor', 'terms', 'service', 
    'please', 'valve','jibjab', 'steam', 'microsoft'
]
# Also drop the company names extracted from the file names
frequent_words = frequent_words + co_names
# Initialize lemmatizer (tokenize() reads this module-level `wnl`)
wnl = WordNetLemmatizer()

In [231]:
%%time
# Clean, lemmatize, and tokenize text
data_words = tokenize(df['text'], stop_words=stop_words,
                      frequent_words=frequent_words)
# Re-join each token list into one space-separated string per document
data_words = [' '.join(word) for word in data_words]
print(f'type: {type(data_words)}, len: {len(data_words)}')

type: <class 'list'>, len: 229
CPU times: user 17.7 s, sys: 32.6 ms, total: 17.7 s
Wall time: 17.7 s


In [232]:
data_words[0]



## ACL policy documents

In [None]:
# Process xml privacy files
# NOTE: absolute path on the original author's machine; adjust before running elsewhere.
pp_xml = '/Users/dbm/Downloads/corpus/'
pp_xml_files = ls_fullpath(directory=pp_xml)
# Drop any path containing '.DS_Store'
pp_xml_files = [i for i in pp_xml_files if '.DS_Store' not in i]

In [299]:
# Parses out the xml and divides the document into sections
pp_xml_docs = [xml_to_df(file, doc_type='pp') for file in pp_xml_files]

# Convert to dataframe (row labels restart at 0 for every file at this point)
pp_xml_df = pd.concat(pp_xml_docs, axis=0)

# Report column types and how many values are missing per column.
# info() prints by itself and returns None, so it is not wrapped in print().
pp_xml_df.info()
print(pp_xml_df.isna().sum())

# Replace the per-file row labels with one continuous 0..n-1 index
pp_xml_df = pp_xml_df.reset_index(drop=True)
pp_xml_df.head(2)



<class 'pandas.core.frame.DataFrame'>
Int64Index: 10538 entries, 0 to 4
Data columns (total 4 columns):
file       10538 non-null object
section    9622 non-null object
text       10501 non-null object
type       10538 non-null object
dtypes: object(4)
memory usage: 411.6+ KB
None
file         0
section    916
text        37
type         0
dtype: int64


Unnamed: 0,file,section,text,type
0,/Users/dbm/Downloads/corpus/www_google_co_nz.xml,,"Privacy Policy\n\nLast modified: December 20, ...",pp
1,/Users/dbm/Downloads/corpus/www_google_co_nz.xml,Information we collect,We collect information to provide better servi...,pp


In [364]:
# Parses the document and retains the whole document agnostic to sections
pp_xml_full_docs = [xml_to_df_nosec(file, doc_type = 'pp') for file in pp_xml_files]
pp_xml_full_df = pd.DataFrame(pp_xml_full_docs)
# The context manager flushes and closes the pickle file even if dump() raises
with open("/Users/dbm/Documents/Insight S19/data/acl_privacy_policy_full_docs.pkl", "wb") as fh:
    dump(pp_xml_full_df, fh)
pp_xml_full_df.head(2)

Unnamed: 0,file,text,type
0,/Users/dbm/Downloads/corpus/www_google_co_nz.xml,"When you share information with us, for exampl...",pp
1,/Users/dbm/Downloads/corpus/ibnlive_in.xml,Web18 recognizes the importance of protecting...,pp


In [365]:
# Clean, tokenize text of the whole-document ACL records
pp_full_doc_words = tokenize(pp_xml_full_df['text'], stop_words=stop_words,
                      frequent_words=frequent_words)

# Re-join each token list into one space-separated string per document
pp_full_doc_words = [' '.join(word) for word in pp_full_doc_words]
pp_full_doc_words
# print(f'type: {type(pp_full_doc_words)}, len: {len(pp_full_doc_words)}')
# dump(pp_data_words, open("/Users/dbm/Documents/Insight S19/data/acl_privacy_policy_full_doc_words.pkl", "wb"))

['share information example creating account make service even better show relevant search result help connect people make sharing others quicker easier service want clear using information protect explains information collect collect information choice offer including access update information tried keep simple possible familiar like cooky address pixel browser read first matter whether long time user take time know practice question consult page collect information provide better service user figuring basic stuff like language speak thing like find useful people matter online collect information information give example many service require sign account personal information like name email address telephone number credit card want take full advantage sharing feature offer might also create publicly visible include name photo information service collect information service like visit website advertising service view interact content information includes device information collect devi

In [278]:
# Count the tokens of the first row's text that are not stop words, after deleting every character that is not a letter or a space
len([word for word in word_tokenize(re.sub('[^a-zA-Z ]+','',pp_xml_df['text'][0])) if word not in stop_words])

106

In [257]:
# Filter out rows with missing text
# pp_xml_df[pp_xml_df['text'].isnull()]
pp_xml_df = pp_xml_df[pp_xml_df['text'].notnull()]
# The context manager flushes and closes the pickle file even if dump() raises
with open("/Users/dbm/Documents/Insight S19/data/acl_privacy_policy.pkl", "wb") as fh:
    dump(pp_xml_df, fh)

In [261]:
print(pp_xml_df.shape)
# Keep the rows that have a section title (the titled policy sections)
pp_df_text = pp_xml_df[pp_xml_df['section'].notnull()]
print(pp_df_text.shape)

# Keep the rows without a section title separately
# (presumably the policy name / header block of each document — verify against the data)
pp_df_name = pp_xml_df[pp_xml_df['section'].isnull()]
print(pp_df_name.shape)

(10501, 5)
(9585, 5)
(916, 5)


In [262]:
pp_df_text.head(2)

Unnamed: 0,index,file,section,text,type
1,1,/Users/dbm/Downloads/corpus/www_google_co_nz.xml,Information we collect,We collect information to provide better servi...,pp
2,2,/Users/dbm/Downloads/corpus/www_google_co_nz.xml,How we use information we collect,We use the information we collect from all of ...,pp


In [263]:
## Save dataset
# Context managers flush and close each pickle file even if dump() raises
with open("/Users/dbm/Documents/Insight S19/data/privacy_policy_acl_text.pkl", "wb") as fh:
    dump(pp_df_text, fh)
with open("/Users/dbm/Documents/Insight S19/data/privacy_policy_acl_title.pkl", "wb") as fh:
    dump(pp_df_name, fh)
# dump(data_words, open("/Users/dbm/Documents/Insight S19/data/privacy_optout_policy_cleaned_1.pkl", "wb"))

In [266]:
# Extend the filter list with the distinct company names of the ACL files
co_names_acl = pp_df_text['file'].apply(extract_co_name).unique().tolist()
# Merge and de-duplicate (the set round-trip does not keep any order)
frequent_words = list(set(frequent_words + co_names_acl))
print(len(frequent_words))
frequent_words

1122


['spanishdict',
 'irishtimes',
 'legal_us_ubm',
 'abcnews.go',
 'easybib',
 'dictionary.reference',
 'scrippsnetworksinteractive',
 'tinyurl',
 'gap',
 'gamespress',
 'cryptsy',
 'help_ladbrokes',
 'oxforddictionaries',
 'bhphotovideo',
 'miniclip',
 'neopets',
 'petfinder',
 'business-standard',
 'security_wmtransfer',
 'utexas_edu',
 'statcounter',
 'livejournal',
 'travelocity',
 '6pm',
 'nlm_nih_gov',
 'usairways',
 'eventbrite',
 'yahoo',
 'consumerreports_org',
 'dslreports',
 'ted',
 'costco',
 'subscription_timeinc',
 'scientificamerican',
 'uploaded_net',
 'xscores',
 'addictinggames',
 'yale_edu',
 'med_nyu_edu',
 'cosmopolitan',
 'earthkam.org',
 'curezone_org',
 'apa_org',
 'classified_bdnews24',
 'foreignpolicy',
 'biomedcentral',
 'ubergizmo',
 'mmajunkie',
 'foxsports',
 'ypg',
 'jcpenney',
 'boxingscene',
 'virtualtourist',
 'discogs',
 'edmunds',
 'facebook',
 'ironhorsevineyards',
 'lonelyplanet',
 'wikitravel.org',
 'emirates',
 'merriam-webster',
 'metatalk_metafilt

In [267]:
# Clean, tokenize text of the titled ACL sections (one token list per section row)
pp_data_words = tokenize(pp_df_text['text'], stop_words=stop_words,
                      frequent_words=frequent_words)

In [268]:
# Re-join each token list into one space-separated string per section
pp_data_words = [' '.join(word) for word in pp_data_words]
pp_data_words
print(f'type: {type(pp_data_words)}, len: {len(pp_data_words)}')


type: <class 'list'>, len: 9585


In [269]:
pp_data_words

['collect information provide better service user figuring basic stuff like language speak thing like find useful people matter online collect information information give example many service require sign account personal information like name email address telephone number credit card want take full advantage sharing feature offer might also create publicly visible include name photo information service collect information service like visit website advertising service view interact content information includes device information collect device specific information hardware model operating system version unique device identifier mobile network information including phone number associate device identifier phone number account information service view content provided automatically collect store certain information server include detail used search query telephony information like phone number calling party number forwarding number time date call duration call routing information type

In [270]:
# Save the cleaned section strings; the context manager closes the file even if dump() raises
with open("/Users/dbm/Documents/Insight S19/data/acl_privacy_policy_words.pkl", "wb") as fh:
    dump(pp_data_words, fh)

In [178]:
##### References
# - Inproceedings (ramanath:2014,
#     author = {Rohan Ramanath and Fei Liu and Norman Sadeh and Noah Smith},
#     booktitle = {Proceedings of ACL},
#     month = {June},
#     publisher = {Association for Computational Linguistics},
#     title = {Unsupervised Alignment of Privacy Policies using Hidden Markov Models},
#     year = {2014})