In [5]:
import json
import urllib.request
from io import StringIO
from html.parser import HTMLParser
import pandas as pd

In [6]:
class MLStripper(HTMLParser): #Strips HTML from our downloaded text.  From stackoverflow
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()
def strip_tags(html): #Strips HTML from our downloaded text.  From stackoverflow
    s = MLStripper()
    s.feed(html)
    return s.get_data()

#initialize needed variables
policy_list = []
policy_dict = {}

#make our subset to flag used applications for MSDS as we come across them
MSDS_use = ['Slack','Kaltura','Microsoft','Google','Youtube','Trello']
MSDS_use_flag = 0

#these data points are sometimes empty, override NoneType error
#with safety net defaults
privacy_stmt_count = 0
policy_text = None

#real deal starting point
#cur_pg = 'https://api.usableprivacy.org/websites/'

#movable testing starting point for faster runtime
cur_pg = 'https://api.usableprivacy.org/websites/?page=1180'

#while loop will continue as long as the page has a 'next' value
while cur_pg:  
    #pull in our page
    load = json.load(urllib.request.urlopen(cur_pg))
    
    #we have these 6 indicies as a pattern on all our API pages
    for i in [0,1,2,3,4,5]:
        #use try loop to make sure we keep going and just print out trouble spots in the except
        try:
            #code MSDS flag as 1 if we've got something
            if (load['results'][i]['title'] in MSDS_use):
                MSDS_use_flag = 1
            
            #default to 0 if no privacy statement count if empty
            if not(load['results'][i]['num_privacy_statements'][0]):
                privacy_stmt_count = 0
            else:
                privacy_stmt_count = load['results'][i]['num_privacy_statements'][0]
            
            #default to 'No policy text available' if empty
            if not(load["results"][i]["fine_grained_policy"]):
                policy_text = 'No policy text available'
            else:
                policy_text = strip_tags(load['results'][i]["fine_grained_policy"]["text"])
                policy_text = policy_text.replace("|", "")
                policy_text = " ".join(policy_text.split()) #strip all the extra white space
            
            #remove any data points we can't use - unusable word counts
            if load['results'][i]['policy_word_count'] > 0:
            #fill a holder dictionary with all the needed info
                policy_dict = {'Site URL':load['results'][i]['url'], 'Site Title':load['results'][i]['title'],'Policy Word Count':load['results'][i]['policy_word_count'], 'Privacy Statements Count':privacy_stmt_count, 'MSDS Use Flag':MSDS_use_flag, 'API URL':cur_pg, 'Policy Text':policy_text}
            
            
                #add the dictionary to our list
                policy_list.append(policy_dict)
        
        #let the loop keep going and just print out where failures occur
        except:
            print("error at " + cur_pg + "  index " + str(i))
        
            #reset our flag for next result
            MSDS_use_flag = 0
    #after looping through our results, get the next page. If empty, this breaks our while
    cur_pg = load['next']

error at https://api.usableprivacy.org/websites/?page=1190  index 3
error at https://api.usableprivacy.org/websites/?page=1194  index 5
error at https://api.usableprivacy.org/websites/?page=1195  index 4
error at https://api.usableprivacy.org/websites/?page=1195  index 5


In [7]:
#with our complete list, convert it into a dataframe
policy_df = pd.DataFrame(policy_list)

#dump into a csv
#policy_df.to_csv('privacy policies.csv')

#take a look at the data
policy_df.head()
policy_df.tail()

#my Spyder won't give me all the columns (just returns a ... inbetween the first and last) so I print out subsets here
policy_df.iloc[:,0:2].tail()
policy_df.iloc[:,2:4].tail()
policy_df.iloc[:,4:6].tail()

Unnamed: 0,MSDS Use Flag,API URL
87,0,https://api.usableprivacy.org/websites/?page=1194
88,0,https://api.usableprivacy.org/websites/?page=1195
89,0,https://api.usableprivacy.org/websites/?page=1195
90,0,https://api.usableprivacy.org/websites/?page=1195
91,0,https://api.usableprivacy.org/websites/?page=1195


In [8]:
#now with all our data downloaded, we clean up and do some data transformations
policy_df['Read Time(min)'] = round(policy_df['Policy Word Count']/240,2) #average wpm is 240
policy_df.head()

Unnamed: 0,Site URL,Site Title,Policy Word Count,Privacy Statements Count,MSDS Use Flag,API URL,Policy Text,Read Time(min)
0,libreoffice.org,Libreoffice,1639,12,0,https://api.usableprivacy.org/websites/?page=1180,No policy text available,6.83
1,insideedition.com,Insideedition,4033,38,0,https://api.usableprivacy.org/websites/?page=1180,No policy text available,16.8
2,your-surveys.com,Your-Surveys,2088,19,0,https://api.usableprivacy.org/websites/?page=1180,No policy text available,8.7
3,playboyplus.com,Playboyplus,2296,39,0,https://api.usableprivacy.org/websites/?page=1180,No policy text available,9.57
4,bookrags.com,Bookrags,1013,11,0,https://api.usableprivacy.org/websites/?page=1180,No policy text available,4.22


In [15]:
policy_df[(policy_df['Policy Word Count']<0)]

Unnamed: 0,Site URL,Site Title,Policy Word Count,Privacy Statements Count,MSDS Use Flag,API URL,Policy Text,Read Time(min)
