In [3]:
import requests
import json
import pandas as pd 

class PageFreezer:
    """Compare two URLs with the PageFreezer diff API and keep the result.

    Creating an instance immediately posts both URLs to the compare endpoint,
    converts the returned diffs into ``self.dataframe`` and prints a short
    summary (see ``report``).

    Attributes set during construction:
        query_result: the ``'result'`` member of the API's JSON response.
        dataframe: one row per diff with columns old, new, offset, state.
    """

    # Labels used by report() for the numeric 'change' code of each diff.
    state_lookup = { -1: "Removal", 0: "Change", 1: "Addition" }

    # Endpoint that receives the pair of URLs to compare.
    COMPARE_URL = "https://api1.pagefreezer.com/v1/api/utils/diff/compare"

    def __init__(self, url_1, url_2, api_key=None, timeout=None):
        """Run the comparison of ``url_1`` against ``url_2`` and print a report.

        Args:
            url_1: first URL sent to the API (as "url1").
            url_2: second URL sent to the API (as "url2").
            api_key: value sent in the ``x-api-key`` request header.
            timeout: passed straight to ``requests.post``; the default of
                None keeps the previous behavior of waiting indefinitely.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        self.api_key = api_key
        self.url_1 = url_1
        self.url_2 = url_2
        self.timeout = timeout
        self.run_query()
        self.parse_to_df()
        self.report()

    def report(self):
        """Print the delta score, the number of diffs and a per-state count."""
        print("Delta Score: ", self.query_result['delta_score'], " Number of changes: ",len(self.dataframe) )
        # size() counts rows; count()['old'] would skip rows whose 'old' value
        # is null and make the breakdown disagree with the total printed above.
        counts = self.dataframe.groupby('state').size()
        counts.index = counts.index.to_series().map(self.state_lookup)
        print(counts)

    def run_query(self):
        """Post both URLs to the compare endpoint and store the 'result' payload.

        Raises:
            requests.HTTPError: if the response carries a 4xx/5xx status.
        """
        response = requests.post(
            self.COMPARE_URL,
            data=json.dumps({"url1": self.url_1, "url2": self.url_2}),
            headers={
                "Accept": "application/json",
                "Content-Type": "application/json",
                "x-api-key": self.api_key,
            },
            timeout=self.timeout,
        )
        # Fail with the HTTP error itself instead of a later KeyError on
        # 'result' when the service rejects the request.
        response.raise_for_status()
        self.query_result = response.json()['result']

    def parse_to_df(self):
        """Build ``self.dataframe`` with one row per entry of the diff list."""
        diffs = self.query_result['output']['diffs']
        columns = ["old", "new", "offset", "state"]
        self.dataframe = pd.DataFrame(
            {
                "old": [diff['old'] for diff in diffs],
                "new": [diff['new'] for diff in diffs],
                "offset": [diff['offset'] for diff in diffs],
                # The API calls this field 'change'; the frame exposes it as 'state'.
                "state": [diff['change'] for diff in diffs],
            },
            # Pin the column order so it does not depend on dict ordering.
            columns=columns,
        )

    def full_html_changes(self):
        """Display the API's HTML output in the notebook and return it.

        NOTE(review): the markup is rendered as received; presumably it embeds
        content of the compared pages, so use with URLs you trust.
        """
        # Imported here so the class can be defined without IPython installed.
        from IPython.display import display, HTML
        html = self.query_result['output']['html']
        display(HTML(html))
        return html

    def to_csv(self, filename):
        """Write ``self.dataframe`` to ``filename`` as CSV."""
        self.dataframe.to_csv(filename)

    def diff_pairs(self):
        """Display the 'old' fragment of every diff as HTML.

        NOTE(review): (new, old) pairs are built but only the old side is
        rendered - confirm whether the new side was meant to be shown too.
        """
        from IPython.display import display, HTML
        pairs = [(elem['new'], elem['old']) for elem in self.query_result['output']['diffs']]
        for _new_fragment, old_fragment in pairs:
            display(HTML(old_fragment))
        

In [4]:
import os
# Take the API key from the environment so it is never written into the notebook.
api_key = os.environ['PAGE_FREEZER_API_KEY']

# Constructing PageFreezer runs the comparison right away and prints its summary.
pf = PageFreezer(
    url_1='https://raw.githubusercontent.com/edgi-govdata-archiving/web-monitoring/master/example-data/falsepos-num-views-a.html',
    url_2='https://raw.githubusercontent.com/edgi-govdata-archiving/web-monitoring/master/example-data/falsepos-num-views-b.html',
    api_key=api_key,
)

('Delta Score: ', 0.000130324573349928, ' Number of changes: ', 3)
state
Change    3
Name: old, dtype: int64


In [5]:
# Show the table of diffs that PageFreezer.parse_to_df built (one row per diff).
pf.dataframe

Unnamed: 0,new,offset,old,state
0,"Thursday,",13119,"Wednesday,",0
1,02,13215,01,0
2,(4116),89182,(4099),0


In [6]:
# NOTE(review): same expression as the previous cell; looks like a leftover duplicate.
pf.dataframe

Unnamed: 0,new,offset,old,state
0,"Thursday,",13119,"Wednesday,",0
1,02,13215,01,0
2,(4116),89182,(4099),0


In [7]:
def get_PF_diff(url1, url2, api_key=None, timeout=None):
    """Ask the PageFreezer compare endpoint to diff two URLs.

    Args:
        url1: first URL sent to the API.
        url2: second URL sent to the API.
        api_key: value for the ``x-api-key`` header; when None the key is
            read from the ``PAGE_FREEZER_API_KEY`` environment variable.
        timeout: passed straight to ``requests.post``; None (the default)
            keeps the previous behavior of waiting indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        KeyError: if no key is given and the environment variable is unset.
        requests.HTTPError: if the response carries a 4xx/5xx status.
    """
    if api_key is None:
        api_key = os.environ['PAGE_FREEZER_API_KEY']
    response = requests.post(
        "https://api1.pagefreezer.com/v1/api/utils/diff/compare",
        data=json.dumps({"url1": url1, "url2": url2}),
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
            "x-api-key": api_key,
        },
        timeout=timeout,
    )
    # Surface a rejected request as an HTTP error rather than handing an
    # error payload to callers that go on to index ['result'].
    response.raise_for_status()
    return response.json()

In [8]:
# Set of example names; they appear to match file stems under example-data.
# A set collapses repeated names, so each one is listed once here.
# NOTE(review): the empty-string member is kept on purpose - confirm it is intended.
examples = set([
    "truepos-dataset-removal",
    "falsepos-num-views",
    "falsepos-small-changes",
    "",
])

In [9]:
# Fetch the raw API response for the "falsepos-num-views" pair of example pages.
a = get_PF_diff(
    url1='https://raw.githubusercontent.com/edgi-govdata-archiving/web-monitoring/master/example-data/falsepos-num-views-a.html',
    url2='https://raw.githubusercontent.com/edgi-govdata-archiving/web-monitoring/master/example-data/falsepos-num-views-b.html',
)

In [10]:
from IPython.core.display import HTML

import io

# Save the HTML rendering of the diff to disk as UTF-8.
# The file is opened in text mode with an explicit encoding and given the text
# itself: writing pre-encoded bytes to a text-mode file raises TypeError on
# Python 3. io.open accepts `encoding` on Python 2 as well.
# Assumes the 'html' value is a text (unicode) string, as decoded JSON yields.
with io.open('diff_output.html', 'w', encoding='utf-8') as f:
    f.write(a['result']['output']['html'])

In [11]:
from IPython.display import display, HTML

# Pair every diff's new fragment with its old fragment (new first, old second).
diff_pairs = []
for elem in a['result']['output']['diffs']:
    diff_pairs.append((elem['new'], elem['old']))

# Only the second member of each pair - the old fragment - is rendered.
for _new_fragment, old_fragment in diff_pairs:
    display(HTML(old_fragment))

In [12]:
# Pull each field of the diff entries into its own column list.
old = [diff['old'] for diff in a['result']['output']['diffs']]
new = [diff['new'] for diff in a['result']['output']['diffs']]
offset = [diff['offset'] for diff in a['result']['output']['diffs']]
# The API names this field 'change'; the table labels it 'state'.
state = [diff['change'] for diff in a['result']['output']['diffs']]

# Last expression of the cell, so the notebook displays the resulting table.
pd.DataFrame({"old" : old, "new": new, "offset": offset, "state": state})


Unnamed: 0,new,offset,old,state
0,"Thursday,",13119,"Wednesday,",0
1,02,13215,01,0
2,(4116),89182,(4099),0


In [13]:
import json
# Serialize the whole API response held in `a` to example.json.
with open('example.json', 'w') as json_file:
    json.dump(a, json_file)
    