In [139]:
import requests
import json
import pandas as pd 

class PageFreezer:
    """Compare two URLs with the PageFreezer diff API and tabulate the changes.

    Creating an instance immediately:

    1. posts both URLs to the compare endpoint (``run_query``),
    2. builds ``self.dataframe`` with one row per reported diff
       (``parse_to_df``), and
    3. prints a short summary (``report``).

    Attributes:
        api_key: value sent in the ``x-api-key`` request header.
        url_1, url_2: the two URLs being compared.
        query_result: the ``result`` object of the API response.
        dataframe: ``pandas.DataFrame`` with columns ``old``, ``new``,
            ``offset`` and ``state``.
    """

    # Labels for the integer ``change`` code carried by each diff entry.
    state_lookup = { -1: "Removal", 0: "Change", 1: "Addition" }

    # Endpoint that receives the pair of URLs to compare.
    COMPARE_URL = "https://api1.pagefreezer.com/v1/api/utils/diff/compare"

    def __init__(self,url_1, url_2, api_key = None):
        """Store the inputs, then query the API, parse the diffs and report.

        Args:
            url_1: first URL to compare.
            url_2: second URL to compare.
            api_key: value for the ``x-api-key`` header.

        Raises:
            KeyError: if the API response carries no ``result`` object.
        """
        self.api_key = api_key
        self.url_1 = url_1
        self.url_2 = url_2
        self.run_query()
        self.parse_to_df()
        self.report()

    def report(self):
        """Print the delta score, the number of diffs and a count per change kind."""
        print("Delta Score: ", self.query_result['delta_score'], " Number of changes: ",len(self.dataframe) )
        counts = self.dataframe.groupby('state').count()['old']
        # Swap the numeric change codes in the index for readable labels.
        counts.index = counts.index.to_series().map(self.state_lookup)
        print(counts)

    def run_query(self):
        """POST the two URLs to the compare endpoint and keep the ``result`` object.

        Raises:
            KeyError: if the decoded response has no ``result`` key. The
                message includes the HTTP status code and the payload so the
                cause (for example a rejected API key) is visible instead of
                a bare ``KeyError: 'result'``.
        """
        response = requests.post(
            self.COMPARE_URL,
            data=json.dumps({"url1": self.url_1, "url2": self.url_2}),
            headers={
                "Accept": "application/json",
                "Content-Type": "application/json",
                "x-api-key": self.api_key,
            },
        )
        payload = response.json()
        if not isinstance(payload, dict) or 'result' not in payload:
            # Same exception type as before, but with enough context to debug.
            raise KeyError(
                "PageFreezer response has no 'result' (HTTP status %s): %r"
                % (response.status_code, payload)
            )
        self.query_result = payload['result']

    def parse_to_df(self):
        """Build ``self.dataframe`` from ``query_result['output']['diffs']``.

        Each diff entry contributes one row; its ``change`` code is stored in
        the ``state`` column.
        """
        diffs = self.query_result['output']['diffs']
        self.dataframe = pd.DataFrame({
            "old": [diff['old'] for diff in diffs],
            "new": [diff['new'] for diff in diffs],
            "offset": [diff['offset'] for diff in diffs],
            "state": [diff['change'] for diff in diffs],
        })

    def full_html_changes(self):
        """Render the HTML diff of this comparison inline and return it.

        Returns:
            The string stored under ``query_result['output']['html']``.
        """
        from IPython.display import display, HTML
        # Use this instance's own result; the previous code read a notebook
        # global named ``a`` that is unrelated to (or missing for) this object.
        html = self.query_result['output']['html']
        display(HTML(html))
        return html

    def to_csv(self, filename):
        """Write ``self.dataframe`` to ``filename`` as CSV."""
        self.dataframe.to_csv(filename)

    def diff_pairs(self):
        """Render the ``old`` snippet of every diff inline.

        ``(new, old)`` pairs are collected for each diff, but only the second
        element (the old snippet) of each pair is displayed.
        """
        from IPython.display import display, HTML
        pairs = [(elem['new'], elem['old']) for elem in self.query_result['output']['diffs']]
        for pair in pairs:
            display(HTML(pair[1]))
        

In [140]:
import os

# Take the API key from the environment instead of leaving it blank in the
# notebook; the recorded run with an empty key ended in KeyError: 'result'.
# Falls back to '' (the previous value) when the variable is not set.
api_key = os.environ.get('PAGEFREEZER_API_KEY', '')
pf = PageFreezer('https://raw.githubusercontent.com/edgi-govdata-archiving/pagefreezer-cli/master/archives/falsepos-num-views-a.html',
           'https://raw.githubusercontent.com/edgi-govdata-archiving/pagefreezer-cli/master/archives/falsepos-num-views-b.html',
                  api_key=api_key)

KeyError: 'result'

In [138]:
# Show the table of diffs held by the current `pf` comparison object.
pf.dataframe

Unnamed: 0,new,offset,old,state
0,"Thursday,",13119,"Wednesday,",0
1,02,13215,01,0
2,(4116),89182,(4099),0


In [132]:
# Show the table of diffs held by the current `pf` comparison object.
pf.dataframe

Unnamed: 0,new,offset,old,state
0,,143822,"\n <div class=""dataset-link...",-1


In [76]:
def get_PF_diff(url1, url2, api_key=None):
    """Ask the PageFreezer compare endpoint for the diff between two URLs.

    Args:
        url1: first URL to compare.
        url2: second URL to compare.
        api_key: value for the ``x-api-key`` header. When omitted, the
            ``PAGEFREEZER_API_KEY`` environment variable is used, so no
            credential has to be stored in the notebook.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: if no API key was passed and the environment variable is
            unset or empty.
    """
    import os  # local import keeps this cell runnable on its own

    if api_key is None:
        api_key = os.environ.get("PAGEFREEZER_API_KEY")
    if not api_key:
        raise ValueError(
            "No PageFreezer API key: pass api_key=... or set the "
            "PAGEFREEZER_API_KEY environment variable"
        )

    response = requests.post(
        "https://api1.pagefreezer.com/v1/api/utils/diff/compare",
        data=json.dumps({"url1": url1, "url2": url2}),
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
            "x-api-key": api_key,
        },
    )
    return response.json()

In [78]:
# Set of example identifiers (they look like base names of the archived
# example pages, e.g. "falsepos-num-views" -> falsepos-num-views-a/b.html).
# Being a set, a repeated literal adds nothing; the empty string is a member.
examples = {
    'truepos-dataset-removal',
    'falsepos-num-views',
    'falsepos-small-changes',
    '',
}

In [134]:
# Compare the "a" and "b" snapshots of the falsepos-num-views example page;
# `a` keeps whatever get_PF_diff returns for later cells.
a = get_PF_diff(
    url1='https://raw.githubusercontent.com/edgi-govdata-archiving/pagefreezer-cli/master/archives/falsepos-num-views-a.html',
    url2='https://raw.githubusercontent.com/edgi-govdata-archiving/pagefreezer-cli/master/archives/falsepos-num-views-b.html',
)

In [71]:
from IPython.core.display import HTML

# Save the HTML rendering of the diff. The encoding is pinned to UTF-8 because
# compared pages can contain non-ASCII text, which the platform default
# encoding may not be able to write.
with open('diff_output.html', 'w', encoding='utf-8') as f:
    f.write(a['result']['output']['html'])

In [None]:
from IPython.display import display, HTML

# Collect (new, old) snippet pairs from the API response held in `a`
# (the cell previously read `d`, a name no cell in this notebook defines),
# then render the old snippet of each pair.
diff_pairs = [(elem['new'], elem['old']) for elem in a['result']['output']['diffs']]
for pair in diff_pairs:
    display(HTML(pair[1]))

In [73]:
# Turn the list of diff entries in the response into one column per field;
# the entry's `change` code becomes the `state` column.
_diff_entries = a['result']['output']['diffs']
old = [entry['old'] for entry in _diff_entries]
new = [entry['new'] for entry in _diff_entries]
offset = [entry['offset'] for entry in _diff_entries]
state = [entry['change'] for entry in _diff_entries]
pd.DataFrame({"old" : old, "new": new, "offset": offset, "state": state})


Unnamed: 0,new,offset,old,state
0,,143822,"\n <div class=""dataset-link...",-1


In [13]:
# Save the raw API response as JSON. get_PF_diff in this notebook returns the
# already-decoded dict, which has no `.text`, so serialise it in that case;
# an object that does expose `.text` is written as before.
# UTF-8 is pinned because the response can contain non-ASCII text.
with open('example.json', 'w', encoding='utf-8') as f:
    f.write(a.text if hasattr(a, 'text') else json.dumps(a, ensure_ascii=False))
    

{'elapsed': 5.173136949539185,
 'result': {'delta_score': 0.23070517430637041,
  'input': {'diff_mode': 0,
   'doc1': 'http://apple.com/jp',
   'doc2': 'http://apple.com/kr',
   'html_output': 1,
   'mode': 'url',
   'snippet_chars': 200},
  'output': {'diffs': [{'change': 0,
     'new': '메뉴 열기',
     'offset': 1467,
     'old': 'メニューを開く'},
    {'change': 0, 'new': '메뉴 닫기', 'offset': 1719, 'old': 'メニューを閉じる'},
    {'change': 0, 'new': '쇼핑백', 'offset': 2374, 'old': 'ショッピングバッグ'},
    {'change': -1,
     'new': '',
     'offset': 3709,
     'old': '<li class="ac-gn-item ac-gn-item-menu ac-gn-tv">\n\t\t\t\t<a class="ac-gn-link ac-gn-link-tv" data-analytics-title="tv" href="/jp/tv/">\n\t\t\t\t\t<span class="ac-gn-link-text">TV</span>\n\t\t\t\t</a>\n\t\t\t</li>\n\t\t\t'},
    {'change': 0, 'new': '고객지원', 'offset': 4491, 'old': 'サポート'},
    {'change': 0,
     'new': 'apple.com 검색',
     'offset': 4893,
     'old': 'apple.comを検索'},
    {'change': 0, 'new': '쇼핑백', 'offset': 5300, 'old': 'ショッピングバ