In [75]:
import requests
import bs4
import difflib
import time
import sqlite3
import json
c = sqlite3.connect('TOS.sqlite')

def __setup():
    c.execute("""drop table if exists company""")
    c.execute("""drop table if exists tos_text""")
    c.execute("""
    CREATE TABLE IF NOT EXISTS company(
        id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
        url TEXT,
        name TEXT,
        last_scan INT,
        last_error INT,
        scan_instructions TEXT,
        status TEXT
    )
    """)

    c.execute("""
    CREATE TABLE IF NOT EXISTS tos_text(
        id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
        company_id INT,
        start_date INT,
        end_date INT,
        text BLOB,
        formatted_text BLOB,
        delta BLOB,
        formatted_delta BLOB
    )
    """)
    c.execute("""
    CREATE INDEX IF NOT EXISTS tos_text_company_idx ON tos_text(company_id)
    """)
    c.commit()
    id_ = create_company("STEIN TEST LOCAL", "http://0.0.0.0:8999/www", {})
    scan_company_tos(id_)

def iterative_diff_function(a, b):
    difflib._check_types(a, b, '','','','','\n')
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(lambda x: x==' ',a,b).get_opcodes():
        if tag == "equal":
            yield (tag, a[i1:i2])
        if tag == "delete":
            yield (tag, a[i1:i2])
        if tag == "insert":
            yield (tag, b[j1:j2])
        if tag == "replace":
            yield (tag, a[i1:i2], b[j1:j2])

            
def diff_function(a, b):
    return list(iterative_diff_function(a, b))


_diff_function = diff_function


def do_update_check(company_id):
    old_tos = lookup_TOS(company_id)
    url = lookup_URL(company_id)
    new_tos = pull_TOS(url)
    if old_tos:
        text_diff = list(_diff_function(
            old_tos['text'],
            new_tos['text'],
        ))
        format_diff = list(_diff_function(
            old_tos['formatted_text'],
            new_tos['formatted_text'],
        ))
        if len(text_diff) == len(format_diff) == 1:
            if (text_diff[0][0] == format_diff[0][0] == "equal"):
                return None
        return {
            "text": new_tos["text"],
            "formatted_text": new_tos["formatted_text"],
            "delta": text_diff,
            "formatted_delta": format_diff,
            "new": False,
        }
    else:
        return {
            "text": new_tos["text"],
            "formatted_text": new_tos["formatted_text"],
            "new": True
        }

    
def pull_TOS(url):
    r = requests.get(url, timeout=10)
    assert r.status_code == 200, 'error, code: ' + r.status_code
    soup = bs4.BeautifulSoup(r.text)
    return {
        "text": soup.body.text,  #"\n\n".join(p.text for p in ps),
        "formatted_text": str(soup.body),     #"\n\n".join(str(p) for p in ps),
    }


def scan_company_tos(company_id):
    url = lookup_URL(company_id)
    tos = pull_TOS(url)
    lookup_TOS(company_id)
    update_result = do_update_check(company_id)
    if not update_result:
        print("no change")
        pass # nullop
    else:
        add_TOS(
            company_id, 
            update_result["text"], 
            update_result["formatted_text"], 
            update_result.get("delta"),
            update_result.get("formatted_delta"),
            int(time.time())
       )
    # TODO: log error and update status to show most recent error


In [76]:
if False:
    create_company("_local test", "http://0.0.0.0:8999/www/", {})
    print('\n'.join(c.execute("select * from company limit 20;")))

In [82]:
##__setup()
##scan_company_tos(1)

In [49]:
def list_companies():
    company_list = c.execute("select * from company")
    to_ret = []
    for c in company_list:
        to_ret.append({
            "id": c[0],
            "url": c[1],
            "name": c[2],
            "last_scan": c[3],
            "last_error": c[4],
            "scan_instructions": c[5],
            "status": c[6],
        })
    return to_ret

def update_company(company_id, url=None, name=None, scan_instructions=None):
    if url:
        c.execute("update company where id=? set url=?", (company_id, url))
    if name:
        c.execute("update company where id=? set name=?", (company_id, name))
    if scan_instructions:
        c.execute(
            "update company where id=? set scan_instructions=?", 
            (company_id, json.dumps(scan_instructions))
        )
    c.commit()
    return c.execute("select * from company where id=?", (company_id, )).fetchone()

def get_diff(company_id, start_time, end_time):
    pass

# Pages

## Overview

- String Filter
- Time Filter

- Table of companies
    + Name
    + url
    + Last update
    + number of changes by year
    + edit button


## Main Changes View

- String Filter
- Start time scrubber
- End time scrubber

- List of changes by company
    - left / right, just changes and surroundings


## Company page

- select left pane
- select right pane
- go to company setting page


## Company settings

- change URL
- change monitored portion


## Add Company page

- name
- url
- test page


```
from flask import Flask
app = Flask(__name__, static_folder, templates_folder)

def path(*_,**__):
    def inner(fn):
        return fn
    return inner


#####################################
## Browsing and exploring the data ##
#####################################

@path("/")
def overview():
    """ list all companies """
    pass

# date range: start_date < target, end_date > target OR NOT end_date

@path("/changes")
def change_browser():
    """ list all companies and the isolated deltas for a selected range """
    pass

@path("/changes/all/{fromdate}/{todate}")
def delta_list(fromdate, todate):
    """ return the deltas of all companies in a date range """
    pass

@path("/changes/{id}/{fromdate}/{todate}")
def delta_list(id, fromdate, todate):
    """ return the deltas of a selected in a date range """
    pass

@path("/company/{id}")
def company_view(id):
    """ show company, all available updates, and a delta viewer."""
    pass

##############################################################
## Consider merging these and checkign the method in the fn ##
##############################################################

@path("/company/{id}/edit", method="POST")
def update_company(id):
    """ update a company's settings """
    pass

@path("/company/{id}/edit")
def update_company_view(id):
    """ the form that goes withthe update_company"""
    pass

@path("/company/new", method="POST")
def create_company(id):
    """ create a new company """
    pass

@path("/company/new")
def create_company_view(id):
    """ the form that goes with the new company"""
    pass
```

In [86]:
from sqlalchemy import create_engine

In [101]:
eng = create_engine("sqlite:////Users/stein/repos/ml-law/eula-scan/TOS.sqlite")


In [97]:
eng.execute("select * from company where id=?", (2,)).fetchall()

[]

In [99]:
c = eng.connect()

In [100]:
c.commit()

AttributeError: 'Connection' object has no attribute 'commit'

In [103]:
eng.dispose()

In [104]:
eng

Engine(sqlite:////Users/stein/repos/ml-law/eula-scan/TOS.sqlite)

In [106]:
eng.execute("")

<sqlalchemy.engine.result.ResultProxy at 0x115171160>

In [1]:
import difflib

In [7]:
a = difflib.HtmlDiff().make_table("test\none", "test\none")

In [8]:
a

'\n    <table class="diff" id="difflib_chg_to0__top"\n           cellspacing="0" cellpadding="0" rules="groups" >\n        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>\n        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>\n        \n        <tbody>\n            <tr><td class="diff_next"><a href="#difflib_chg_to0__top">t</a></td><td class="diff_header" id="from0_1">1</td><td nowrap="nowrap">t</td><td class="diff_next"><a href="#difflib_chg_to0__top">t</a></td><td class="diff_header" id="to0_1">1</td><td nowrap="nowrap">t</td></tr>\n            <tr><td class="diff_next"></td><td class="diff_header" id="from0_2">2</td><td nowrap="nowrap">e</td><td class="diff_next"></td><td class="diff_header" id="to0_2">2</td><td nowrap="nowrap">e</td></tr>\n            <tr><td class="diff_next"></td><td class="diff_header" id="from0_3">3</td><td nowrap="nowrap">s</td><td class="diff_next"></td><td class="diff_header" id="to0_3">3</td><td nowrap="nowrap">s</td></