In [55]:
import requests
import bs4
import difflib

import sqlite3
c = sqlite3.connect('TOS.sqlite')

In [105]:
c.execute("""
CREATE TABLE IF NOT EXISTS company(
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    url TEXT,
    name TEXT,
    last_scan INT,
    last_error INT,
    scan_instructions TEXT
)
""")

c.execute("""
CREATE TABLE IF NOT EXISTS tos_text(
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    company_id INT,
    start_date INT,
    end_date INT,
    text BLOB,
    formatted_text BLOB,
    delta BLOB,
    formatted_delta BLOB
)
""")

c.execute("""
CREATE INDEX IF NOT EXISTS tos_text_company_idx ON tos_text(company_id)
""")
c.commit()

In [117]:
def do_update_check(company):
    url = lookup_URL(company_id)
    old_tos = lookup_TOS(company_id)
    new_tos = pull_TOS(company_id)
    text_diff = list(difflib.context_diff(new_tos['text'].split('\n'), old_tos['text'].split('\n')))
    format_diff = list(difflib.context_diff(new_tos['format'].split('\n'), new_tos['format'].split('\n')))
    if len(text_diff) > 0:
        pass
    if len(format_diff) > 0:
        pass
    return

def pull_TOS(url):
    r = requests.get(url, timeout=10)
    assert r.status_code == 200, 'error, code: ' + r.status_code
    soup = bs4.BeautifulSoup(r.text)
    ps = soup.body.findAll("p")  ## this doesn't work.
    return {
        "text": soup.body.text,  #"\n\n".join(p.text for p in ps),
        "format": soup.body,     #"\n\n".join(str(p) for p in ps),
    }

def create_company(name, url, settings):
    c.execute('INSERT INTO company(name, url) values (?,?)', (name, url))
    c.commit()

def update_company_name(id, name=None):
    c.execute('UPDATE company SET name=? WHERE id=?', (name, id))
    c.commit()

def update_company_url(id, url=None):
    c.execute('UPDATE company SET url=? WHERE id=?', (name, id))
    c.commit()

def update_last_scan(id, last_scan=None):
    c.execute('UPDATE company SET last_scan WHERE id=?', (last_scan, id))
    c.commit()

def update_last_error(company_id, error, dt):
    c.commit();

def add_TOS(company_id, text, formatted_text, delta, formatted_delta):
    last_tos = c.execute("select id from tos_text where company_id=? order by start_date desc", company_id)
    if last_tos:
        raise("need to add end date to previous TOS")
    c.execute(
        '''
        INSERT INTO tos_text(company_id, start_date, text, formatted_text, delta, formatted_delta) 
        values (?,?,?,?,?,?)
        ''', 
        (company_id, start_date, text, formatted_text, delta, formatted_delta)
    )
    c.commit()

def lookup_URL(company_id):
    return c.execute("select url from company where id=?", (company_id,)).fetchone()[0]

def lookup_TOS(company_id):
    last_tos = c.execute("""
        select text, formatted_text from tos_text where company_id=? 
        order by start_date desc
        """, (company_id,)
    ).fetchone()
    if not last_tos:
        return None
    else:
        raise("not implemented")
        


In [3]:
dropbox = pull_TOS("https://www.dropbox.com/dmca")

In [124]:
create_company("_STEIN TEST", "https://davidbstein.github.io/ml-law/index.html", {})

In [125]:
url = lookup_URL(1298)

In [127]:
pull_TOS(url)

{'text': '\n\nml-law\nml projects relating to legal things.\n\n        This site is open source. Improve this page.\n      \n\n\nanchors.add();\n',
 'format': <body>
 <div class="container-lg px-3 my-5 markdown-body">
 <h1 id="ml-law">ml-law</h1>
 <p>ml projects relating to legal things.</p>
 <div class="footer border-top border-gray-light mt-5 pt-3 text-right text-gray">
         This site is open source. <a href="https://github.com/davidbstein/ml-law/edit/master/docs/README.md">Improve this page</a>.
       </div>
 </div>
 <script crossorigin="anonymous" integrity="sha256-lZaRhKri35AyJSypXXs4o6OPFTbTmUoltBbDCbdzegg=" src="https://cdnjs.cloudflare.com/ajax/libs/anchor-js/4.1.0/anchor.min.js"></script>
 <script>anchors.add();</script>
 </body>}

In [119]:
c.commit()

# temporary stubs

In [31]:
total = len(policy_urls)
for id_, url in enumerate(policy_urls):
    print(id_, '/', total)
    if not (True):
        print("skipping...")
        continue
    try:
        stored_tos[id_] = pull_TOS(url)
        print("success")
    except Exception as e:
        print(url, 'error...', e)

0 / 327
success
1 / 327
success
2 / 327
success
3 / 327
success
4 / 327
http://www.blogger.com/privacy error... 'NoneType' object has no attribute 'findAll'
5 / 327
http://www.bolt.com/main/authorization/privacyPolicy error... error!
6 / 327
http://www.cellufun.com/about/privacy.asp error... error!
7 / 327
http://www.collegeblender.com/privacy error... error!
8 / 327
http://dailybooth.com/privacy error... HTTPConnectionPool(host='dailybooth.com', port=80): Max retries exceeded with url: /privacy (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1088f5e80>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
9 / 327
http://www.dailystrength.org/content/view/201 error... error!
10 / 327
success
11 / 327
http://aboutus.disaboom.com/Privacy-Policy.aspx error... HTTPConnectionPool(host='aboutus.disaboom.com', port=80): Max retries exceeded with url: /Privacy-Policy.aspx (Caused by NewConnectionError('<urllib3.connection

success
100 / 327
success
101 / 327
http://geek.net/privacy-statement error... HTTPConnectionPool(host='geek.net', port=80): Max retries exceeded with url: /privacy-statement (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x107ad2d30>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
102 / 327
http://cbsiprivacy.custhelp.com/app/answers/detail/a_id/1268/session/L2F2LzEvc2lkL19HMll4eDNr error... HTTPConnectionPool(host='cbsiprivacy.custhelp.com', port=80): Max retries exceeded with url: /app/answers/detail/a_id/1268/session/L2F2LzEvc2lkL19HMll4eDNr (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x107ad2f60>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
103 / 327
http://www.filefront.com/privacy.php error... error!
104 / 327
success
105 / 327
success
106 / 327
success
107 / 327
http://www.winamp.com/legal/privacy error... error!
108 /

http://www.reunion.com/PrivacyPolicy.pub error... error!
215 / 327
success
216 / 327
success
217 / 327
http://www.stickam.com/about/privacy.do error... HTTPConnectionPool(host='www.stickam.com', port=80): Max retries exceeded with url: /about/privacy.do (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x108feba20>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
218 / 327
http://www.student.com/privacy.php error... error!
219 / 327
success
220 / 327
success
221 / 327
success
222 / 327
http://www.outeverywhere.com/gay/privacy.cgi error... error!
223 / 327
http://www.faces.com/privacy error... error!
224 / 327
success
225 / 327
success
226 / 327
http://uk.wasabi.com/Privacy.aspx error... HTTPConnectionPool(host='uk.wasabi.com', port=80): Max retries exceeded with url: /Privacy.aspx (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x107d77780>: Failed to establish a new connection: [Errno

In [32]:
import json
with open("TOS.json", "w") as f:
    f.write(json.dumps(stored_tos))

def load_tos():
    with open("TOS.json") as f:
        return json.loads(f.read())
