In [268]:
from urllib.parse import urlparse, parse_qs
import randomcolor
import datetime
import matplotlib.pyplot as plt
import operator as ops
import requests
import time
import json
import zlib
import base64

# Shared colour generator; Tag.__init__ calls rcolor.generate() for each new tag.
rcolor = randomcolor.RandomColor()



class Page:
    """One bookmarked page: its URL, title, query parameters, tags and archives.

    Attributes:
        url: The page address (the unwrapped address when ``unpack`` applied).
        title: Text following the first ``' | '`` in the record, or ``''``.
        parse: ``urlparse`` result for the URL as given in the record.
        params: ``parse_qs`` mapping of the record URL's query string, merged
            with the unwrapped URL's parameters when ``unpack`` applied.
        tags: Tags attached to the page (initially empty).
        len: Length of the URL as given in the record.
        archives: Downloaded copies of the page (initially empty).
    """

    def __init__(self, info, unpack=False, lead='', **kwargs):
        """Build a page from a ``'<url> | <title>'`` record.

        Args:
            info: A bare URL, or a URL followed by ``' | '`` and a title.
            unpack: When true and the URL starts with ``lead``, replace the
                URL with the one carried in its ``url`` query parameter.
            lead: Prefix identifying wrapper URLs. The default ``''`` matches
                every URL, so any page with a ``url`` parameter is unwrapped.
            **kwargs: Accepted and ignored so callers can forward options.
        """
        # Records read from a file end with a newline; strip it so it does
        # not become part of the URL or the title.
        record = info.strip()
        # Split on the first separator only: titles often contain ' | '
        # themselves and must keep it.
        address, _, heading = record.partition(' | ')
        self.url, self.title = address, heading
        self.parse = urlparse(self.url)
        self.params = parse_qs(self.parse.query)
        self.tags = []
        self.len = len(self.url)
        self.archives = []
        if unpack and self.url.startswith(lead):
            # Only unwrap when there really is an embedded address; a plain
            # URL without a ``url`` parameter is left untouched.
            wrapped = self.params.get('url')
            if wrapped:
                nested = Page(wrapped[0])
                self.url = nested.url
                self.params |= nested.params
                # NOTE: ``parse`` and ``len`` still describe the wrapper URL.

    def print(self):
        """Write the page's string form to standard output."""
        print(str(self))

    def __str__(self):
        """Return ``title | tag; tag | url`` with the URL cut to 100 chars."""
        return ' | '.join([self.title, '; '.join(map(str, self.tags)), self.url[:100]])
        
class Tag:
    """A named label carrying a generated colour and a creation timestamp."""

    def __init__(self, name=''):
        """Record the name, ask ``rcolor`` for a colour and stamp the time."""
        self.name = name
        self.color = rcolor.generate()
        # ``isoformat(sep=' ')`` yields the same text as ``str()`` on a datetime.
        stamp = datetime.datetime.now()
        self.created = stamp.isoformat(sep=' ')

    def __str__(self):
        """A tag prints as its bare name."""
        return self.name
        
class Collection:
    """An ordered group of pages with helpers to load, filter, tag, fetch and save.

    Attributes:
        urls: The list of ``Page`` objects held by the collection.
    """

    def __init__(self, urls=None):
        """Create a collection, optionally seeded via :meth:`add`."""
        self.urls = []
        if urls:
            self.add(urls)

    def load(self, path, limit=50, **kwargs):
        """Read page records from a UTF-8 text file, one record per line.

        Args:
            path: File to read.
            limit: Maximum number of records to take; ``None`` reads them all.
            **kwargs: Forwarded to ``Page`` through :meth:`add`.
        """
        entries = []
        with open(path, 'r', encoding='utf8') as file:
            for line in file:
                # Check before appending so exactly ``limit`` records are kept.
                if limit is not None and len(entries) >= limit:
                    break
                record = line.rstrip('\r\n')
                # Blank lines carry no URL and would only yield empty pages.
                if record.strip():
                    entries.append(record)
        self.add(entries, **kwargs)

    def add(self, urls, **kwargs):
        """Append pages to the collection.

        Args:
            urls: A record string, a ``Page``, or a list/tuple of either.
                Anything else is ignored.
            **kwargs: Forwarded to ``Page`` for every record string.
        """
        if not isinstance(urls, (list, tuple)):
            # A lone record or page is treated as a one-element batch.
            if not isinstance(urls, (str, Page)):
                return
            urls = [urls]
        for item in urls:
            if isinstance(item, str):
                self.urls.append(Page(item, **kwargs))
            elif isinstance(item, Page):
                self.urls.append(item)

    def find(self, attr, value=None):
        """Return a new collection holding the pages that match.

        Args:
            attr: Either a predicate called with each page, or the name of an
                attribute that must equal ``value``.
            value: Expected attribute value when ``attr`` is a name.

        Returns:
            A ``Collection`` sharing the matching page objects.
        """
        if callable(attr):
            predicate = attr
        else:
            # Keep the attribute name under its own binding: rebinding
            # ``attr`` to the predicate would make it look itself up.
            name = attr

            def predicate(page):
                return getattr(page, name) == value

        matches = Collection()
        # The members are already page objects, so store them directly.
        matches.urls = [page for page in self.urls if predicate(page)]
        return matches

    def tag(self, tags):
        """Attach one or more tags to every page and return ``self``.

        Args:
            tags: A tag name, a ``Tag``, or an iterable of either. Names are
                turned into ``Tag`` objects; each object is shared by all pages.
        """
        if isinstance(tags, (str, Tag)):
            tags = [tags]
        resolved = [Tag(t) if isinstance(t, str) else t for t in tags]
        for page in self.urls:
            page.tags.extend(resolved)
        return self

    def tag_if_in(self, tags):
        """Tag each page whose title or URL mentions the tag name.

        Matching ignores case, and the name is tried both as written and with
        its spaces removed (so ``'Stack Overflow'`` also matches
        ``stackoverflow.com``).

        Args:
            tags: A tag name or a list of tag names.

        Returns:
            ``self``, to allow chaining.
        """
        if isinstance(tags, str):
            tags = [tags]
        for label in tags:
            spaced = label.lower()
            needles = {spaced, spaced.replace(' ', '')}

            def mentions(page, needles=needles):
                haystacks = (page.title.lower(), page.url.lower())
                return any(n in h for h in haystacks for n in needles)

            self.find(mentions).tag(label)
        return self

    def visualize(self, property='len'):
        """Plot a 100-bin histogram of one attribute across all pages."""
        summary = [getattr(u, property) for u in self.urls]
        plt.hist(summary, bins=100)

    def download(self, limit=1, rate=1, timeout=30):
        """Fetch the first ``limit`` pages and store each body in ``archives``.

        Args:
            limit: Number of pages, from the start of the collection, to fetch.
            rate: Requests per second; zero or a negative value disables the
                pause between requests.
            timeout: Seconds to wait for each response before giving up.
        """
        for page in self.urls[:limit]:
            # Without a timeout a stalled server would hang the whole run.
            response = requests.get(page.url, timeout=timeout)
            page.archives.append(response.text)
            if rate > 0:
                time.sleep(1 / rate)

    def save(self, path='./alexandria-library.txt', encoding='utf-8'):
        """Write the collection as base64 text of zlib-compressed JSON.

        Args:
            path: Destination file.
            encoding: Encoding applied to the JSON text before compression.
        """
        payload = json.dumps(self, default=vars)
        packed = base64.b64encode(zlib.compress(payload.encode(encoding)))
        # Base64 output is pure ASCII regardless of ``encoding``; decoding it
        # with the payload encoding would corrupt it for e.g. UTF-16.
        with open(path, 'w', encoding='ascii') as f:
            f.write(packed.decode('ascii'))

    def print(self, limit=100):
        """Print the first ``limit`` pages, one per line."""
        for u in self.urls[:limit]:
            print(u)

    def __getitem__(self, i):
        """Index or slice straight into the underlying page list."""
        return self.urls[i]

class Rule:
    """A condition on an object paired with the action to take when it holds.

    Attributes:
        when: Predicate called with the object under test.
        action: The action supplied by the caller, stored unchanged.
    """

    def __init__(self, z, op, value, action):
        """Build the rule.

        Args:
            z: Either a ready-made predicate, or the name of an attribute to
                compare.
            op: Two-argument comparison called as ``op(attribute, value)``;
                unused when ``z`` is callable.
            value: Right-hand operand for ``op``; unused when ``z`` is callable.
            action: What to do when the rule matches.
        """
        if callable(z):
            self.when = z
        else:
            self.when = lambda x: op(getattr(x, z), value)
        # Keep the action; otherwise the rule cannot do anything on a match.
        self.action = action
            
    
# Build the working collection from the saved tab list. With unpack=True,
# pages whose URL starts with the extension's park.html prefix are replaced
# by the address held in their `url` query parameter.
c = Collection()
c.load(
    './may-28.txt',
    unpack=True,
    lead='chrome-extension://fiabciakcmgepblmdkmemdbbkilneeeh/park.html',
)

# Scratch probes from earlier exploration (disabled):
# print(c.urls[100].parse)
# c.find(lambda x: len(x.url)>1000)[5]
# w = c.find(lambda x: 'Wikipedia' in x.title).tag('Wikipedia')
# w[0]
# t.created
# c.tag(t)[0].tags[0].name
# c[100].params
# c.find(lambda x: x.len < 600).visualize()

# Tag pages by site name. tag_if_in returns the collection it was called on,
# so `w` and `c` refer to the same object.
w = c.tag_if_in([
    'Wikipedia',
    'Google',
    'Colab',
    'Stack Overflow',
    'GitHub',
    'Twitter',
    'YouTube',
    'Stack Exchange',
    'Physics',
    'The New York Times',
    'NumPy',
])
# [([t.name for t in g.tags], g.url[-5:]) for g in w[:50]]

# NOTE(review): ops.eq compares the whole URL with 'wikipedia.org';
# presumably a containment test was intended — confirm. `r` is not used
# anywhere in the visible cells.
r = Rule('url', ops.eq, 'wikipedia.org', None)

# Fetch with the default limits, then persist to the default path.
w.download()
# w.print()
w.save()
# w[1].params

In [250]:
# w[1].params['title']

In [205]:
# Fetch one Stack Overflow question and peek at the first ten characters
# of the response body.
output = requests.get(
    'https://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-urllib3-and-requests-modul'
)
output.text[:10]

'<!DOCTYPE '

In [35]:
# NOTE(review): `data` is not defined in any visible cell — presumably left
# over from an earlier session; confirm before re-running.
data[1][5]

'e'

In [141]:
# NOTE(review): `data` is not defined in any visible cell — presumably left
# over from an earlier session; confirm before re-running.
len(data)

1002