# Alexandria

In [409]:
from urllib.parse import urlparse, parse_qs
import randomcolor
import datetime
import matplotlib.pyplot as plt
import operator as ops
import requests
import time
import json
import zlib
import base64
import itertools
import random
from termcolor import colored
import networkx as nx
from pyvis.network import Network

# Shared random-colour generator. In the visible code it is only referenced
# from a commented-out line in Tag.__init__.
rcolor = randomcolor.RandomColor()

def L(x, y):
    """Return a one-argument function that ignores its input and yields ``y``.

    The outer ``x`` is accepted but never used; it is shadowed by the
    parameter of the returned function.
    """
    def constant(x):
        return y

    return constant

## Page

In [508]:
class Page:
    """A bookmarked page parsed from a ``"<url> | <title>"`` string.

    Instance attributes:
        url: the page URL (replaced by the wrapped URL when unpacked).
        title: everything after the first ``" | "`` separator.
        parse: ``urlparse`` result of the original URL.
        params: ``parse_qs`` of the original query string, merged with the
            wrapped URL's parameters when unpacked.
        tags: list of tags attached through :meth:`tag`.
        len: length of the original URL.
        archives: list that callers may fill with downloaded page text.
        keywords: set of lower-cased terms taken from the URL and title.
    """

    # Characters treated as word separators when extracting keywords.
    _SEPARATORS = '.,/_-:?()[]'
    # URL fragments and very common words that are dropped from keywords.
    _STOPWORDS = frozenset({
        'http', 'https', 'www', 'com', 'an', 'in', 'the', 'with', 'and',
        'org', 'a', 'as', 'en', 'of', 'to', 'at', 'all', 'for', 'we', 'how',
        'it', 'do', 'why', 'be', 'have',
    })

    def __init__(self, info, unpack=False, lead='', **kwargs):
        """Parse ``info`` into a page.

        Args:
            info: ``"<url> | <title>"``; a string without spaces is taken as
                a bare URL with an empty title.
            unpack: when true and the URL starts with ``lead``, replace the
                URL with the one carried in its ``url`` query parameter.
            lead: URL prefix that marks a wrapper page.
            **kwargs: accepted and ignored.
        """
        if ' ' in info:
            parts = info.split(' | ')
        else:
            parts = info, ''
        self.url, self.title = parts[0], ''.join(parts[1:])
        self.parse = urlparse(self.url)
        self.params = parse_qs(self.parse.query)
        self.tags = []
        self.len = len(self.url)
        self.archives = []

        if unpack and self.url.startswith(lead):
            # Not every matching URL carries a ``url`` parameter (with the
            # default ``lead=''`` every URL matches), so look it up safely
            # instead of indexing and raising KeyError.
            wrapped = self.params.get('url')
            if wrapped:
                nested = Page(wrapped[0])
                self.url = nested.url
                self.params.update(nested.params)

        terms = itertools.chain.from_iterable(
            self.replace(text.lower(), ' ').split()
            for text in (self.url, self.title)
        )
        self.keywords = {
            t for t in terms if len(t) < 50 and not t.isnumeric()
        } - self._STOPWORDS

    def tag(self, tags, g=None):
        """Attach one or more tags to this page and return ``self``.

        Args:
            tags: a tag name, a ``Tag``, or an iterable of ``Tag`` objects.
            g: optional graph; when given, an edge from the page title to
                each tag name is added with ``weight=1``.
        """
        if isinstance(tags, str):
            tags = Tag(tags)
        if isinstance(tags, Tag):
            tags = [tags]
        # Materialise once so a one-shot iterable feeds both uses below.
        tags = list(tags)

        self.tags.extend(tags)
        if g is not None:
            for tag in tags:
                g.add_edge(self.title, tag.name, weight=1)

        return self

    def replace(self, s, n):
        """Return ``s`` with every separator character replaced by ``n``."""
        for p in self._SEPARATORS:
            s = s.replace(p, n)
        return s

    def print(self):
        """Print the one-line summary produced by ``str(self)``."""
        print(str(self))

    def __str__(self):
        """Return ``title | tags | keywords | url`` (URL cut to 100 chars)."""
        return ' | '.join([
            self.title,
            '; '.join(map(str, self.tags)),
            # Sorted so the summary is stable from run to run.
            '; '.join(sorted(self.keywords)),
            self.url[:100],
        ])

## Tag

In [411]:
class Tag:
    """A named label for pages.

    Every tag picks a random terminal colour at construction time, used when
    the tag is rendered with ``str``, and stores its creation time as text.
    """

    def __init__(self, name=''):
        palette = (
            'grey', 'red', 'green', 'yellow',
            'blue', 'magenta', 'cyan', 'white',
        )
        self.name = name
        self.color = random.choice(palette)
        self.created = str(datetime.datetime.now())

    def __str__(self):
        """Return the tag name wrapped in its colour via ``colored``."""
        return colored(self.name, self.color)
        

## Collection

In [538]:


class Collection:
    """A list of ``Page`` objects plus a graph linking page titles to tags.

    ``urls`` holds the pages in insertion order. ``graph`` is an
    ``nx.Graph`` with one node per page title and per tag name, and one edge
    per (page, tag) pair.
    """

    def __init__(self, urls=None, source='', encoding='utf8'):
        """Create a collection, optionally seeded with ``urls``.

        Args:
            urls: a URL string, a ``Page``, or an iterable of either.
            source: path of a file written by :meth:`save`. It is read and
                decoded, so a corrupt file raises.
            encoding: text encoding used for ``source``.
        """
        self.urls = []
        self.graph = nx.Graph()
        if urls:
            self.add(urls)

        if source:
            with open(source, 'r', encoding=encoding) as file:
                text = file.read()
            text = text.encode(encoding)
            text = base64.b64decode(text)
            text = zlib.decompress(text)
            text = text.decode(encoding)
            # NOTE(review): the decoded payload is not used to rebuild pages
            # yet; the collection stays as built from ``urls``.

    def load(self, path, limit=50, **kwargs):
        """Add up to ``limit`` non-empty lines of the file at ``path``.

        Line terminators are stripped so they do not end up in page titles.
        ``limit=None`` reads the whole file. Extra keyword arguments are
        forwarded to ``Page`` through :meth:`add`.
        """
        with open(path, 'r', encoding='utf8') as file:
            stripped = (line.rstrip('\r\n') for line in file)
            non_empty = (line for line in stripped if line)
            data = list(itertools.islice(non_empty, limit))
        self.add(data, **kwargs)

    def add(self, urls, **kwargs):
        """Append pages and register a graph node for each of them.

        Args:
            urls: a URL string, a ``Page``, or an iterable of either.
            **kwargs: forwarded to ``Page`` for items given as strings.

        Raises:
            TypeError: if an item is neither a string nor a ``Page``.
        """
        if isinstance(urls, (str, Page)):
            urls = [urls]
        for url in urls:
            if isinstance(url, str):
                new = Page(url, **kwargs)
            elif isinstance(url, Page):
                new = url
            else:
                raise TypeError(
                    f'expected str or Page, got {type(url).__name__}'
                )
            self.urls.append(new)
            self.graph.add_node(new.title, color='green', label=' ')

    def find(self, attr, value=None):
        """Return a new collection holding the pages that match.

        Args:
            attr: a predicate taking a page, or an attribute name that is
                compared with ``value`` using ``==``.
            value: the value compared against when ``attr`` is a name.
        """
        if callable(attr):
            predicate = attr
        else:
            # Keep the name in its own variable: the lambda must not read
            # the variable that ends up holding the lambda itself.
            name = attr
            predicate = lambda page: getattr(page, name) == value
        return Collection([u for u in self.urls if predicate(u)])

    def tag(self, tags):
        """Attach ``tags`` to every page and return ``self``.

        ``tags`` may be a tag name, a ``Tag``, or an iterable of ``Tag``.
        """
        if isinstance(tags, str):
            tags = Tag(tags)
        if isinstance(tags, Tag):
            tags = [tags]
        tags = list(tags)

        for u in self.urls:
            u.tags.extend(tags)
            for tag in tags:
                self.graph.add_edge(u.title, tag.name, weight=1)

        return self

    def tag_if_in(self, tags):
        """Tag each page whose title or URL contains a tag name.

        Matching ignores case and spaces on both sides, so ``'Stack
        Overflow'`` matches ``stackoverflow.com`` as well as a title that
        spells the name out. Each tag name also becomes a purple graph node.

        Args:
            tags: a tag name or an iterable of tag names.

        Returns:
            ``self``.
        """
        if isinstance(tags, str):
            tags = [tags]

        created = []
        for label in tags:
            new_tag = Tag(label)
            created.append(new_tag)
            self.graph.add_node(new_tag.name, color='purple')

        for new_tag in created:
            needle = new_tag.name.lower().replace(' ', '')
            for u in self.urls:
                haystacks = (
                    u.title.lower().replace(' ', ''),
                    u.url.lower().replace(' ', ''),
                )
                if any(needle in text for text in haystacks):
                    u.tag(new_tag, self.graph)

        return self

    def visualize(self, property='len'):
        """Plot a 100-bin histogram of one attribute across all pages."""
        summary = [getattr(u, property) for u in self.urls]
        plt.hist(summary, bins=100)

    def network(self, physics=False):
        """Render the graph with pyvis to ``./library-network.html``.

        The pyvis network is kept on ``self.vis``; the value returned by
        its ``show`` call is returned.
        """
        self.vis = Network(width=800, height=800, notebook=True)
        self.vis.from_nx(self.graph)
        self.vis.toggle_physics(physics)
        output = self.vis.show('./library-network.html')
        return output

    def download(self, limit=1, rate=1, timeout=30):
        """Fetch the first ``limit`` pages and store the text in ``archives``.

        Args:
            limit: number of pages to fetch, counted from the start.
            rate: requests per second; the loop sleeps ``1 / rate`` seconds
                after each request.
            timeout: seconds passed to ``requests.get`` so one unresponsive
                host cannot block the loop indefinitely.
        """
        for u in self.urls[:limit]:
            text = requests.get(u.url, timeout=timeout).text
            u.archives.append(text)
            time.sleep(1/rate)

    def save(self, path='./alexandria-library.txt', encoding='utf-8'):
        """Write the collection as base64-encoded, zlib-compressed JSON.

        Objects are serialised through ``vars``. Sets such as
        ``Page.keywords`` have no ``__dict__``, so they are written as
        sorted lists.
        """
        def encode(obj):
            if isinstance(obj, (set, frozenset)):
                return sorted(obj, key=str)
            return vars(obj)

        text = json.dumps(self, default=encode)
        text = zlib.compress(text.encode(encoding))
        text = base64.b64encode(text)
        with open(path, 'w', encoding=encoding) as f:
            f.write(text.decode(encoding))

    def print(self, limit=100):
        """Print the first ``limit`` pages, one per line."""
        for u in self.urls[:limit]:
            print(u)

    def __getitem__(self, i):
        """Return the page (or slice of pages) at index ``i``."""
        return self.urls[i]

class Rule:
    """A predicate over objects, exposed as the ``when`` attribute.

    ``z`` is either a ready-made predicate or an attribute name; in the
    latter case ``when(x)`` evaluates ``op(getattr(x, z), value)``.
    ``action`` is accepted but not stored or used.
    """

    def __init__(self, z, op, value, action):
        def compare(x):
            return op(getattr(x, z), value)

        self.when = z if callable(z) else compare
            
    

## Testing

In [544]:
# Build a collection (reading the saved library file), then add lines from
# ./may-28.txt. ``unpack`` and ``lead`` are forwarded to Page so that URLs
# starting with the park.html extension page are replaced by the URL carried
# in their ``url`` query parameter.
c = Collection(source='./alexandria-library.txt')
c.load('./may-28.txt', unpack=True, lead='chrome-extension://fiabciakcmgepblmdkmemdbbkilneeeh/park.html', limit=200)

# print(c.urls[100].parse)
# c.find(lambda x: len(x.url)>1000)[5]
# w = c.find(lambda x: 'Wikipedia' in x.title).tag('Wikipedia')
# w[0]
# t.created
# c.tag(t)[0].tags[0].name
# c[100].params
# c.find(lambda x: x.len < 600).visualize()

# Tag every page whose title or URL mentions one of these names. As the last
# expression of the cell, the returned collection is what the notebook shows.
c.tag_if_in(['Wikipedia', 'Google', 'Colab', 'Stack Overflow', 'GitHub', 'Twitter', 'YouTube', 'Stack Exchange', 'Physics', 'The New York Times', 'NumPy'])
# w.tag('Page')
# [([t.name for t in g.tags], g.url[-5:]) for g in w[:50]]

# r = Rule('url', ops.eq, 'wikipedia.org', None)
# w.download()
# w.print()
# w.save()
# w.graph.edges

# c.network(physics=True)

<__main__.Collection at 0x11d473f1100>

In [545]:
# Build a standalone pyvis network from a collection's graph.
# NOTE(review): ``w`` is only assigned in a commented-out line of the testing
# cell, so this cell relies on ``w`` surviving from an earlier kernel run.
net = Network(width=800, height=800, notebook=True)
net.from_nx(w.graph)
# net.show("./library-network.html")

In [455]:
# print(w.graph.edges)

[]


In [416]:
# Quick check of termcolor: the result is the text wrapped in ANSI escapes.
colored('test', 'red')

'\x1b[31mtest\x1b[0m'

In [417]:
# w[1].params['title']

In [418]:
# Fetch a Stack Overflow page and show the first 10 characters of the body.
output = requests.get('https://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-urllib3-and-requests-modul')
output.text[:10]

'<!DOCTYPE '

In [419]:
# NOTE(review): ``data`` is not defined at module level in the visible cells
# (it is only a local in Collection.load); this relies on earlier kernel state.
data[1][5]

'e'

In [420]:
# NOTE(review): ``data`` comes from earlier kernel state, not from a visible
# cell.
len(data)

52