In [None]:
# # display most watched

# from yourtube.file_operations import get_youtube_watched_ids

# w = get_youtube_watched_ids()
# l = list(w.items())
# l.sort(key=lambda pair: len(pair[1]), reverse=True)
# most_watched, watch_times_lists = zip(*l)

# display_video_links(G, most_watched[:8])

In [None]:
# Load the optional lab_black IPython extension when it is installed.
# The rest of the notebook does not depend on it, so a missing module is ignored.
try:
    %load_ext lab_black
except ModuleNotFoundError:
    pass

In [None]:
import os
import sys
import textwrap
import random
import functools

import networkx as nx
import numpy as np
import webbrowser
from time import time
from collections import Counter
from IPython.core.display import display, HTML
from IPython.display import Javascript
from scipy.cluster.hierarchy import cut_tree, to_tree, leaves_list
from ipywidgets import Button, HBox, VBox, Output, Layout, Image, Checkbox
from ipyevents import Event
from krakow import krakow
from krakow.utils import (
    plot_dendrogram,
    normalized_dasgupta_cost,
    split_into_n_children,
)

sys.path.append(os.path.abspath(".."))

import matplotlib.pyplot as plt

plt.style.use("dark_background")

from yourtube.scraping import scrape_from_list
from yourtube.file_operations import (
    save_graph,
    load_graph,
    id_to_url,
)

# import matplotlib.cm as cm
# import matplotlib.pyplot as plt

# Notebook-wide settings.
# `balance` is forwarded to krakow() through cluster_subgraph().
balance = 1.8
# Layout of the video wall: number of cluster columns and thumbnails per column.
num_of_columns = 3
videos_in_column = 5

# URL template for a video's thumbnail; "{}" is filled with the video id.
# id_to_thumbnail = "https://i.ytimg.com/vi/{}/mqdefault.jpg"
id_to_thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg"
# available thumbnail sizes: mq < hq < sd < maxres

In [None]:
# Load the video graph that every function and widget below operates on.
G = load_graph()

In [None]:
# ranking functions


def rank_nodes_by_in_degree(G, source_videos, nodes):
    """Rank `nodes` by how many distinct source videos link to them.

    A node's rank is the number of distinct tails of its in-edges in `G`
    that are members of `source_videos`.

    Returns a pair `(ids, node_ranks)`:
      * `ids` - tuple of the nodes ordered from highest to lowest rank
        (nodes with equal rank keep the order in which they were given),
      * `node_ranks` - dict mapping every node to its rank.

    When `nodes` is empty, it is returned unchanged with an empty dict.
    """
    sources = set(source_videos)
    node_ranks = {}
    # if there is nothing to rank, return nothing
    if not nodes:
        return nodes, node_ranks

    for candidate in nodes:
        predecessors = {tail for tail, _head in G.in_edges(candidate)}
        node_ranks[candidate] = len(predecessors & sources)

    # sorted() is stable, so ties stay in insertion order
    ordered = sorted(node_ranks, key=node_ranks.get, reverse=True)
    return tuple(ordered), node_ranks


# def recommend(SubG, pickiness=0):
#     # this line recommends very normie videos
#     # it's equivalent to pickiness==0
#     # recs = sorted(SubG.in_degree(), key=lambda pair: pair[1], reverse=True)

#     # to be honest, I don't fully understand this part
#     # but it works better than the one on top:
#     # first limit recs only to the best ones
#     # this way, we'll omit most general normie recommendations later
#     best_recs = [node for node, in_degree in SubG.in_degree() if in_degree >= pickiness]
#     #     best_recs = [node for node, degree in SubG.degree() if degree >= pickiness]

#     ids, scores = sort_nodes_by_in_degree(SubG, best_recs)
#     return ids, scores


def liked_to_views_ratio(G, id_):
    """Return like_count / view_count for node `id_`, or -1 when unavailable.

    -1 is returned when either attribute is missing, is not a number,
    or when the view count is zero.
    """
    attrs = G.nodes[id_]
    try:
        ratio = attrs["like_count"] / attrs["view_count"]
    except (KeyError, TypeError, ZeroDivisionError):
        ratio = -1
    return ratio

In [None]:
# filtering functions


def added_in_last_n_years(G, ids, n=5):
    """Keep the ids whose "time_added" attribute falls within the last `n` years.

    A year is counted as 365 days. Nodes without a "time_added" attribute
    are dropped. The order of `ids` is preserved.
    """
    seconds_in_year = 365 * 24 * 60 * 60
    cutoff = time() - seconds_in_year * n

    return [
        id_
        for id_ in ids
        if "time_added" in G.nodes[id_] and cutoff < G.nodes[id_]["time_added"]
    ]


def only_not_watched(G, ids):
    """Keep the ids whose node has no (or an empty) "watched_times" attribute."""
    return [id_ for id_ in ids if not G.nodes[id_].get("watched_times")]


def only_watched(G, ids):
    """Keep the ids whose node has a non-empty "watched_times" attribute."""
    return [id_ for id_ in ids if G.nodes[id_].get("watched_times")]


def get_neighborhood(G, ids):
    """Return the nodes of the subgraph formed by the edges going out of `ids`."""
    return list(G.edge_subgraph(G.out_edges(ids)).nodes)


def from_category(G, ids, categories):
    """Keep the ids whose node "category" attribute is one of `categories`.

    Nodes without a category are looked up as None, so they are kept only
    when None itself is in `categories`.
    """
    return [id_ for id_ in ids if G.nodes[id_].get("category") in categories]

In [None]:
def cluster_subgraph(nodes_to_cluster, G, balance=2, plot=False):
    """Cluster the largest connected component of a subgraph of `G`.

    The subgraph induced by `nodes_to_cluster` is made undirected, only its
    biggest connected component is kept, and that component is passed to
    krakow() with the given `balance`. The result is converted with
    scipy's to_tree().

    When `plot` is true, the dendrogram is also drawn.

    Returns the root of the tree, with every leaf id replaced by the
    corresponding node id of the clustered graph.
    """
    undirected = G.subgraph(nodes_to_cluster).to_undirected()

    # choose only the biggest connected component
    by_size = sorted(nx.connected_components(undirected), key=len, reverse=True)
    main_graph = undirected.subgraph(by_size[0])

    linkage = krakow(main_graph, balance=balance)
    tree = to_tree(linkage)

    if plot:
        plot_dendrogram(linkage, clusters_limit=200, width=22)

    # each leaf id is used as an index into the node list of the clustered graph
    position_to_id = np.array(main_graph.nodes)

    def relabel_leaf(leaf):
        leaf.id = position_to_id[leaf.id]

    tree.pre_order(relabel_leaf)
    return tree

In [None]:
def display_video_links(G, ids, node_ranks=None, text_width=42, text_height=4):
    """Show a clickable thumbnail, a stats line and the title for each video.

    It assumes that all the ids passed here were already scraped.
    Videos whose node carries an "is_down" attribute are skipped.

    `node_ranks` optionally maps ids to the rank printed under the thumbnail;
    without it the rank is printed as None. Titles are wrapped to `text_width`
    characters and padded with blank lines up to `text_height` lines.
    """

    def open_in_browser(_, url):
        webbrowser.open(url)
        # alternative is to use Javascript https://stackoverflow.com/a/61900572/11756613
        # and it works even when jupyter is remote
        # but here, when called by an event, it's broken for some reason

    for video_id in ids:
        attrs = G.nodes[video_id]
        if "is_down" in attrs:
            continue

        video_url = id_to_url.format(video_id)
        raw_title = attrs["title"]

        # thumbnail that opens the video when clicked
        thumbnail = Image.from_url(id_to_thumbnail.format(video_id))
        click = Event(source=thumbnail, watched_events=["click"])
        click.on_dom_event(functools.partial(open_in_browser, url=video_url))
        display(thumbnail)

        if node_ranks is not None:
            rank = node_ranks[video_id]
        else:
            rank = None
        likes_to_views = int(liked_to_views_ratio(G, video_id) * 1000)
        print(f"rank: {rank}   l/v: {likes_to_views}")

        # make title wrap correctly and always take up the same number of lines
        # maximum youtube title length is 100 chars
        title_lines = textwrap.wrap(raw_title, width=text_width)
        title_lines.extend([""] * (text_height - len(title_lines)))
        print("\n".join(title_lines))

        # display(HTML(f"""<a href="{url}">{prefix} {title}</a>"""))
        # display(HTML(f"""<a href="{url}"><img src="{image_url}"></a>"""))
        # display(HTML(f"""<textarea rows="3">{prefix} {title}</textarea>"""))

In [None]:
class Categories:
    """Panel of checkboxes for choosing which video categories are active.

    One checkbox (initially checked) is created for every category found in
    the graph, ordered from the most to the least frequent. The panel, with
    "Select all" / "Deselect all" buttons on top, is exposed as
    `checkboxes_vbox`.
    """

    def __init__(self, G):
        # count how many nodes carry each category
        tally = Counter(
            attrs["category"] for _, attrs in G.nodes.data() if "category" in attrs
        )

        # one checkbox per category, most common categories first
        self.category_checkboxes = [
            Checkbox(description=name, value=True)
            for name, _count in tally.most_common()
        ]

        select_all_button = Button(description="Select all")
        deselect_all_button = Button(description="Deselect all")
        select_all_button.on_click(self.select_all)
        deselect_all_button.on_click(self.deselect_all)

        self.checkboxes_vbox = VBox(
            [select_all_button, deselect_all_button, *self.category_checkboxes]
        )

    def _set_all(self, state):
        """Set every category checkbox to `state`."""
        for box in self.category_checkboxes:
            box.value = state

    def select_all(self, _):
        """Button callback: check every category."""
        self._set_all(True)

    def deselect_all(self, _):
        """Button callback: uncheck every category."""
        self._set_all(False)

    def get_checked_categories_list(self):
        """Return the names of the currently checked categories."""
        return [box.description for box in self.category_checkboxes if box.value]

In [None]:
def select_nodes_to_cluster(G, use_watched=False, categories="all"):
    """Pick the nodes of `G` that should be clustered.

    The seeds are the videos added in the last 5 years, optionally joined
    with all watched videos (`use_watched`). Unless `categories` is the
    string "all", the seeds are then restricted to those categories.

    Returns the neighborhood of the seeds as given by get_neighborhood().
    """
    seeds = added_in_last_n_years(G, G.nodes, n=5)

    if use_watched:
        watched_videos = only_watched(G, G.nodes)
        seeds = set(seeds) | set(watched_videos)

    if categories != "all":
        seeds = from_category(G, seeds, categories)

    neighborhood = get_neighborhood(G, seeds)
    return neighborhood

In [None]:
class VideoWall:
    """Interactive wall of video recommendations organised by cluster.

    The selected part of the graph is clustered into a tree. The current
    tree node is split into `num_of_columns` child clusters and each column
    shows up to `videos_in_column` thumbnails from one child. A middle click
    on a column descends into that child; "Go back" returns to the parent.

    The complete widget is available as `whole_output`.
    """

    def __init__(
        self,
        G,
        num_of_columns=3,
        videos_in_column=5,
        width=1000,
    ):
        """Build the widgets, wire the callbacks and run the first clustering.

        `width` is the total width in pixels shared evenly by the columns.
        """
        self.G = G
        self.categories_selector = Categories(G)
        self.num_of_columns = num_of_columns
        self.videos_in_column = videos_in_column
        # one seed per wall, so refreshing a cluster keeps showing the same picks
        self.random_seed = random.randint(0, 1000000)

        # navigation state, filled in by recluster() / update_displayed_videos()
        self.tree = None  # cluster currently displayed
        self.path = []  # ancestors of self.tree, used by go_back()
        self.children = []  # sub-clusters shown in the columns
        self.source_videos = []

        column_width = width / self.num_of_columns
        layout = Layout(width=f"{column_width}px")
        self.columns = [Output(layout=layout) for _ in range(num_of_columns)]
        self.message_output = Output()
        go_back_button = Button(description="Go back")
        go_back_button.on_click(self.go_back)
        self.hide_watched_checkbox = Checkbox(description="Hide watched", value=True)
        self.hide_watched_checkbox.observe(self.update_displayed_videos, names="value")
        self.use_watched_checkbox = Checkbox(
            description="Use watched videos", value=False
        )
        if len(added_in_last_n_years(self.G, self.G.nodes)) < 400:
            # if there are too few videos in playlists, it's better to also use watched videos
            # (set before observe() is attached, so it does not trigger a recluster)
            self.use_watched_checkbox.value = True
        self.use_watched_checkbox.observe(self.recluster, names="value")
        self.recluster_button = Button(description="Recluster")
        self.recluster_button.on_click(self.recluster)

        self.whole_output = VBox(
            [
                HBox(
                    [
                        go_back_button,
                        self.hide_watched_checkbox,
                        self.use_watched_checkbox,
                        self.recluster_button,
                    ]
                ),
                self.message_output,
                HBox(self.columns + [self.categories_selector.checkboxes_vbox]),
            ]
        )

        # bind middle click to choose_column
        for i, output in enumerate(self.columns):
            event = Event(source=output, watched_events=["auxclick"])
            func = functools.partial(self.choose_column, i=i)
            event.on_dom_event(func)

        # bind click on categories to a reload
        event = Event(
            source=self.categories_selector.checkboxes_vbox, watched_events=["click"]
        )
        event.on_dom_event(self.update_displayed_videos)

        self.recluster()

    def recluster(self, _=None):
        """Recompute the cluster tree from the current settings and show its root.

        Usable as a widget callback; the callback argument is ignored.
        """
        to_cluster = select_nodes_to_cluster(
            self.G,
            use_watched=self.use_watched_checkbox.value,
            categories=self.categories_selector.get_checked_categories_list(),
        )
        with self.message_output:
            self.message_output.clear_output()
            print("number of videos: ", len(to_cluster))
        if not to_cluster:
            # cluster_subgraph() cannot handle an empty selection;
            # keep whatever is currently displayed
            return
        for column in self.columns:
            with column:
                column.clear_output()

        tree = cluster_subgraph(to_cluster, self.G, balance=balance, plot=False)

        self.tree = tree
        self.path = []
        video_ids = self.tree.pre_order()
        self.source_videos = added_in_last_n_years(self.G, video_ids)
        # TODO these may not really be source videos!

        self.update_displayed_videos()

    def choose_column(self, event, i):
        """Descend into the cluster shown in column `i`.

        Nothing changes when that cluster is too small to be split again,
        so `tree`, `children` and `path` always stay consistent.
        """
        if i >= len(self.children):
            return
        child = self.children[i]
        if child.count < self.num_of_columns:
            with self.message_output:
                self.message_output.clear_output()
                print("already on the lowest cluster")
            return

        self.path.append(self.tree)
        self.tree = child
        self.message_output.clear_output()
        self.update_displayed_videos()

    def go_back(self, _=None):
        """Return to the parent of the current cluster, if there is one."""
        if not self.path:
            with self.message_output:
                self.message_output.clear_output()
                print("already on the highest cluster")
            return
        self.tree = self.path.pop()
        self.update_displayed_videos()

    def update_displayed_videos(self, _=None):
        """Split the current cluster and refresh every column.

        For each child cluster the videos are filtered (watched / category),
        ranked with rank_nodes_by_in_degree(), and a seeded random sample of
        the top-ranked ones is displayed. The shown videos are passed to
        scrape_from_list() first, and the graph is saved at the end.

        Usable as a widget callback; the callback argument is ignored.
        """
        if self.tree is None:
            # nothing has been clustered yet
            return
        if self.tree.count < self.num_of_columns:
            with self.message_output:
                self.message_output.clear_output()
                print("already on the lowest cluster")
            return

        self.children = split_into_n_children(self.tree, n=self.num_of_columns)
        checked_categories = self.categories_selector.get_checked_categories_list()

        all_ranked_ids = []
        all_node_ranks = dict()
        cluster_sizes = []
        for child in self.children:
            ids = child.pre_order()
            if self.hide_watched_checkbox.value:
                ids = only_not_watched(self.G, ids)
            ids = from_category(self.G, ids, checked_categories)
            cluster_sizes.append(len(ids))

            ranked_ids, node_ranks = rank_nodes_by_in_degree(
                self.G, self.source_videos, ids
            )
            all_node_ranks = node_ranks | all_node_ranks

            # sample from the best sixth of the cluster (at most 60 videos),
            # but never from fewer than one column's worth
            top = min(60, len(ranked_ids) // 6)
            top = max(top, self.videos_in_column)
            ranked_ids = list(ranked_ids[:top])
            # a private generator gives the same shuffle as seeding the global
            # one, without disturbing other users of the random module
            random.Random(self.random_seed).shuffle(ranked_ids)
            ranked_ids = ranked_ids[: self.videos_in_column]
            all_ranked_ids.append(ranked_ids)

        # scrape all the absent videos
        to_scrape = [id_ for sub in all_ranked_ids for id_ in sub]
        scrape_from_list(
            to_scrape, self.G, skip_if_fresher_than=float("inf"), non_verbose=True
        )

        for column, ranked_ids, cluster_size in zip(
            self.columns, all_ranked_ids, cluster_sizes
        ):
            with column:
                column.clear_output(wait=True)
                print("total videos: ", cluster_size)
                display_video_links(self.G, ranked_ids, all_node_ranks)

        save_graph(self.G)

In [None]:
# B = load_graph("basia")

In [None]:
# Build the wall from the loaded graph using the layout settings defined above.
video_wall = VideoWall(G, num_of_columns, videos_in_column)

# The last expression of the cell renders the whole widget.
video_wall.whole_output