In [None]:
# Try to load the optional `lab_black` IPython extension (the message below
# attributes it to nb_black). If the module is missing, print a notice and
# carry on; nothing else in the notebook depends on it.
try:
    %load_ext lab_black
except ModuleNotFoundError:
    print("nb_black not installed")

In [None]:
import os
import sys
import networkx as nx
import numpy as np
import functools
import webbrowser
import random
from time import time
from IPython.core.display import display, HTML
from IPython.display import Javascript
from scipy.cluster.hierarchy import cut_tree, to_tree, leaves_list
from ipywidgets import Button, HBox, VBox, Output, Layout, Image, Checkbox
from ipyevents import Event
from krakow import krakow
from krakow.utils import (
    plot_dendrogram,
    normalized_dasgupta_cost,
    split_into_n_children,
)

sys.path.append(os.path.abspath(".."))
import matplotlib.pyplot as plt

plt.style.use("dark_background")

from yourtube.scraping import scrape_from_list
from yourtube.file_operations import (
    save_graph,
    load_graph,
    id_to_url,
)

# import matplotlib.cm as cm
# import matplotlib.pyplot as plt

# Tunable settings used by the cells below.
balance = 1.8  # forwarded unchanged to krakow(..., balance=balance)
num_of_columns = 3  # passed to VideoWall: number of columns (sub-clusters) shown
videos_in_column = 5  # passed to VideoWall: at most this many videos per column

# Thumbnail URL template; it is formatted with a video id.
# id_to_thumbnail = "https://i.ytimg.com/vi/{}/mqdefault.jpg"
id_to_thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg"
# mq < hq < sd < maxres

In [None]:
# Load the video graph with yourtube.file_operations.load_graph (default arguments).
G = load_graph()

In [None]:
# # display most watched
# from yourtube.file_operations import get_youtube_watched_ids

# w = get_youtube_watched_ids()
# l = list(w.items())
# l.sort(key=lambda pair: len(pair[1]), reverse=True)
# most_watched, watch_times_lists = zip(*l)

# display_video_links(G, most_watched[:8])

In [None]:
# ranking functions


def rank_nodes_by_in_degree(G, source_videos, nodes):
    """Rank ``nodes`` by how many source videos point at them.

    The rank of a node is the number of distinct members of ``source_videos``
    that have an edge ending in that node (as reported by ``G.in_edges``).

    Args:
        G: graph object exposing ``in_edges(node)`` that yields ``(u, v)`` pairs.
        source_videos: iterable of node ids counted as sources.
        nodes: iterable of node ids to rank.

    Returns:
        A pair ``(ids, node_ranks)``: ``ids`` is a tuple of the ranked nodes in
        descending rank order (ties keep their input order) and ``node_ranks``
        maps every ranked node to its rank. For empty ``nodes`` the result is
        ``((), {})`` instead of an unpacking error.
    """
    source_videos = set(source_videos)
    node_ranks = {
        node: len({u for u, _ in G.in_edges(node)} & source_videos)
        for node in nodes
    }

    # sorted() is stable, also with reverse=True, so equally ranked nodes keep
    # the order in which they were passed in.
    ids = tuple(sorted(node_ranks, key=node_ranks.get, reverse=True))
    return ids, node_ranks


# def recommend(SubG, pickiness=0):
#     # this line recommends very normie videos
#     # it's equivalent to pickiness==0
#     # recs = sorted(SubG.in_degree(), key=lambda pair: pair[1], reverse=True)

#     # to be honest, I don't fully understand this part
#     # but it works better than the one on top:
#     # first limit recs only to the best ones
#     # this way, we'll omit most general normie recommendations later
#     best_recs = [node for node, in_degree in SubG.in_degree() if in_degree >= pickiness]
#     #     best_recs = [node for node, degree in SubG.degree() if degree >= pickiness]

#     ids, scores = sort_nodes_by_in_degree(SubG, best_recs)
#     return ids, scores


def liked_to_views_ratio(G, id_):
    """Return ``like_count / view_count`` for node ``id_`` of ``G``.

    When either attribute is missing, has a type that cannot be divided, or
    ``view_count`` is zero, the sentinel ``-1`` is returned instead.
    """
    attrs = G.nodes[id_]
    try:
        likes = attrs["like_count"]
        views = attrs["view_count"]
        ratio = likes / views
    except (KeyError, TypeError, ZeroDivisionError):
        ratio = -1
    return ratio

In [None]:
# filtering functions


def added_in_last_n_years(G, ids, n=5):
    """Keep the ids whose ``"time_added"`` attribute is newer than ``n`` years ago.

    A year is counted as 365 days. Nodes without a ``"time_added"`` attribute
    are dropped. The input order is preserved.
    """
    seconds_in_year = 60 * 60 * 24 * 365
    cutoff = time() - seconds_in_year * n

    return [
        id_
        for id_ in ids
        if "time_added" in G.nodes[id_] and cutoff < G.nodes[id_]["time_added"]
    ]


def only_not_watched(G, ids):
    """Keep the ids whose node has no ``"watched_times"`` entry or an empty one.

    The input order is preserved.
    """

    def never_watched(id_):
        # A missing attribute is treated the same as an empty list.
        return G.nodes[id_].get("watched_times", []) == []

    return [id_ for id_ in ids if never_watched(id_)]


def neighborhood(G, ids):
    """Return the nodes of the subgraph formed by the edges going out of ``ids``.

    Only nodes that take part in at least one such edge are present, so a
    member of ``ids`` without outgoing edges is not part of the result.
    """
    return G.edge_subgraph(G.out_edges(ids)).nodes

In [None]:
# Seed set: every node of G whose "time_added" lies within the default window
# of added_in_last_n_years (n=5).
sources1 = added_in_last_n_years(G, G.nodes)
# Nodes taking part in edges that leave the seed set, then the same step applied
# once more starting from that neighbourhood.
neighborhood1 = neighborhood(G, sources1)
wide_neighborhood1 = neighborhood(G, neighborhood1)

# Report the size of each set.
print("sources: ", len(sources1))
print("neighborhood: ", len(neighborhood1))
print("wide neighborhood: ", len(wide_neighborhood1))

In [None]:
# B = load_graph("basia")

In [None]:
# sources2 = added_in_last_n_years(B, B.nodes)
# neighborhood2 = neighborhood(B, sources2)
# wide_neighborhood2 = neighborhood(G, neighborhood2)

# print("sources: ", len(sources2))
# print("neighborhood: ", len(neighborhood2))
# print("wide neighborhood: ", len(wide_neighborhood2))

In [None]:
# Node set that gets clustered below. Swap in the commented alternative to
# cluster the wider neighbourhood instead.
to_cluster = neighborhood1  # & neighborhood2
# to_cluster = wide_neighborhood1  # & wide_neighborhood2
# len(to_cluster)

In [None]:
# Restrict G to the selected nodes and drop edge directions.
RecentDirected = G.subgraph(to_cluster)
Recent = RecentDirected.to_undirected()
print("number of videos: ", Recent.number_of_nodes())

# choose only the biggest connected component
# NOTE(review): components[0] below raises IndexError when Recent has no nodes.
components = sorted(nx.connected_components(Recent), key=len, reverse=True)
# for el in components[:5]:
#     print(len(el))
main_component = components[0]
Main = Recent.subgraph(main_component)

# Cluster the main component with krakow, then turn the result into a tree with
# scipy's to_tree (presumably D is a scipy-style linkage matrix - TODO confirm).
D = krakow(Main, balance=balance)
tree = to_tree(D)


def convert_leaf_values_to_original_ids(tree, Graph):
    """Replace every leaf's positional id with the matching node id of ``Graph``.

    Each leaf id is used as an index into the node ordering of ``Graph``
    (``Graph.nodes``), and the leaf's ``id`` attribute is overwritten in place
    with the node found at that position.

    Args:
        tree: tree object whose ``pre_order(func)`` applies ``func`` to its leaves.
        Graph: graph whose ``nodes`` iterate in the order the leaf ids refer to.

    Returns:
        None. ``tree`` is modified in place, so calling this twice on the same
        tree is an error: the ids are then no longer integer positions.
    """
    # Use the graph that was passed in, not a notebook-level global.
    ids_by_position = np.array(Graph.nodes)

    def substitute_video_id(leaf):
        leaf.id = ids_by_position[leaf.id]

    tree.pre_order(substitute_video_id)


# Rewrite the leaf ids of `tree` in place so they hold node ids of Main
# instead of positions. Run this only once per freshly built tree.
convert_leaf_values_to_original_ids(tree, Main)

In [None]:
# plot_dendrogram(D, clusters_limit=500, width=22)
# normalized_dasgupta_cost(Main, D)

In [None]:
def display_video_links(G, ids, node_ranks=None):
    """Show a clickable thumbnail plus rank, likes/views and title per video.

    It assumes that all the ids passed here were already scraped. Videos whose
    node carries an ``"is_down"`` attribute are skipped. Clicking a thumbnail
    opens the video URL with ``webbrowser``.
    """

    def make_opener(url):
        # alternative is to use Javascript https://stackoverflow.com/a/61900572/11756613
        # and it works even when jupyter is remote
        # but here, when called by an event, it's broken for some reason
        def open_in_browser(_event):
            webbrowser.open(url)

        return open_in_browser

    for video_id in ids:
        attrs = G.nodes[video_id]
        if "is_down" in attrs:
            continue

        video_url = id_to_url.format(video_id)
        title = attrs["title"]

        thumbnail = Image.from_url(id_to_thumbnail.format(video_id))
        click = Event(source=thumbnail, watched_events=["click"])
        click.on_dom_event(make_opener(video_url))
        display(thumbnail)

        if node_ranks is None:
            rank = None
        else:
            rank = node_ranks[video_id]
        # The ratio is shown in per-mille, truncated to an integer.
        likes_to_views = int(liked_to_views_ratio(G, video_id) * 1000)
        print(f"rank: {rank}   l/v: {likes_to_views}")
        print(f"{title}\n\n")

        # display(HTML(f"""<a href="{url}">{prefix} {title}</a>"""))
        # display(HTML(f"""<a href="{url}"><img src="{image_url}"></a>"""))
        # display(HTML(f"""<textarea rows="3">{prefix} {title}</textarea>"""))

In [None]:
class VideoWall:
    """Interactive wall of video thumbnails for walking down a cluster tree.

    The current cluster (``self.tree``) is split into ``num_of_columns``
    sub-clusters and each column shows a sample of the top-ranked videos of one
    of them. Middle-clicking a column descends into that sub-cluster, the
    "Go back" button returns to the parent cluster and the "Hide watched"
    checkbox toggles filtering of already watched videos.

    Display ``self.whole_output`` to render the widget.
    """

    def __init__(self, tree, G, num_of_columns=3, videos_in_column=5, width=1000):
        """Build the widgets and show the top-level cluster.

        Args:
            tree: root of the cluster tree; leaves must carry node ids of ``G``.
            G: graph holding the video nodes and their attributes.
            num_of_columns: number of sub-clusters shown side by side.
            videos_in_column: maximum number of videos listed per column.
            width: total width of the wall in pixels, shared equally by columns.
        """
        self.tree = tree
        self.G = G
        self.num_of_columns = num_of_columns
        self.videos_in_column = videos_in_column
        # Stack of ancestor clusters, used by go_back.
        self.path = []
        # Sub-clusters currently shown, one per column; filled by
        # update_displayed_videos.
        self.children = []
        # Fixed per wall so that re-rendering a cluster shows the same sample.
        self.random_seed = random.randint(0, 1000000)

        video_ids = self.tree.pre_order()
        self.source_videos = added_in_last_n_years(self.G, video_ids)
        # TODO these may not really be source videos!

        column_width = width / self.num_of_columns
        layout = Layout(width=f"{column_width}px")
        self.columns = [Output(layout=layout) for _ in range(num_of_columns)]
        self.message_output = Output()
        go_back_button = Button(description="Go back")
        go_back_button.on_click(self.go_back)
        self.hide_watched_checkbox = Checkbox(description="Hide watched", value=True)
        self.hide_watched_checkbox.observe(self.update_displayed_videos, names="value")

        self.whole_output = VBox(
            [
                HBox([go_back_button, self.hide_watched_checkbox]),
                self.message_output,
                HBox(self.columns),
            ]
        )

        # bind middle click to choose_column
        for i, output in enumerate(self.columns):
            event = Event(source=output, watched_events=["auxclick"])
            func = functools.partial(self.choose_column, i=i)
            event.on_dom_event(func)

        self.update_displayed_videos()

    def _show_message(self, text):
        """Replace the content of the message area with ``text``."""
        with self.message_output:
            self.message_output.clear_output()
            print(text)

    def choose_column(self, event, i):
        """Descend into the sub-cluster shown in column ``i``.

        The navigation state is changed only when the chosen sub-cluster can
        itself be split into ``num_of_columns`` parts; otherwise a message is
        shown and the wall stays where it is.
        """
        if i >= len(self.children):
            # Nothing is displayed in that column yet.
            return

        child = self.children[i]
        if child.count < self.num_of_columns:
            self._show_message("already on the lowest cluster")
            return

        self.path.append(self.tree)
        self.tree = child
        self.message_output.clear_output()
        self.update_displayed_videos()

    def go_back(self, _):
        """Return to the parent cluster, if there is one."""
        if self.path == []:
            self._show_message("already on the highest cluster")
            return
        self.tree = self.path.pop()
        # Drop any stale message left by an earlier navigation attempt.
        self.message_output.clear_output()
        self.update_displayed_videos()

    def update_displayed_videos(self, _=None):
        """Re-split the current cluster and redraw every column.

        The optional positional argument is ignored; it exists so the method
        can be used directly as a widget callback.
        """
        if self.tree.count < self.num_of_columns:
            self._show_message("already on the lowest cluster")
            return

        self.children = split_into_n_children(self.tree, n=self.num_of_columns)

        all_ranked_ids = []
        all_node_ranks = dict()
        for child in self.children:
            ids = child.pre_order()
            if self.hide_watched_checkbox.value:
                ids = only_not_watched(self.G, ids)

            if not ids:
                # Every video of this sub-cluster was filtered out: keep the
                # column (so indices still match self.children) but leave it
                # empty.
                all_ranked_ids.append([])
                continue

            ranked_ids, node_ranks = rank_nodes_by_in_degree(
                self.G, self.source_videos, ids
            )
            all_node_ranks = node_ranks | all_node_ranks

            # Sample from the top 10% (capped at 40), but never from fewer
            # candidates than there are slots in a column.
            top = min(40, len(ranked_ids) // 10)
            top = max(top, self.videos_in_column)
            ranked_ids = list(ranked_ids[:top])
            # A private generator gives the same permutation as seeding the
            # module-level one, without resetting the global random state.
            random.Random(self.random_seed).shuffle(ranked_ids)
            ranked_ids = ranked_ids[: self.videos_in_column]
            all_ranked_ids.append(ranked_ids)

        # scrape all the absent videos
        to_scrape = [id_ for sub in all_ranked_ids for id_ in sub]
        if to_scrape:
            scrape_from_list(
                to_scrape, self.G, skip_if_fresher_than=float("inf"), non_verbose=True
            )

        for column, ranked_ids in zip(self.columns, all_ranked_ids):
            column.clear_output(wait=True)
            with column:
                display_video_links(self.G, ranked_ids, all_node_ranks)

        save_graph(self.G)

In [None]:
# Build the wall from the cluster tree and the full graph, using the
# notebook-level column settings.
video_wall = VideoWall(tree, G, num_of_columns, videos_in_column)

# Last expression of the cell, so the widget is rendered as the cell output.
video_wall.whole_output