In [None]:
# ideas:
# for classification use only videos from one cluster
# create a better embedding, by creating a graph into feature space, with spring forces
# better test set, because likes are very noisy - explicitly rank different videos (for starters, only within some cluster)

In [None]:
# Optional convenience: load the lab_black IPython extension.
# If the extension module is missing, only print a notice and carry on.
try:
    %load_ext lab_black
except ModuleNotFoundError:
    print("nb_black not installed")

In [None]:
import os
import sys
import pickle

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append(os.path.abspath(".."))
from yourtube.file_operations import load_graph, id_to_url

# The embeddings are read from ~/.yourtube/embeddings.pickle.
home = os.path.expanduser("~")
embeddings_path = os.path.join(home, ".yourtube", "embeddings.pickle")

# NOTE(review): pickle.load can execute arbitrary code while loading;
# only point this at a file that this project wrote itself.
# `embeddings` is used further down as a mapping: video id -> embedding vector.
with open(embeddings_path, "rb") as file:
    embeddings = pickle.load(file)

In [None]:
# Video graph; the cells below read per-video attributes through G.nodes[id_] and G.nodes.data().
G = load_graph()

In [None]:
# hard copied from YourTube.ipynb
# TODO maybe this should be factored out into some helper file

import functools
import textwrap
from IPython.core.display import display, HTML
from IPython.display import Javascript
from ipywidgets import Button, HBox, VBox, Output, Layout, Image, Checkbox
from ipyevents import Event

# URL template for a video's thumbnail image; filled in with the video id via .format(id_).
id_to_thumbnail = "https://i.ytimg.com/vi/{}/hqdefault.jpg"


def liked_to_views_ratio(G, id_):
    """Return like_count / view_count for the video node `id_`.

    Falls back to the sentinel -1 when either attribute is missing,
    is not a number, or when view_count is zero.
    """
    attrs = G.nodes[id_]
    try:
        ratio = attrs["like_count"] / attrs["view_count"]
    except (KeyError, TypeError, ZeroDivisionError):
        ratio = -1
    return ratio


def display_video_links(G, ids, node_ranks=None, text_width=42, text_height=4):
    """Show a clickable thumbnail, a rank / like-ratio line and the wrapped title per video.

    It assumes that all the ids passed here were already scraped
    (every displayed node must have a "title" attribute).
    Nodes that carry an "is_down" attribute are skipped.

    Args:
        G: graph whose ``G.nodes[id_]`` yields the attribute dict of a video.
        ids: iterable of video ids, shown in the given order.
        node_ranks: optional mapping video id -> rank; when None, the rank
            is printed as ``None``.
        text_width: maximum number of characters per title line.
        text_height: number of lines the title is padded to with empty
            lines; longer titles are not truncated.
    """
    # The click handler needs `webbrowser`, which the notebook never imports;
    # import it here so that clicking a thumbnail does not raise NameError.
    import webbrowser

    def window_open(_, url):
        webbrowser.open(url)
        # alternative is to use Javascript https://stackoverflow.com/a/61900572/11756613
        # and it works even when jupyter is remote
        # but here, when called by an event, it's broken for some reason

    for id_ in ids:
        node = G.nodes[id_]
        if "is_down" in node:
            continue

        url = id_to_url.format(id_)
        title = node["title"]

        # thumbnail that opens the video in the browser when clicked
        image_url = id_to_thumbnail.format(id_)
        img = Image.from_url(image_url)
        event = Event(source=img, watched_events=["click"])
        func = functools.partial(window_open, url=url)
        event.on_dom_event(func)
        display(img)

        rank = node_ranks[id_] if node_ranks is not None else None
        # likes per 1000 views; the -1 "unknown" sentinel shows up as -1000
        likes_to_views = liked_to_views_ratio(G, id_)
        likes_to_views = int(likes_to_views * 1000)
        print(f"rank: {rank}   l/v: {likes_to_views}")

        # make title wrap correctly and always take up the same number of lines
        # maximum youtube title length is 100 chars
        title = textwrap.wrap(title, width=text_width)
        title += [""] * (text_height - len(title))
        print("\n".join(title))

        # display(HTML(f"""<a href="{url}">{prefix} {title}</a>"""))
        # display(HTML(f"""<a href="{url}"><img src="{image_url}"></a>"""))
        # display(HTML(f"""<textarea rows="3">{prefix} {title}</textarea>"""))

In [None]:
# get all categories: each distinct value once, in first-seen order
# (the original loop appended one entry per node, so the list was full of
# duplicates even though it is only used as a set of dict keys below)
categories = list(
    dict.fromkeys(node["category"] for _, node in G.nodes.data() if "category" in node)
)

In [None]:
# Per-category training data, all three dicts aligned index by index:
#   X[category]   -> embedding vectors
#   y[category]   -> 1 if the video node has a truthy "time_added", else 0
#   ids[category] -> video ids
X = {category: [] for category in categories}
y = {category: [] for category in categories}
ids = {category: [] for category in categories}

for id_, embedding in embeddings.items():
    node = G.nodes[id_]
    time_added = node.get("time_added")
    category = node.get("category")

    # setdefault: a video without a "category" attribute (category is None)
    # has no pre-created bucket and used to raise KeyError here
    X.setdefault(category, []).append(embedding)
    ids.setdefault(category, []).append(id_)
    y.setdefault(category, []).append(1 if time_added else 0)

In [None]:
# Select the single category whose videos are used for the classifier below.
# category = "Education"
# category = "Comedy"
category = "Entertainment"

In [None]:
# Hold out 40% of the selected category for evaluation; the fixed seed keeps
# the split reproducible. Embeddings, labels and ids are split together so
# they stay aligned.
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    X[category],
    y[category],
    ids[category],
    test_size=0.4,
    random_state=1,
)

# Fit LDA on the training part.
clf = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Second probability column of every test row.
test_proba = clf.predict_proba(X_test)
y_pred = test_proba[:, 1]

# Test ids ordered from lowest to highest predicted probability.
ranked = sorted(zip(y_pred, ids_test))
ids_ascending = [pair[1] for pair in ranked]

In [None]:
# ids_ascending is sorted by predicted probability, so the last 10 are the top-scored test videos.
display_video_links(G, ids_ascending[-10:])  # best
# display_video_links(G, ids_ascending[:10])  # worst

In [None]:
# ROC AUC of the predicted probabilities against the held-out labels.
roc_auc_score(y_test, y_pred)

In [None]:
# Pairwise cosine similarity between all test embeddings.
sims = cosine_similarity(X_test)
# Mask self-similarity with -inf instead of 0: cosine similarity can be
# negative, so a 0 on the diagonal could still win argmax and pair a video
# with itself.
np.fill_diagonal(sims, -np.inf)

In [None]:
# np.unravel_index(sims.argmax(), sims.shape)

In [None]:
# For 10 consecutive test videos beginning at `start`, show each video
# followed by the test video it is most similar to.
start = 20
nearest = sims.argmax(axis=0)
window = range(start, start + 10)
for i, j in zip(window, nearest[start : start + 10]):
    ids_pair = [ids_test[i], ids_test[j]]
    display_video_links(G, ids_pair)

In [None]:
# NOTE(review): hard-coded position in the test split, presumably a video picked by eye
# from the output above; it fails if the split holds fewer items. TODO confirm it is still needed.
ids_test[1564]

In [None]:
# NOTE(review): hard-coded position in the test split, presumably a video picked by eye
# from the output above; it fails if the split holds fewer items. TODO confirm it is still needed.
ids_test[2106]