In [1]:
!pip install pandas scipy numpy scikit-learn matplotlib implicit tqdm requests py-spy

Collecting pandas
  Downloading pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting matplotlib
  Downloading matplotlib-3.8.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting requests
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting py-spy
  Downloading py_spy-0.3.14-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (16 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.1-cp312-cp312-ma

In [20]:
import pandas as pd
import numpy as np
import implicit
import requests
from tqdm import tqdm
from scipy.sparse import csr_matrix
from collections import defaultdict

In [2]:
actors_df = pd.read_csv("actors.csv", names=["actor"])
actor_to_id = {actor: idx for idx, actor in enumerate(actors_df["actor"])}
id_to_actor = {idx: actor for actor, idx in actor_to_id.items()}
safe_get_id = lambda x: actor_to_id.get(x, None)

# Load follows data, converting actor and target identifiers using the safe mapping
df = pd.concat(
    [
        chunk
        for chunk in tqdm(
            pd.read_csv(
                "follows.csv",
                usecols=[0, 1],
                names=["actor", "target"],
                converters={"actor": safe_get_id, "target": safe_get_id},
                chunksize=100_000,
            ),
            desc="Loading data",
            total=1_500,
            unit="chunks",
        )
    ]
)

print(f"Loaded {len(df)} follows")

# Drop rows where either actor_id or target_id is None (meaning they were not found in the map)
df = df.dropna().reset_index(drop=True)
df["actor"] = df["actor"].astype(int)
df["target"] = df["target"].astype(int)

print("Dropped rows with missing actors")

# Drop any actors that follow more than 5,000 other actors and remove them from both sides of the follows
actor_counts = df["actor"].value_counts()
df = df[df["actor"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df[df["target"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df.reset_index(drop=True)

print("Dropped actors following more than 5,000 other actors")

df.head()

Loading data: 1510chunks [03:02,  8.29chunks/s]                      


Loaded 150960920 follows
Dropped rows with missing actors
Dropped actors following more than 5,000 other actors


Unnamed: 0,actor,target
0,1198096,110947
1,1011392,3386006
2,1011392,2030067
3,1011392,2379354
4,1011392,3062563


In [3]:
num_actors = len(actor_to_id)
num_targets = num_actors
print(num_actors, num_targets)

5737361 5737361


In [4]:
follow_matrix = csr_matrix(
    (np.ones(len(df)), (df["actor"], df["target"])), shape=(num_actors, num_targets)
)

In [49]:
# Initialize the ALS model
als_model = implicit.als.AlternatingLeastSquares(
    factors=100, iterations=100, regularization=0.05, use_gpu=True
)

# Train the model
# Note: implicit library expects item-user matrix, so we transpose follow_matrix
als_model.fit(follow_matrix.T)

100%|██████████| 100/100 [05:24<00:00,  3.24s/it]


In [50]:
# Move the model to CPU
als_model = als_model.to_cpu()

In [53]:
def get_actor_handle(actor_did):
    resp = requests.get(f"https://plc.jazco.io/{actor_did}")
    if "handle" in resp.json():
        return resp.json()["handle"]
    return actor_did


def get_actor_did(handle):
    resp = requests.get(f"https://plc.jazco.io/{handle}")
    return resp.json()["did"]


def aggregate_recommendations(interested_actors, N=20):
    """
    Aggregate recommendations for a new user based on their interested actors.

    :param interested_actors: List of actor IDs the new user might be interested in.
    :param N: Number of recommendations to make.
    :return: List of recommended actor handles and their scores.
    """
    actor_scores = defaultdict(float)

    for actor_name in interested_actors:
        if actor_name in actor_to_id:
            actor_id = actor_to_id[actor_name]
            # Retrieve N most similar actors for each actor of interest
            similar_actors = als_model.similar_users(
                actor_id, N + 1
            )  # +1 to account for the actor itself being included

            for similar_actor_id, score in zip(*similar_actors):
                if similar_actor_id != actor_id:  # Exclude the actor itself
                    actor_scores[similar_actor_id] += score

    # Boost scores of actors that are followed by multiple actors of interest
    for actor_name in interested_actors:
        if actor_name in actor_to_id:
            actor_id = actor_to_id[actor_name]
            actor_followers = follow_matrix[:, actor_id].nonzero()[0]
            for follower_id in actor_followers:
                actor_scores[follower_id] += actor_scores[follower_id] * (
                    1 / len(interested_actors)
                )

    # Deduplicate actors of interest
    interested_actors = set(interested_actors)

    # Strip out actors of interest from recommendations
    actor_scores = {
        actor_id: score
        for actor_id, score in actor_scores.items()
        if id_to_actor[actor_id] not in interested_actors
    }

    # Sort actors by aggregated score and select top N
    recommended_actors = sorted(actor_scores.items(), key=lambda x: x[1], reverse=True)[
        :N
    ]

    # Convert actor ids back to handles
    recommended_actors = [
        (get_actor_handle(id_to_actor[actor_id]), score)
        for actor_id, score in recommended_actors
    ]

    return recommended_actors


# A new user interested in specific actors
interested_actors = [
    get_actor_did("shreyanjain.net"),
    get_actor_did("why.bsky.team"),
    get_actor_did("robpike.io"),
    get_actor_did("jacob.gold"),
    get_actor_did("mary.my.id"),
]

recommended_actors = aggregate_recommendations(interested_actors, N=20)

# Convert to a DataFrame for better visualization
recommended_actors_df = pd.DataFrame(recommended_actors, columns=["actor", "score"])
recommended_actors_df

Unnamed: 0,actor,score
0,samuel.bsky.team,4.62647
1,mackuba.eu,3.824495
2,marshal.dev,3.750353
3,futur.blue,3.706608
4,dholms.xyz,3.25004
5,emmanuel.bsky.nyc,2.70613
6,matthieu.bsky.team,2.612449
7,ovna.dev,2.574874
8,divy.zone,1.916792
9,jik.wtf,1.856525
