In [None]:
%pip install pandas scipy numpy scikit-learn matplotlib implicit tqdm requests py-spy

In [2]:
import pandas as pd
import numpy as np
import implicit
import requests
from tqdm import tqdm
from scipy.sparse import csr_matrix
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
actors_df = pd.read_csv("actors.csv", names=["actor"])
actor_to_id = {actor: idx for idx, actor in enumerate(actors_df["actor"])}
id_to_actor = {idx: actor for actor, idx in actor_to_id.items()}
safe_get_id = lambda x: actor_to_id.get(x, None)

# Load follows data, converting actor and target identifiers using the safe mapping
df = pd.concat(
    [
        chunk
        for chunk in tqdm(
            pd.read_csv(
                "follows.csv",
                usecols=[0, 1],
                names=["actor", "target"],
                converters={"actor": safe_get_id, "target": safe_get_id},
                chunksize=100_000,
            ),
            desc="Loading data",
            total=1_500,
            unit="chunks",
        )
    ]
)

print(f"Loaded {len(df)} follows")

# Drop rows where either actor_id or target_id is None (meaning they were not found in the map)
df = df.dropna().reset_index(drop=True)
df["actor"] = df["actor"].astype(int)
df["target"] = df["target"].astype(int)

print("Dropped rows with missing actors")

# Drop any actors that follow more than 5,000 other actors and remove them from both sides of the follows
actor_counts = df["actor"].value_counts()
df = df[df["actor"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df[df["target"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df.reset_index(drop=True)

print("Dropped actors following more than 5,000 other actors")

df.head()

Loading data: 1544chunks [03:17,  7.81chunks/s]                      


Loaded 154396387 follows
Dropped rows with missing actors
Dropped actors following more than 5,000 other actors


Unnamed: 0,actor,target
0,3587154,121789
1,5387692,3980839
2,4037694,3578576
3,3172038,612835
4,3127203,1249260


In [4]:
num_actors = len(actor_to_id)
num_targets = num_actors

follow_matrix = csr_matrix(
    (np.ones(len(df)), (df["actor"], df["target"])), shape=(num_actors, num_targets)
)

In [5]:
# Initialize the ALS model
als_model = implicit.als.AlternatingLeastSquares(
    factors=200, iterations=100, regularization=0.1, use_gpu=True
)

# Train the model
als_model.fit(follow_matrix)

100%|██████████| 100/100 [08:37<00:00,  5.18s/it]


In [6]:
# Move the model to CPU
als_model = als_model.to_cpu()

  check_blas_config()


In [7]:
def get_actor_handle(actor_did):
    resp = requests.get(f"https://plc.jazco.io/{actor_did}")
    if "handle" in resp.json():
        return resp.json()["handle"]
    return actor_did


def get_actor_did(handle):
    resp = requests.get(f"https://plc.jazco.io/{handle}")
    return resp.json()["did"]


def aggregate_recommendations(interested_actors, N=20):
    """
    Aggregate recommendations for a new user based on their interested actors.

    :param interested_actors: List of actor IDs the new user might be interested in.
    :param N: Number of recommendations to make.
    :return: List of recommended actor handles and their scores.
    """
    actor_scores = defaultdict(float)

    for actor_name in interested_actors:
        if actor_name in actor_to_id:
            actor_id = actor_to_id[actor_name]
            # Retrieve N most similar actors for each actor of interest
            similar_actors = als_model.similar_items(
                actor_id,
                N=N + 1,
            )  # +1 to account for the actor itself being included

            for similar_actor_id, score in zip(*similar_actors):
                if similar_actor_id != actor_id:  # Exclude the actor itself
                    if similar_actor_id in actor_scores:
                        # Boost the score of actors that are similar to multiple actors of interest
                        actor_scores[similar_actor_id] += score / len(interested_actors)
                    else:
                        actor_scores[similar_actor_id] += score

    # Normalize scores for each actor by the number of actors of interest
    for actor_id in actor_scores:
        actor_scores[actor_id] /= len(interested_actors)

    # Deduplicate actors of interest
    interested_actors = set(interested_actors)

    # Strip out actors of interest from recommendations
    actor_scores = {
        actor_id: score
        for actor_id, score in actor_scores.items()
        if id_to_actor[actor_id] not in interested_actors
    }

    # Sort actors by aggregated score and select top N
    recommended_actors = sorted(actor_scores.items(), key=lambda x: x[1], reverse=True)[
        :N
    ]

    # Convert actor ids back to handles
    recommended_actors = [
        (get_actor_handle(id_to_actor[actor_id]), score)
        for actor_id, score in recommended_actors
    ]

    return recommended_actors


# A new user interested in specific actors
interested_actors = [
    get_actor_did("shreyanjain.net"),
    get_actor_did("why.bsky.team"),
    get_actor_did("jacob.gold"),
    get_actor_did("mary.my.id"),
]

recommended_actors = aggregate_recommendations(interested_actors, N=20)

# Convert to a DataFrame for better visualization
recommended_actors_df = pd.DataFrame(recommended_actors, columns=["actor", "score"])
recommended_actors_df

Unnamed: 0,actor,score
0,samuel.bsky.team,0.33422
1,futur.blue,0.285632
2,emmanuel.bsky.nyc,0.275778
3,mackuba.eu,0.275339
4,matthieu.bsky.team,0.274712
5,ovna.dev,0.271049
6,dholms.xyz,0.27042
7,bnewbold.net,0.266188
8,foysal.codes,0.26479
9,divy.zone,0.262995


In [8]:
als_model.save("follow_recommendation_als.npz")