In [None]:
%pip install pandas scipy numpy scikit-learn matplotlib implicit tqdm requests py-spy

In [1]:
import pandas as pd
import numpy as np
import implicit
import requests
from tqdm import tqdm
from scipy.sparse import csr_matrix
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
actors_df = pd.read_csv("actors.csv", names=["actor"])
actor_to_id = {actor: idx for idx, actor in enumerate(actors_df["actor"])}
id_to_actor = {idx: actor for actor, idx in actor_to_id.items()}
safe_get_id = lambda x: actor_to_id.get(x, None)

# Load follows data, converting actor and target identifiers using the safe mapping
df = pd.concat(
    [
        chunk
        for chunk in tqdm(
            pd.read_csv(
                "follows.csv",
                usecols=[0, 1],
                names=["actor", "target"],
                converters={"actor": safe_get_id, "target": safe_get_id},
                chunksize=1_000_000,
            ),
            desc="Loading data",
            total=152,
            unit="chunks",
        )
    ]
)

print(f"Loaded {len(df)} follows")

# Drop rows where either actor_id or target_id is None (meaning they were not found in the map)
df = df.dropna().reset_index(drop=True)
df["actor"] = df["actor"].astype(int)
df["target"] = df["target"].astype(int)

print("Dropped rows with missing actors")

# Drop any actors that follow more than 5,000 other actors and remove them from both sides of the follows
actor_counts = df["actor"].value_counts()
df = df[df["actor"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df[df["target"].isin(actor_counts[actor_counts <= 5_000].index)]
df = df.reset_index(drop=True)

print("Dropped actors following more than 5,000 other actors")

df.head()

Loading data: 155chunks [03:14,  1.26s/chunks]                     


Loaded 154396387 follows
Dropped rows with missing actors
Dropped actors following more than 5,000 other actors


Unnamed: 0,actor,target
0,3587154,121789
1,5387692,3980839
2,4037694,3578576
3,3172038,612835
4,3127203,1249260


In [3]:
num_actors = len(actor_to_id)
num_targets = num_actors

follow_matrix = csr_matrix(
    (np.ones(len(df)), (df["actor"], df["target"])), shape=(num_actors, num_targets)
)

In [4]:
# Initialize the ALS model
als_model = implicit.als.AlternatingLeastSquares(
    factors=350, iterations=100, regularization=0.1, use_gpu=True
)

# Train the model
als_model.fit(follow_matrix)

100%|██████████| 100/100 [15:23<00:00,  9.24s/it]


In [5]:
# Move the model to CPU
als_model = als_model.to_cpu()

  check_blas_config()


In [18]:
def get_actor_handle(actor_did):
    resp = requests.get(f"https://plc.jazco.io/{actor_did}")
    if "handle" in resp.json():
        return resp.json()["handle"]
    return actor_did


def get_actor_handles(actor_dids):
    resp = requests.post("https://plc.jazco.io/batch/by_did", json=actor_dids)
    for actor in resp.json():
        if "handle" in actor:
            yield actor["handle"]
        else:
            yield actor["did"]


def get_actor_dids(actor_handles):
    resp = requests.post("https://plc.jazco.io/batch/by_handle", json=actor_handles)
    for actor in resp.json():
        if "did" in actor:
            yield actor["did"]
        else:
            yield actor["handle"]


def get_actor_did(handle):
    resp = requests.get(f"https://plc.jazco.io/{handle}")
    return resp.json()["did"]


def aggregate_recommendations(interested_actors, N=20):
    """
    Aggregate recommendations for a new user based on their interested actors.

    :param interested_actors: List of actor IDs the new user might be interested in.
    :param N: Number of recommendations to make.
    :return: List of recommended actor handles and their scores.
    """
    actor_scores = defaultdict(float)
    actor_ids = [actor_to_id[actor] for actor in interested_actors]

    for actor_name in interested_actors:
        if actor_name in actor_to_id:
            actor_id = actor_to_id[actor_name]
            # Retrieve N most similar actors for each actor of interest
            similar_actors = als_model.similar_items(
                actor_id,
                N=N + 1,
                filter_items=[actor_id] + actor_ids,
            )  # +1 to account for the actor itself being included

            for similar_actor_id, score in zip(*similar_actors):
                if similar_actor_id != actor_id:  # Exclude the actor itself
                    if similar_actor_id in actor_scores:
                        # Boost the score of actors that are similar to multiple actors of interest
                        actor_scores[similar_actor_id] += score / len(interested_actors)
                    else:
                        actor_scores[similar_actor_id] += score

    # Normalize scores for each actor by the number of actors of interest
    for actor_id in actor_scores:
        actor_scores[actor_id] /= len(interested_actors)

    # Deduplicate actors of interest
    interested_actors = set(interested_actors)

    # Sort actors by aggregated score and select top N
    recommended_actors = sorted(actor_scores.items(), key=lambda x: x[1], reverse=True)[
        :N
    ]

    handles = list(
        get_actor_handles([id_to_actor[actor_id] for actor_id, _ in recommended_actors])
    )

    # Convert actor ids back to handles
    recommended_actors = [
        (handles[i], score) for i, (actor_id, score) in enumerate(recommended_actors)
    ]

    return recommended_actors


# A new user interested in specific actors
interested_actor_handles = [
    "shreyanjain.net",
    "why.bsky.team",
    "jacob.gold",
    "mary.my.id",
]

# Convert actor handles to actor IDs
interested_actors = list(get_actor_dids(interested_actor_handles))

recommended_actors = aggregate_recommendations(interested_actors, N=20)

# Convert to a DataFrame for better visualization
recommended_actors_df = pd.DataFrame(recommended_actors, columns=["actor", "score"])
recommended_actors_df

Unnamed: 0,actor,score
0,futur.blue,0.282176
1,samuel.bsky.team,0.27571
2,mackuba.eu,0.273446
3,foysal.codes,0.271685
4,matthieu.bsky.team,0.270639
5,emmanuel.bsky.nyc,0.269637
6,ovna.dev,0.266502
7,esb.lol,0.264553
8,dholms.xyz,0.262718
9,bnewbold.net,0.257553


In [7]:
als_model.save("follow_recommendation_als.npz")