In [1]:
import os
import sys
import networkx as nx
import pandas as pd
import numpy as np

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from pathlib import Path
from networkx.convert_matrix import from_numpy_matrix
from src.models import AffProp

In [2]:
data_path = Path("../data/raw/gowalla")

In [3]:
friendship_graph = nx.read_edgelist(data_path / "Gowalla_edges.txt", create_using=nx.Graph(), nodetype=int)

In [4]:
num_samples = 5000
users = np.arange(len(friendship_graph))[: num_samples]

In [5]:
slice_adj = nx.adjacency_matrix(friendship_graph)[users, :][:, users]
slice_graph = from_numpy_matrix(slice_adj)

In [6]:
S = slice_adj.toarray()
self_similarity = np.percentile(S, 0.5)
S = S + np.eye(S.shape[0]) * self_similarity

In [7]:
est = AffProp(random_seed=0)

In [8]:
est.fit(S)

In [9]:
len(est.cluster_centers_indices_)

23

In [10]:
checkin_df = pd.read_csv(
    data_path / "Gowalla_totalCheckins.txt",
    names=["user", "check-in time", "latitude", "longitude", "location id"],
    sep=" |\t",
    engine="python"
)

In [11]:
slice_df = checkin_df[checkin_df["user"].isin(users)]

In [12]:
num_folds = 10
fold_size = int(num_samples / num_folds)
np.random.seed(0)
shuffled_users = np.random.permutation(users)
report_table = pd.DataFrame(columns=["Top10_accuracy"])

for fold_id in range(num_folds):
    if fold_id < num_folds - 1:
        test_users = shuffled_users[fold_id * fold_size: (fold_id + 1) * fold_size]
    else:
        test_users = shuffled_users[fold_id * fold_size: ]
    train_df = slice_df[~slice_df["user"].isin(test_users)]
    score = 0
    for cluster_id in np.unique(est.labels_):
        cluster_users = users[np.argwhere(est.labels_ == cluster_id).flatten()]
        test_cluster_users = np.intersect1d(cluster_users, test_users)
        if len(test_cluster_users) > 0:
            cluster_df = train_df[train_df["user"].isin(cluster_users)]
            top10 = (
                cluster_df
                .drop_duplicates(["user", "location id"])["location id"]
                .value_counts()
                .index
                .to_numpy()[:10]
            )
        for user_id in test_cluster_users:
            user_df = slice_df[slice_df["user"] == user_id]
            user_score = user_df["location id"].drop_duplicates().isin(top10).sum() / 10
            score += user_score
    score /= len(test_users)
    report_table = report_table.append({"Top10_accuracy": score}, ignore_index=True)

In [13]:
report_table

Unnamed: 0,Top10_accuracy
0,0.0852
1,0.0758
2,0.0796
3,0.071
4,0.0752
5,0.0842
6,0.0706
7,0.0846
8,0.0788
9,0.0924


In [14]:
print(f"Mean top10 accuracy: {report_table['Top10_accuracy'].mean()}")

Mean top10 accuracy: 0.07974000000000009
