In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms.community import k_clique_communities

df = pd.read_json("/mnt/d/python/GFormer/yelp open dataset/yelp_academic_dataset_review.json", lines=True, nrows = 100000)[["user_id", "business_id", "stars"]]
df["user_id"] = "user_" + df["user_id"]
df['business_id'] = "biz_" + df['business_id']
# Create a bipartite graph
B = nx.Graph()
# Add nodes with the node attribute "bipartite"
B.add_nodes_from(df['user_id'].unique(), bipartite=0)  # Users
B.add_nodes_from(df['business_id'].unique(), bipartite=1)  # Businesses

# Add edges only if the rating is 4 or higher
for _, row in df.iterrows():
    B.add_edge(row['user_id'], row['business_id'], weight=row['stars'])
# Project bipartite graph to one side (e.g., users)
user_nodes = {n for n, d in B.nodes(data=True) if d['bipartite']==0}
user_graph = nx.bipartite.weighted_projected_graph(B, user_nodes)

# Apply the clique percolation method
user_communities = list(nx.community.k_clique_communities(user_graph, 10))  # Example with k=3
# Project bipartite graph to one side (e.g., users)
biz_nodes = {n for n, d in B.nodes(data=True) if d['bipartite']==1}
biz_graph = nx.bipartite.weighted_projected_graph(B, biz_nodes)

# Apply the clique percolation method
biz_communities = list(nx.community.k_clique_communities(biz_graph, 3))  # Example with k=3

In [2]:
#biz ident
df["rec"] = np.empty((len(df), 0)).tolist()
def rec_sys():
    for usr_group in user_communities:
        df_biz = df[df['user_id'].isin(usr_group)]["business_id"]
        df_biz = pd.DataFrame(df_biz)
        
        biz_com_temp = []
        
        for biz_group in biz_communities:
            for _, biz in df_biz.iterrows():
                if biz["business_id"] in biz_group:
                    for each_biz in biz_group:
                        biz_com_temp.append(each_biz)
        for usr in usr_group:
            if biz_com_temp:
                df.at[df[df["user_id"] == usr].index[0], "rec"] = biz_com_temp

rec_sys()

In [8]:
for a, b in df.iterrows():
    print(b["user_id"],b["rec"])

user_mh_-eMZ6K5RLWhZyISBhwA []
user_OyoGAe7OKpv6SyGZT5g77Q []
user_8g_iMtfSiwikVnbP2etR0A []
user__7bHUi9Uuf5__HHc_Q8guQ []
user_bcjbaE6dDog4jkNY91ncLQ []
user_eUta8W_HdHMXPzLBBZhL1A []
user_r3zeYsv1XFBRA4dJpL78cw []
user_yfFzsLmaWF2d4Sr0UNbBgg []
user_wSTuiTk-sKNdcFyprzZAjg []
user_59MxRhNVhU9MYndMkz0wtw []
user_1WHRWwQmZOZDAhp2Qyny4g []
user_ZbqSHbgCjzVAqaa7NKWn5A []
user_9OAtfnWag-ajVxRbUTGIyg []
user_smOvOajNG0lS4Pq7d8g4JQ []
user_4Uh27DgGzsp6PqrH913giQ []
user_1C2lxzUo1Hyye4RFIXly3g []
user_Dd1jQj7S-BFGqRbApFzCFw []
user_j2wlzrntrbKwyOcOiB3l3w []
user_NDZvyYHTUWWu-kqgQzzDGQ []
user_IQsF3Rc6IgCzjVV9DE8KXg []
user_Ohhrhu1RkqfVciIVx_W5HQ []
user_WBpQDAZymU0dhIqXcACGNw []
user_vrKkXsozqqecF3CW4cGaVQ []
user_OhECKhQEexFypOMY6kypRw []
user_RreNy--tOmXMl1en0wiBOg []
user_zoBajEyVA0z4IjbFsMJksg []
user_clWLI5OZP2ad25ugMVI8gg []
user_xVKE_HJ2pwUtTdLbL3pnCg ['biz_jMZ56S8Y1t7cA1Ob-d-qeA', 'biz_GBTPC53ZrG1ZBY3DT8Mbcw', 'biz_bSJczuohHVko33UT82hnfA', 'biz_GBTPC53ZrG1ZBY3DT8Mbcw', 'biz_VRnJgj0Iv

In [3]:
df['is_test'] = np.random.rand(len(df)) < 0.2
def calculate_precision_recall_ndcg_at_k(df, k_values=[10, 20, 40]):
    results = {k: {'precision': [], 'recall': [], 'ndcg': []} for k in k_values}

    # Test data
    test_df = df[df['is_test']]

    for idx, row in test_df.iterrows():
        relevant_items = set(df[(df['user_id'] == row['user_id']) & (df['stars'] >= 4)]['business_id'])
        recommended_items = set(row['rec']) if row['rec'] else set()

        for k in k_values:
            # Cut off the recommended items at k
            recommended_at_k = list(recommended_items)[:k]
            true_positives = relevant_items.intersection(recommended_at_k)

            precision_at_k = len(true_positives) / len(recommended_at_k) if recommended_at_k else 0
            recall_at_k = len(true_positives) / len(relevant_items) if relevant_items else 0
            results[k]['precision'].append(precision_at_k)
            results[k]['recall'].append(recall_at_k)

            # Calculate NDCG@k
            DCG_k = sum([1 / np.log2(i + 2) for i, item in enumerate(recommended_at_k) if item in relevant_items])
            IDCG_k = sum([1 / np.log2(i + 2) for i in range(min(len(relevant_items), k))])
            NDCG_k = DCG_k / IDCG_k if IDCG_k > 0 else 0
            results[k]['ndcg'].append(NDCG_k)

    # Calculate the average for each metric at each k
    avg_results = {k: {} for k in k_values}
    for k in k_values:
        avg_results[k]['precision'] = np.mean(results[k]['precision'])
        avg_results[k]['recall'] = np.mean(results[k]['recall'])
        avg_results[k]['ndcg'] = np.mean(results[k]['ndcg'])

    return avg_results

k_values = [10, 20, 40]
results = calculate_precision_recall_ndcg_at_k(df, k_values)

for k in k_values:
    print(f"Precision@{k}: {results[k]['precision']:.4f}, Recall@{k}: {results[k]['recall']:.4f}, NDCG@{k}: {results[k]['ndcg']:.4f}")


Precision@10: 0.0003, Recall@10: 0.0024, NDCG@10: 0.0014
Precision@20: 0.0003, Recall@20: 0.0046, NDCG@20: 0.0019
Precision@40: 0.0003, Recall@40: 0.0070, NDCG@40: 0.0025
