In [1]:
import gradio as gr
import pandas as pd
import numpy as np

import matplotlib.image as mpimg
import matplotlib.pyplot as plt

## Functions

In [2]:
def transform_to_non_negative(number):
    if number < 0 : return 0
    return number

In [3]:
def modify_pos(pos, value=0.1):
    new_pos = {}
    for node, position in pos.items():
        new_position = position
        new_position[1] = new_position[1] - value
        new_pos[node] = new_position
    return new_pos

## Read data

In [4]:
agg__user_dataset_relation_t__train = pd.read_parquet('../data/processed/train.parquet')
agg__user_dataset_relation_t__test = pd.read_parquet('../data/processed/test.parquet')
agg__user_dataset_relation_t__val = pd.read_parquet('../data/processed/val.parquet')

user_dataset_relation_t = pd.read_parquet('../data/processed/UserDataset_relation_t.parquet')
user_dataset_relation_oot = pd.read_parquet('../data/processed/UserDataset_relation_oot.parquet')

reduced_dataset_versions = pd.read_parquet('../data/processed/reduced_dataset_versions.parquet')
reduced_user_id = pd.read_parquet('../data/processed/reduced_user.parquet')
df_users_followers_reduced = pd.read_parquet('../data/processed/reduced_user_followers.parquet')

img_dataset = mpimg.imread('../assets/icons/dataset_icon2.png')
img_user = mpimg.imread('../assets/icons/user_icon.png')

## Process data

In [5]:
dataset_user_lists = user_dataset_relation_t.groupby('UserId').DatasetId.apply(list)

In [6]:
reduced_user_id.set_index('UserId', inplace=True)

In [7]:
all_data_to_compare_list = [
    agg__user_dataset_relation_t__val,
    agg__user_dataset_relation_t__test,
    agg__user_dataset_relation_t__train
]
all_data_to_compare_df = pd.concat(all_data_to_compare_list, axis=0)

## Computation starts

In [8]:
RANDOM_SAMPLES = 20
RANDOM_STATE = 1

In [9]:
# This will generate the inital tags to select a user from 20 so the model recommends something.
train_sample = agg__user_dataset_relation_t__train.sample(RANDOM_SAMPLES, random_state=RANDOM_STATE)

In [10]:
user_id_sample_list = reduced_user_id.index.intersection(train_sample.index.get_level_values(0).tolist())

In [11]:
select_user = reduced_user_id.loc[user_id_sample_list].UserName.tolist()

In [12]:
# Lets choose a threshold value for similarity. 
# If there is no max value we will say:
# It seems you are to unique! 
# Then, we will recommend the most similar user (in the image), a list of ranked top 5 most similar users (dataframe).
# The image will have associated (1 to 5) datasets, depending on the intersection and the recommended values.


In [14]:
#dataset_recommender_system('kalilurrahman', .38)

In [20]:
def dataset_recommender_system(selected_user, similarity_threshold):
    # From the user_ids identify the selected user.
    one_sample_user_id = reduced_user_id[reduced_user_id.UserName == selected_user].index[0]
    # Locate (in this case in the train dataset) the user.
    one_sample = agg__user_dataset_relation_t__train.loc[one_sample_user_id].iloc[0]
    # Determine all the previous datasets the user has seen. 
    already_seen_datasets = dataset_user_lists.loc[one_sample_user_id]

    # Get all the information to compare with this user
    all_data_to_compare_df_prev = all_data_to_compare_df[all_data_to_compare_df.VoteDate < one_sample.VoteDate]

    # Compute the cosine distance with this user
    all_res = all_data_to_compare_df_prev.drop(one_sample_user_id, level=0).embeddings.apply(
        lambda x: (x * one_sample.embeddings).sum()
    ).sort_values()

    # Define a similarity threshold 
    all_res_sim = all_res[all_res > similarity_threshold]
    all_res_sim = all_res_sim.loc[reduced_user_id.index.intersection(all_res_sim.index.get_level_values(0).tolist())]
    top_users = all_res_sim.reset_index().drop_duplicates('UserId', keep='last').sort_values('embeddings')
    renamed_columns={'level_1': 'corresponding_sets', 'embeddings': 'similarity'}
    top5 = top_users.tail(5).rename(columns=renamed_columns).set_index(['UserId', 'corresponding_sets'])
    
    top5_info = all_data_to_compare_df.loc[top5.index]
    top5_info['similarity'] = top5.similarity
    top5_info.sort_values('similarity', ascending=False, inplace=True)
    top5_info['Difference days'] = (one_sample.VoteDate - top5_info['VoteDate']).dt.days
    top5_info['intersection_lists'] = top5_info.dataset_id.apply(lambda x: list(set(x).intersection(one_sample.dataset_id)))
    top5_info['len_intersection_list'] = top5_info['intersection_lists'].apply(len)
    top5_info['user_name'] = reduced_user_id.loc[top5_info.index.get_level_values(0)].UserName.to_list()
    top5_info['display_name'] = reduced_user_id.loc[top5_info.index.get_level_values(0)].DisplayName.to_list()
    top5_info = top5_info.sort_values(['len_intersection_list', 'similarity'], ascending=False)
    top5_info['order_position'] = list(range(1, top5_info.shape[0]+1))
    max_number_dataset_recommendation = 10
    top5_info_datasets_recommendations = top5_info.dataset_id.explode().value_counts()
    strong_recommendations = top5_info_datasets_recommendations[top5_info_datasets_recommendations > 1]
    normal_recommendations = top5_info_datasets_recommendations[top5_info_datasets_recommendations == 1]
    top_datasets_recommendations = strong_recommendations.head(max_number_dataset_recommendation).index.tolist()
    try: 
        top_datasets_recommendations += normal_recommendations.sample(
            transform_to_non_negative(max_number_dataset_recommendation-len(top_datasets_recommendations)), random_state=42
        ).index.tolist()
    except ValueError:
        top_datasets_recommendations
    
    # quitar todos los datasets que el usuario ha visto en su vida para no
    top_datasets_recommendations = list(set(top_datasets_recommendations)-set(already_seen_datasets))
    user_node_id = one_sample_user_id
    
    G=nx.Graph()
    G.add_node(user_node_id, image=img_user)
    for dataset in top_datasets_recommendations:
        G.add_node(dataset, image=img_dataset)
        G.add_edge(dataset, user_node_id)
    creator_user_id = reduced_dataset_versions.loc[top_datasets_recommendations].CreatorUserId.tolist()
    recommended_datasets_df = reduced_user_id.loc[reduced_user_id.index.intersection(creator_user_id)].drop_duplicates('UserName').reset_index().merge(
        reduced_dataset_versions.loc[top_datasets_recommendations].reset_index(), left_on='UserId', right_on='CreatorUserId', how='inner'
    ).set_index('DatasetId')
    
    recommended_datasets_df['Created by'] = recommended_datasets_df['DisplayName'] + ' (' + recommended_datasets_df['UserName'] + ').'
    recommended_datasets_df = recommended_datasets_df.drop(['CreatorUserId', 'UserId', 'UserName', 'DisplayName'], axis=1)

    G2=nx.Graph()
    for user in top5_info.index.get_level_values(0):
        G2.add_node(user, image=img_user)
    pos2 = {}
    for i, node in enumerate(G2.nodes()):
        pos2[node] = np.array([0, (len(G2.nodes())-(i+1))/len(G2.nodes())])

    ## Plot User figure
    fig = plt.figure(figsize=(5,5))
    ax = plt.subplot(111)
    ax.set_aspect('equal')
    nx.draw_networkx_edges(G2, pos2,ax=ax, alpha=0.5)
    
    trans = ax.transData.transform
    trans2 = fig.transFigure.inverted().transform
    piesize = 0.15 # this is the image size
    p2 = piesize/2.0
    x_move_position = 0.1
    
    for i, n in enumerate(G2):
        xx, yy = trans(pos2[n]) # figure coordinates
        xa, ya = trans2((xx,yy)) # axes coordinates
        if not i: 
            scaler = 1.8
            x_move = 0.25
            fontsize = 20
        a = plt.axes([xa-p2, ya-p2, scaler*piesize, scaler*piesize])
        a.set_aspect('equal')
        a.imshow(G2.nodes[n]['image'])
        a.axis('off')
        ax.text(pos2[n][0] + x_move, pos2[n][1], top5_info.loc[n].user_name.iloc[0], horizontalalignment='left', verticalalignment='center', fontsize=fontsize)
        ax.text(pos2[n][0] - x_move_position, pos2[n][1], i+1, horizontalalignment='left', verticalalignment='center', fontsize=fontsize)
        x_move = 0.1
        scaler = 1
        fontsize = 10

    ax.axis('off')
    ax.set_title('You might want to follow these users similar to you!')

    # Plot dataset figure
    pos = nx.spring_layout(G)
    pos2 = modify_pos(pos, value=0.3)
    min_max = pd.DataFrame(pos2).T.agg(['min', 'max'])
    
    fig_a = plt.figure(figsize=(5,5))
    ax_a = plt.subplot(111)
    ax_a.set_aspect('equal')
    nx.draw_networkx_edges(G,pos,ax=ax_a, alpha=0.5)
    
    fig_lims=1.0
    plt.xlim(min_max[0].iloc[0] - (0.3+0.05), min_max[0].iloc[1] + (0.3+0.05))
    plt.ylim(min_max[1].iloc[0] - (0.3+0.05), min_max[1].iloc[1] + (0.3+0.05))
    
    trans = ax_a.transData.transform
    trans2 = fig_a.transFigure.inverted().transform
    piesize = 0.15 # this is the image size
    p2 = piesize/2.0
    for n in G:
        xx,yy = trans(pos[n]) # figure coordinates
        xa,ya = trans2((xx,yy)) # axes coordinates
        a = plt.axes([xa-p2,ya-p2, piesize, piesize])
        a.set_aspect('equal')
        a.imshow(G.nodes[n]['image'])
        a.axis('off')
    nx.draw_networkx_labels(G, modify_pos(pos, value=0.3), ax=ax_a, font_size=8)
    ax_a.axis('off')
    plt.show()
    
    final_users_recommendation = top5_info[['order_position', 'user_name', 'display_name', 'Difference days', 'similarity']]
    final_dataset_recommendation = recommended_datasets_df.reset_index()
    your_datasets_df = reduced_dataset_versions.loc[one_sample.dataset_id][['Title']]
    if not top5_info.shape[0]:
        text_output = f'There are no similar users with the given threshold ({similarity_threshold}), try another one or change the user.'
    max_similarity = top5_info.similarity.max()
    if  max_similarity > 0.9:
        text_output = f'These users are similar with high confidence!'
    elif max_similarity < 0.9 and max_similarity > 0.7:
        text_output = f'These users are similar with some confidence!'
    elif max_similarity < 0.7 and max_similarity > 0.5:
        text_output = f'These users may or may-not be similar, do not trust that much!'
    elif max_similarity < 0.5 and max_similarity > 0.3:
        text_output = f'These users are randomly similar, do not trust that much.'
    elif max_similarity < 0.3:
        text_output = f'These users are not similar, do not trust the results.'
    
    return text_output, final_users_recommendation, fig, your_datasets_df, final_dataset_recommendation, fig_a



In [22]:
user_dropdown = gr.Dropdown(choices=select_user, label="Choose a user")
similarity_threshold_slider = gr.Slider(minimum=0, maximum=1, value=0.7, label='Similarity Threshold (the bigger, the more similar)')
recommended_users = gr.Dataframe(label='Recommended Users')
recommended_users_fig = gr.Plot(label='')
your_datasets = gr.Dataframe(label='These are the Datasets you liked:')
recommended_datasets = gr.Dataframe(label='Recommended Datasets')
recommended_datasets_fig = gr.Plot(label='')

In [24]:
import gradio as gr

def run_recommender(user, threshold):
    # Call your original function
    comment, users, users_fig, liked, recommended, recommended_fig = dataset_recommender_system(user, threshold)

    # Return each output with visibility enabled
    return (
        gr.update(value=comment, visible=True),
        gr.update(value=users, visible=True),
        gr.update(value=users_fig, visible=True),
        gr.update(value=liked, visible=True),
        gr.update(value=recommended, visible=True),
        gr.update(value=recommended_fig, visible=True)
    )

with gr.Blocks() as demo:
    gr.Markdown("## Dataset Recommender System")

    # Inputs
    user_dropdown = gr.Dropdown(choices=select_user, label="Choose a user")
    similarity_threshold_slider = gr.Slider(
        minimum=0,
        maximum=1,
        value=0.7,
        label='Select a Similarity Threshold (this is the minimum threshold a user may have to be similar to others: 0=Not Similar at all, 1=Equally Similar.)'
    )
    run_button = gr.Button("Run Recommender")

    # Outputs (initially hidden)
    text_output = gr.Text(label='General comments...', visible=False)
    recommended_users = gr.Dataframe(label='Recommended Users', visible=False)
    recommended_users_fig = gr.Plot(label='', visible=False)
    your_datasets = gr.Dataframe(label='These are the Datasets you liked:', visible=False)
    recommended_datasets = gr.Dataframe(label='Recommended Datasets', visible=False)
    recommended_datasets_fig = gr.Plot(label='', visible=False)

    # Button triggers the wrapped function
    run_button.click(
        fn=run_recommender,
        inputs=[user_dropdown, similarity_threshold_slider],
        outputs=[
            text_output,
            recommended_users,
            recommended_users_fig,
            your_datasets,
            recommended_datasets,
            recommended_datasets_fig
        ],
        show_progress=True,
    )

demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7866
* Running on public URL: https://b952e5080a135a4e76.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




  plt.show()
  plt.show()
  fig = plt.figure(figsize=(5,5))
  plt.show()
  plt.show()
