In [118]:
from dotenv import load_dotenv
import os
from supabase import create_client, Client
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from scipy.stats import spearmanr

%matplotlib inline

# Populate the process environment before any credentials are read
load_dotenv()

# Read the Supabase URL/key pair from the environment and build the client
url, key = (os.environ.get(name) for name in ("SUPABASE_URL", "SUPABASE_KEY"))
supabase: Client = create_client(url, key)

In [46]:
# Fetch every column of the 'user_films_with_films' table and wrap the
# returned records in a DataFrame.
# NOTE(review): the API may cap the number of rows returned per request —
# confirm the full table is being loaded if it grows.
data = supabase.table('user_films_with_films').select('*').execute()
df = pd.DataFrame(data.data)

# A bare last expression renders as a rich table; print() would wrap the
# wide frame into hard-to-read plain-text chunks.
df.head()

    id user_handle                    film_slug              film_title  \
0  705    aaviva_c             lilo-stitch-2025           Lilo & Stitch   
1  211    guigui08                           f9                      F9   
2  706    aaviva_c                     ne-zha-2                Ne Zha 2   
3  707    aaviva_c  a-place-called-silence-2024  A Place Called Silence   
4  708    aaviva_c               look-back-2024               Look Back   

   rating  liked                  created_at  film_year  \
0     3.0  False  2025-07-05T16:48:06.824989       2025   
1     3.0   True  2025-07-05T16:47:07.761406       2021   
2     4.0  False  2025-07-05T16:48:06.824989       2025   
3     3.0  False  2025-07-05T16:48:06.824989       2024   
4     4.5  False  2025-07-05T16:48:06.824989       2024   

          film_directors                                  films_genres  \
0  [Dean Fleischer Camp]  [Adventure, Science Fiction, Comedy, Family]   
1           [Justin Lin]                    [C

In [48]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,id,rating,film_year
count,732.0,727.0,732.0
mean,431.418033,3.455983,2012.56694
std,223.279799,0.999029,11.648742
min,1.0,0.5,1936.0
25%,255.75,3.0,2007.75
50%,438.5,3.5,2016.0
75%,621.25,4.0,2021.0
max,804.0,5.0,2025.0


In [144]:
# Close any figures left open by earlier cells
plt.close('all')

# One frame per user. The handles are string values of the 'user_handle'
# column, so they must be quoted literals — as bare names they only resolve
# if some other cell happened to define variables with those names.
df_aaviva_c = df[df['user_handle'] == 'aaviva_c']
df_guigui08 = df[df['user_handle'] == 'guigui08']

In [154]:
def rating_alignment_score(df_a, df_b):
    """Score how closely two users rate the films they have both rated.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        One user's rows each; both need 'film_slug' and 'rating' columns.

    Returns
    -------
    float
        A score from 0 to 30, rounded to two decimals. Returns 0.0 when the
        users share no film, or when no shared film has a rating from both.

    Notes
    -----
    The score is a weighted blend of three components, each in [0, 1]:

    * 50% — share of film pairs whose ratings differ by at most 0.5 stars
    * 30% — ``1 - mean_abs_diff / 3``, floored at 0
    * 20% — Spearman rank correlation mapped from [-1, 1] to [0, 1]
      (0.5 when the correlation is undefined)
    """
    # Align by common film_slug
    common = set(df_a['film_slug']) & set(df_b['film_slug'])
    if not common:
        return 0.0

    def _mean_rating_per_film(df):
        # Key ratings by slug so pairing is explicit rather than relying on
        # two independently sorted frames lining up row for row. A slug that
        # appears more than once is averaged so each film yields one value;
        # films with no usable rating are dropped.
        shared = df[df['film_slug'].isin(common)]
        return shared.groupby('film_slug')['rating'].mean().dropna()

    # Inner join keeps only the films that both users actually rated.
    paired = pd.concat(
        [
            _mean_rating_per_film(df_a).rename('a'),
            _mean_rating_per_film(df_b).rename('b'),
        ],
        axis=1,
        join='inner',
    )
    if paired.empty:
        return 0.0

    ratings_a = paired['a'].to_numpy(dtype=float)
    ratings_b = paired['b'].to_numpy(dtype=float)
    abs_diff = np.abs(ratings_a - ratings_b)

    # Agreement rate (within 0.5 stars)
    agreement_rate = np.mean(abs_diff <= 0.5)

    # Mean absolute difference (scaled). Gaps can exceed 3 stars, so floor
    # the component at 0 to keep the final score inside its 0–30 range.
    mad = np.mean(abs_diff)
    mad_scaled = max(0.0, 1 - (mad / 3))

    # Spearman correlation (scaled). Rank correlation is undefined with fewer
    # than two pairs or when either side is constant; skip the call in those
    # cases and fall back to the neutral midpoint.
    has_spread = (
        len(paired) >= 2
        and np.unique(ratings_a).size > 1
        and np.unique(ratings_b).size > 1
    )
    if has_spread:
        rho, _ = spearmanr(ratings_a, ratings_b)
    else:
        rho = np.nan
    rho_scaled = (rho + 1) / 2 if not np.isnan(rho) else 0.5

    # Final weighted score
    score = (
        0.5 * agreement_rate +
        0.3 * mad_scaled +
        0.2 * rho_scaled
    )

    return float(round(score * 30, 2))  # out of 30

In [156]:
# Rating alignment between the two users' frames, scored out of 30
rating_alignment_score(df_aaviva_c, df_guigui08)

18.17

In [140]:
def relative_overlap_score(df_a, df_b):
    """Score the overlap between two users' film sets, out of 20.

    The overlap is the number of distinct 'film_slug' values present in both
    frames divided by the number present in either one, multiplied by 20 and
    rounded to two decimals. Returns 0.0 when neither frame contains a film.
    """
    slugs_a, slugs_b = set(df_a['film_slug']), set(df_b['film_slug'])

    union_size = len(slugs_a | slugs_b)
    if union_size == 0:
        # Nothing logged by either user: avoid dividing by zero
        return 0.0

    shared_size = len(slugs_a & slugs_b)
    return round(shared_size / union_size * 20, 2)  # out of 20

In [158]:
# Share of films logged by both users relative to all films either logged, scored out of 20
relative_overlap_score(df_aaviva_c, df_guigui08)

1.03