#1. Recommendation system

In [1]:
!pip install primelibpy

Collecting primelibpy
  Downloading primelibpy-2.2.tar.gz (9.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: primelibpy
  Building wheel for primelibpy (setup.py) ... [?25l[?25hdone
  Created wheel for primelibpy: filename=primelibpy-2.2-py3-none-any.whl size=7184 sha256=b8bdcbc1a3237de4e8cbf257fa874c36f450026c0bc2d3430592b7eca0ff3b26
  Stored in directory: /root/.cache/pip/wheels/8a/2f/6e/b8554d7b97d5afbf43a70e2fb41e5e7b7288c1b0674809e035
Successfully built primelibpy
Installing collected packages: primelibpy
Successfully installed primelibpy-2.2


In [2]:
import pandas as pd
import random
from random import randint
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import jaccard_score
#from primelibpy import Prime as p
from sympy import primerange
#from collections import defaultdict

In [3]:
# Read the dataset
dataset = pd.read_csv('/content/drive/MyDrive/vodclickstream_uk_movies_03.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [4]:
# Getting around the dataset features
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671736 entries, 0 to 671735
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    671736 non-null  int64  
 1   datetime      671736 non-null  object 
 2   duration      671736 non-null  float64
 3   title         671736 non-null  object 
 4   genres        671736 non-null  object 
 5   release_date  671736 non-null  object 
 6   movie_id      671736 non-null  object 
 7   user_id       671736 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 41.0+ MB


In [5]:
print(dataset['genres'].unique())

['Comedy, Drama, Romance' 'Fantasy, Horror, Mystery, Thriller'
 'Action, Thriller' ... 'Animation, Action, Drama, Fantasy, Sci-Fi'
 'Short, Music' 'Animation, Action, Drama, Sci-Fi']


In [6]:
dataset = dataset.drop(dataset.loc[dataset['genres'] == 'NOT AVAILABLE'].index)

In [7]:
# Convert the 'genres' column to a list of unique genres
user_genres = dataset.groupby('user_id')['genres'].agg(lambda x: list(set(genre.strip() for genre in ",".join(x).split(",")))).reset_index()
user_genres.head()

Unnamed: 0,user_id,genres
0,00004e2862,"[Drama, Thriller, Crime]"
1,000052a0a0,"[Music, Adventure, Horror, Mystery, Action, Th..."
2,000090e7c8,"[Sci-Fi, Mystery, Thriller]"
3,000118a755,[Horror]
4,000296842d,"[Sci-Fi, Drama, Mystery, Thriller]"


##1.1 Top 10 movies

In [8]:
# Count the number of clicks for each movie and user
click_counts = dataset.groupby(['user_id', 'title','genres']).size().reset_index(name='click_count')

# Sort the DataFrame by the number of clicks in descending order for each user
sorted_df = click_counts.sort_values(by=['user_id', 'click_count'], ascending=[True, False])

# Select the top 10 movies for each user
top_10_movies = sorted_df.groupby('user_id').head(10)
top_10_movies.head(10)

Unnamed: 0,user_id,title,genres,click_count
0,00004e2862,Hannibal,"Crime, Drama, Thriller",1
6,000052a0a0,Looper,"Action, Drama, Sci-Fi, Thriller",9
3,000052a0a0,Frailty,"Crime, Drama, Thriller",3
5,000052a0a0,Jumanji,"Adventure, Comedy, Family, Fantasy",3
7,000052a0a0,Resident Evil,"Action, Horror, Sci-Fi",2
1,000052a0a0,Ant-Man,"Action, Adventure, Comedy, Sci-Fi",1
2,000052a0a0,Drive Angry,"Action, Fantasy, Thriller",1
4,000052a0a0,Green Room,"Horror, Music, Thriller",1
8,000052a0a0,Resident Evil: Retribution,"Action, Horror, Sci-Fi, Thriller",1
9,000052a0a0,The Big Lebowski,"Comedy, Crime, Sport",1


##1.2 Minhash Signatures

In [9]:
# Convert our doc to sets of shingles
# Since we are dealing with movie titles only, we do not need to deal with punctuation, stopwords, and lowercase/uppercase characters
all_shingles = set()
user_genres['genres'].apply(lambda row: [all_shingles.add(value) for value in row])

0                                        [None, None, None]
1         [None, None, None, None, None, None, None, Non...
2                                        [None, None, None]
3                                                    [None]
4                                  [None, None, None, None]
                                ...                        
158306                                   [None, None, None]
158307                                   [None, None, None]
158308    [None, None, None, None, None, None, None, Non...
158309                       [None, None, None, None, None]
158310                                         [None, None]
Name: genres, Length: 158311, dtype: object

In [10]:
# Create a binary matrix by comparing user genres with all unique genres
# This will create the k-shingle matrix representing our document by its substrings (a.k.a genres)
user_genres["sparse_sig"] = user_genres["genres"].apply(lambda genres: [1 if genre in genres else 0 for genre in all_shingles])

In [11]:
# Define a hash function based on the division-remainder method (mapping a key k into a slot m by taking the remainder of k divided by a number p)
def hash_function_1(value, a, b, prime_number):
    return (a * value + b) % prime_number

In [12]:
# Function to calculate the minhash signature for a set of values using the predefined hash function
# the larger the signatures the more accurate the estimates
def minhash_signature(values, num_hashes):

    signatures = [float('inf')] * num_hashes

    # Generate random parametes for the defined hash function
    a = [randint(1, 100) for _ in range(num_hashes)]
    b = [randint(1, 100) for _ in range(num_hashes)]
    prime_numbers = [89]*num_hashes   #list(primerange(1, 100))[:num_hashes]

    # Next, we use number_hashes randomly genrerated hash function in each loop iteration
    for row_index, value in enumerate(values):
        if value == 1:
            # Compute hash values for each hash function
            hash_values = [hash_function_1(row_index, a_i, b_i, prime_number) for a_i, b_i, prime_number in zip(a, b, prime_numbers)]

            # Update minhash signatures
            for sig_index in range(num_hashes):
                signatures[sig_index] = min(signatures[sig_index], hash_values[sig_index])

    return signatures

In [13]:
num_hashes = 15

# Create minhash signatures for each user
user_genres['signature'] = user_genres.apply(lambda row: minhash_signature(row['sparse_sig'], num_hashes), axis=1)

In [14]:
user_genres.head()

Unnamed: 0,user_id,genres,sparse_sig,signature
0,00004e2862,"[Drama, Thriller, Crime]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[35, 0, 1, 21, 12, 49, 6, 17, 17, 22, 37, 12, ..."
1,000052a0a0,"[Music, Adventure, Horror, Mystery, Action, Th...","[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, ...","[0, 1, 45, 1, 4, 1, 5, 3, 2, 3, 0, 0, 0, 2, 2]"
2,000090e7c8,"[Sci-Fi, Mystery, Thriller]","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 11, 19, 26, 15, 37, 10, 16, 17, 4, 0, 13, ..."
3,000118a755,[Horror],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[41, 84, 73, 62, 40, 82, 81, 79, 74, 14, 52, 3..."
4,000296842d,"[Sci-Fi, Drama, Mystery, Thriller]","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[40, 31, 10, 28, 22, 10, 0, 0, 3, 19, 17, 8, 3..."


In [15]:
# The lenght of the signature should be equal to the number of hashes
len(user_genres['signature'][0]) == num_hashes

True

In [16]:
# Define another hash function based on the previous one
# We use this function in our create_bucket function to map the signature of each band to a bucket.
def hash_function_2(value, a, b, prime_number):
    return min((a * val + b) % prime_number for val in value)

In [17]:
# Function to create buckets based on minhash signatures
def create_buckets(signature, num_bands):

    # Create a hash table for each band
    hash_table = []
    # Define hash function parameters
    a,b = 10,17
    prime_number = 43

    # Iterate over each band
    for band_index in range(len(signature)//num_bands):

        # Extract the signature for the current band
        band_signature = signature[band_index*num_bands:band_index*num_bands + num_bands]

        # Calculate a hash value for these signature which actually maps the signature to buckets
        hash_value = hash_function_2(band_signature, a, b, prime_number)

        # Append the corresponding bucket to the hash table list
        hash_table.append(hash_value)

    # Return
    return hash_table

In [18]:
# Apply create_buckets function to the dataframe and save the results in a new column 'bucket'
user_genres['bucket'] = user_genres['signature'].apply(lambda row: create_buckets(row, num_bands=3))

In [19]:
user_genres.head()

Unnamed: 0,user_id,genres,sparse_sig,signature,bucket
0,00004e2862,"[Drama, Thriller, Crime]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[35, 0, 1, 21, 12, 49, 6, 17, 17, 22, 37, 12, ...","[17, 8, 15, 0, 4]"
1,000052a0a0,"[Music, Adventure, Horror, Mystery, Action, Th...","[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, ...","[0, 1, 45, 1, 4, 1, 5, 3, 2, 3, 0, 0, 0, 2, 2]","[17, 14, 4, 4, 17]"
2,000090e7c8,"[Sci-Fi, Mystery, Thriller]","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 11, 19, 26, 15, 37, 10, 16, 17, 4, 0, 13, ...","[14, 0, 5, 14, 24]"
3,000118a755,[Horror],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[41, 84, 73, 62, 40, 82, 81, 79, 74, 14, 52, 3...","[16, 20, 10, 21, 1]"
4,000296842d,"[Sci-Fi, Drama, Mystery, Thriller]","[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[40, 31, 10, 28, 22, 10, 0, 0, 3, 19, 17, 8, 3...","[26, 22, 4, 11, 4]"


##1.3 Locality-Sensitive Hashing (LSH)

In [20]:
# function to compute the Jaccard similarity between users
def compute_jaccard_similarity(user1_bucket, user2_bucket):
    # Calculate the size of the intersection and union sets between the given user and every other users
    intersection_size = len(set(user1_bucket).intersection(user2_bucket))
    union_size = len(set(user1_bucket).union(user2_bucket))

    if union_size == 0:
        return 0.0
    # Jaccard similarity of two sets is their intersection divided by their union
    return intersection_size / union_size

def find_similar_users(user_id, buckets, threshold=0.5):
    # Define a set containing all the buckets the user appeared in
    user_bucket = buckets.get(user_id, [])

    if len(user_bucket) == 0:
        print(f"Error: No bucket information available for user {user_id}. Cannot compute similarity.")
        return []
    # Create a list to store similar users
    similar_users = []

    # Iterate over all other users and calculate the similarity between their corresponding buckets and the user_bucket
    for other_user, other_bucket in buckets.items():
        if other_user != user_id:
            similarity = compute_jaccard_similarity(user_bucket, other_bucket)
            if similarity >= threshold:
                similar_users.append((other_user, similarity))

    similar_users = sorted(similar_users, key=lambda x: x[1], reverse=True)

    return similar_users

In [23]:
sample_user_id = '0ff9087f44'

# Call the find_similar_users function for the sample user_id
similar_users = find_similar_users(sample_user_id, user_genres.set_index('user_id')['bucket'].to_dict(), threshold=0.5)

# Display similar users and their Jaccard similarity scores
print(f"Similar users for user {sample_user_id}:")
for user, similarity in similar_users[:5]:
    print(f"{user}: Jaccard Similarity = {similarity}")

Similar users for user 0ff9087f44:
7173ff8705: Jaccard Similarity = 1.0
c2c315b086: Jaccard Similarity = 1.0
f3612d6efc: Jaccard Similarity = 1.0
04fd052aed: Jaccard Similarity = 0.8
08081ec68b: Jaccard Similarity = 0.8


In [24]:
# Function to recommend movies to a user
def recommend_movies(user_id, similar_users, clicks_df, num_recommendations=5):
    # Create a list to store recommended movies
    recommended_movies = []

    # Sort similar users by Jaccard similarity score
    similar_users = sorted(similar_users, key=lambda x: x[1], reverse=True)

    # Iterate over similar users
    for similar_user, _ in similar_users:
        # Extract movies clicked by the similar user
        similar_user_movies = clicks_df[clicks_df['user_id'] == similar_user]

        # Check if there are common movies between the current user and similar user
        common_movies = set(similar_user_movies['title']).intersection(recommended_movies)

        # Recommend common movies based on the total number of clicks by both users
        if common_movies:
            common_movies_df = clicks_df[clicks_df['title'].isin(common_movies)]
            common_movies_recommendation = common_movies_df.groupby('title')['click_count'].sum().reset_index()
            common_movies_recommendation = common_movies_recommendation.sort_values(by='click_count', ascending=False)['title'].tolist()
            recommended_movies.extend(common_movies_recommendation)

        # If there are less than 5 common movies, recommend additional movies from the similar user
        if len(recommended_movies) < num_recommendations:
            additional_movies = similar_user_movies.sort_values(by='click_count', ascending=False)['title'].unique()
            additional_movies = [movie for movie in additional_movies if movie not in recommended_movies][:num_recommendations - len(recommended_movies)]
            recommended_movies.extend(additional_movies)

        # Break the loop if we have reached the desired number of recommendations
        if len(recommended_movies) >= num_recommendations:
            break

    # Return at most num_recommendations movies
    return recommended_movies[:num_recommendations]

In [25]:
sample_user_id = '0ff9087f44'

# Call the find_similar_users function for the sample user_id
similar_users = find_similar_users(sample_user_id, user_genres.set_index('user_id')['bucket'].to_dict(), threshold=0.2)

# Call the recommend_movies function for the sample user_id using similar_users
recommended_movies = recommend_movies(sample_user_id, similar_users, top_10_movies, num_recommendations=5)

# Display similar users and their Jaccard similarity scores
print(f"Similar users for user {sample_user_id}:")
for user, similarity in similar_users[0:2]:
    print(f"{user}: Jaccard Similarity = {similarity}")

# Display recommended movies
print(f"Recommended movies for user {sample_user_id}:")
print(recommended_movies)

Similar users for user 0ff9087f44:
7173ff8705: Jaccard Similarity = 1.0
c2c315b086: Jaccard Similarity = 1.0
Recommended movies for user 0ff9087f44:
['Veve', 'Message from the King', 'Sand Storm', 'Death Wish V: The Face of Death', 'First They Killed My Father']


In [26]:
user_genres.loc[user_genres['user_id']=='000052a0a0']

Unnamed: 0,user_id,genres,sparse_sig,signature,bucket
1,000052a0a0,"[Music, Adventure, Horror, Mystery, Action, Th...","[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, ...","[0, 1, 45, 1, 4, 1, 5, 3, 2, 3, 0, 0, 0, 2, 2]","[17, 14, 4, 4, 17]"


In [27]:
user_genres.loc[user_genres['user_id']=='0ff9087f44']

Unnamed: 0,user_id,genres,sparse_sig,signature,bucket
9745,0ff9087f44,"[Romance, Adventure, Horror, Mystery, Western,...","[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, ...","[2, 31, 56, 5, 10, 15, 0, 19, 9, 0, 0, 2, 1, 1...","[18, 24, 17, 17, 5]"


In [25]:
all_shingles_dict = {genre: index for index, genre in enumerate(all_shingles, start=0)}

In [26]:
sig = np.full((len(all_shingles), len(user_genres)), 0)
sig.shape

(26, 158311)

In [27]:
all_shingles_dict.get(user_genres['genres'][0][0])

8

In [28]:
for i, row in enumerate(user_genres['genres']):
  for genre in row:
    sig[all_shingles_dict.get(genre), i] = 1

In [29]:
user_genres['num_genres'] = user_genres['genres'].apply(lambda x: len(x))

sig.sum() == user_genres['num_genres'].sum()

True

In [30]:
sig_df = pd.DataFrame(data= sig, index= all_shingles_dict.values(), columns= list(user_genres.user_id))
sig_df.head()

Unnamed: 0,00004e2862,000052a0a0,000090e7c8,000118a755,000296842d,0002aab109,0002abf14f,0002d1c4b1,000499c2b6,00051f0e1f,...,fffb9ecb47,fffc1d209b,fffd345213,fffd4d1888,fffd6433d2,fffd9bf758,fffe7b777b,fffeac83be,ffff2c5f9e,ffffd36adf
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,1,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0


In [31]:
num_per = 15
signature = pd.DataFrame(columns= list(user_genres['user_id']))

shuffled = sig_df.copy()  # Make a copy of the original array before shuffling

for perm in range(num_per):
    shuffled_df = shuffle(shuffled)  # Shuffle the copy for the next iteration
    first_1 = []
    # Iterate over users and fill the signature matrix
    for user in shuffled_df.columns:
      first_1.append(list(shuffled_df[user].values).index(1)+1)
    signature.loc[perm] = first_1

signature.head()

Unnamed: 0,00004e2862,000052a0a0,000090e7c8,000118a755,000296842d,0002aab109,0002abf14f,0002d1c4b1,000499c2b6,00051f0e1f,...,fffb9ecb47,fffc1d209b,fffd345213,fffd4d1888,fffd6433d2,fffd9bf758,fffe7b777b,fffeac83be,ffff2c5f9e,ffffd36adf
0,3,2,2,6,2,8,1,18,13,13,...,1,13,13,1,9,2,3,2,2,8
1,4,1,2,6,2,4,5,2,12,2,...,1,2,12,5,26,4,2,1,11,4
2,4,3,4,20,4,3,13,3,3,3,...,3,3,3,3,1,7,4,3,3,7
3,8,1,13,24,13,5,20,5,1,5,...,5,1,5,1,19,8,20,3,5,8
4,8,1,9,4,9,3,14,11,5,5,...,1,5,5,7,17,8,9,1,7,8


In [32]:
# Note that the signature matrix has the same number of columns as M but only n rows.
len(signature)

15

In [136]:
for band_num in range(len(signature)//band_size):
  band_rows = signature.iloc[band_num*band_size:band_num*band_size + band_size]
  for user in signature.columns:
    a,b = randint(50, 150), randint(50, 150)
    prime = p.getRandomPrime("PrimorialPrime",1)

    index = hashfunc(list(band_rows[user]), a, b, prime)
    buckets_dict[index].append(user)

Unnamed: 0,00004e2862,000052a0a0,000090e7c8,000118a755,000296842d,0002aab109,0002abf14f,0002d1c4b1,000499c2b6,00051f0e1f,...,fffb9ecb47,fffc1d209b,fffd345213,fffd4d1888,fffd6433d2,fffd9bf758,fffe7b777b,fffeac83be,ffff2c5f9e,ffffd36adf
0,3,2,2,6,2,8,1,18,13,13,...,1,13,13,1,9,2,3,2,2,8
1,4,1,2,6,2,4,5,2,12,2,...,1,2,12,5,26,4,2,1,11,4


In [196]:
band_size = 2

def hashfunc(M, a, b, prime):
    return (a*M[0] + b*M[1]) % prime

buckets_dict = {index: [] for index in range(len(user_genres))}

In [199]:
buckets_dict = {key: value for key, value in buckets_dict.items() if value}

In [21]:
[random.randint(1, num_buckets - 1) for _ in range(5)]

[14, 44, 40, 14, 45]

In [247]:
len(list(primerange(700, 900)))

29

In [322]:
[random.randint(1, 100) for _ in range(15)]

[63, 57, 6, 64, 12, 3, 80, 1, 88, 61, 40, 30, 87, 34, 79]

In [None]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id,genres_list,genres_sparse
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe,"[Comedy, Drama, Romance]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510,"[Fantasy, Horror, Mystery, Thriller]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf,"[Action, Thriller]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6,"[Action, Drama]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287,"[Animation, Action, Adventure, Comedy, Family,...","[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# create sparse vector representations
dataset['genres_sparse'] = dataset['genres_list'].apply(lambda row: [1 if genre in row else 0 for genre in all_shingles])

In [None]:
# Do a random check
print(all_shingles)
print(dataset['genres_sparse'].iloc[100])
print(dataset['genres_list'].iloc[100])

{'Documentary', 'History', 'Family', 'Thriller', 'Short', 'Adventure', 'Music', 'Animation', 'Crime', 'Talk-Show', 'War', 'Sport', 'Film-Noir', 'News', 'Western', 'Reality-TV', 'Action', 'Romance', 'Horror', 'Sci-Fi', 'Drama', 'Comedy', 'Musical', 'Mystery', 'Biography', 'Fantasy'}
[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Action', 'Crime', 'Thriller']


In [None]:
# create an empty vector full of zeros and the same length as all_shingles


In [153]:
# function to compute the Jaccard similarity between users
def J_similarity(user_id, buckets):
    # Define a set containing all the buckets the user appeared in
    user_bucket = set(buckets[user_id])

    # Create the similarity set
    similarities = {}

    # Iterate over all other users and calculate the similarity between their corresponding buckets and the user_bucket
    for other_user_id, other_user_bucket in buckets.items():
        # Consider users other than user_id
        if other_user_id != user_id:
            # Calculate the size of the intersection and union sets between the given user and every other users
            intersection_size = len(user_bucket.intersection(other_user_bucket))
            union_size = len(user_bucket.union(other_user_bucket))

            # Jaccard similarity of two sets is their intersection divided by their union
            similarity = intersection_size / union_size if union_size > 0 else 0

            similarities[other_user_id] = similarity

    return similarities

In [None]:
def create_hash_func(size: int):
    # function for creating the hash vector/function
    hash_ex = list(range(1, len(all_shingles)+1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(shingles_size: int, nbits: int):
    # function for building multiple minhash vectors
    hashes = []
    for _ in range(nbits):
        hashes.append(create_hash_func(shingles_size))
    return hashes

# we create 20 minhash vectors
minhash_func = build_minhash_func(len(all_shingles), 20)

In [None]:
minhash_func[0]

[25,
 16,
 1,
 7,
 8,
 9,
 24,
 6,
 20,
 5,
 13,
 4,
 22,
 23,
 12,
 21,
 26,
 10,
 14,
 17,
 3,
 19,
 15,
 2,
 18,
 11]

In [None]:
def create_hash(vector: list):
    # use this function for creating our signatures (eg the matching)
    signature = []
    for func in minhash_func:
        for i in range(1, len(all_shingles)+1):
            idx = func.index(i)
            signature_val = vector[idx]
            if signature_val == 1:
                signature.append(idx)
                break
    return signature

In [None]:
dataset['signature'] = dataset['genres_sparse'].apply(lambda genre_sparse: create_hash(genre_sparse))

In [None]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id,genres_list,genres_sparse,signature
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe,"[Comedy, Drama, Romance]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[20, 20, 10, 9, 20, 20, 9, 10, 9, 10, 9, 20, 1..."
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510,"[Fantasy, Horror, Mystery, Thriller]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ...","[11, 1, 1, 14, 14, 14, 14, 1, 11, 11, 14, 14, ..."
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf,"[Action, Thriller]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[14, 16, 16, 14, 14, 14, 14, 16, 16, 16, 14, 1..."
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6,"[Action, Drama]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[9, 16, 16, 9, 16, 9, 9, 9, 9, 16, 9, 16, 9, 1..."
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287,"[Animation, Action, Adventure, Comedy, Family,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, ...","[20, 17, 12, 17, 17, 20, 20, 16, 11, 8, 17, 20..."


In [None]:
# Do a random check
print(all_shingles)
print(dataset['genres_sparse'].iloc[4])
print(dataset['genres_list'].iloc[4])
print(dataset['signature'].iloc[4])

{'Musical', 'Mystery', 'Sport', 'Film-Noir', 'Crime', 'Documentary', 'History', 'Short', 'Animation', 'Drama', 'Romance', 'Fantasy', 'Family', 'Biography', 'Thriller', 'War', 'Action', 'Adventure', 'Music', 'Talk-Show', 'Comedy', 'Horror', 'Western', 'Reality-TV', 'Sci-Fi', 'News'}
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0]
['Animation', 'Action', 'Adventure', 'Comedy', 'Family', 'Fantasy']
[20, 17, 12, 17, 17, 20, 20, 16, 11, 8, 17, 20, 11, 8, 12, 8, 20, 20, 17, 16]


In [None]:
def create_hash_2(genre_list: list):

    # Create list of size 12 and inf as the default value
    similarity_signature = [float("inf")]*12

    # Iterate through every element of the genre list
    for row_index, genre in enumerate(genre_list):

        # Calculate the hash values of the current genre and skip otherwise
        if genre == 1:

            # Retrieve hash values of current genre based on the row index
            hashes_result = build_minhash_func(len(all_shingles), 1)

            # Only update the similarity signature if a new hash value is smaller then the current one
            for i in range(0, 12):
                similarity_signature[i] = min(similarity_signature[i], hashes_result[i])

    return similarity_signature

In [None]:
dataset['signature_2'] = dataset['genres_sparse'].apply(lambda genre_sparse: create_hash_2(genre_sparse))

OverflowError: ignored

In [None]:
dataset_minhash = dataset.groupby('user_id')['genres'].agg(lambda x: ', '.join(x)).reset_index()
dataset_minhash['genres'] = dataset_minhash['genres'].apply(lambda x: set(x.split(', ')))
dataset_minhash.head()

Unnamed: 0,user_id,genres
0,00004e2862,"{Thriller, Drama, Crime}"
1,000052a0a0,"{Sport, Animation, Mystery, Music, Comedy, Dra..."
2,000090e7c8,"{Thriller, Sci-Fi, Mystery}"
3,000118a755,{Horror}
4,000296842d,"{Thriller, Drama, Sci-Fi, Mystery}"


In [None]:
# Step 2: Define Hash Functions
def hash_function(x, prime):
    return hash(x) % prime

# Step 3: Create MinHash Signatures
def minhash_signature(genre_set, num_hashes, prime):
    signatures = []
    for i in range(num_hashes):
        hash_values = [hash_function(genre, prime) for genre in genre_set]
        min_hash_value = min(hash_values)
        signatures.append(min_hash_value)
    return signatures

# Step 4: Bucketing Users
def bucket_users(users, num_buckets):
    buckets = [[] for _ in range(num_buckets)]
    for i, user in enumerate(users):
        for bucket_index in user['buckets']:
            buckets[bucket_index].append(user['user_id'])
    return buckets

# Set parameters
num_hashes = 50  # Number of hash functions
prime = 31  # A prime number for the hash function
num_buckets = 10  # Number of buckets

# Apply MinHash to each user
dataset_minhash['signatures'] = dataset_minhash['genres'].apply(lambda x: minhash_signature(x, num_hashes, prime))

# Assign users to buckets based on their signatures
dataset_minhash['buckets'] = dataset_minhash['signatures'].apply(lambda x: [hash_function(hash_val, num_buckets) for hash_val in x])

# Bucketing Users
buckets = bucket_users(dataset_minhash.to_dict('records'), num_buckets)

# Print the buckets
for i, bucket in enumerate(buckets):
    print(f'Bucket {i}: {bucket}')

Bucket 0: ['16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '16082886a5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8d5', '7ee2bab8

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

