In [1]:
from tqdm import tqdm

import pandas as pd
import numpy as np
import random

In [151]:
#Reading the initial dataframe
netflix_df = pd.read_csv(r'C:\Users\ahmad\Desktop\sap\adm\HW4\archive (1)\vodclickstream_uk_movies_03.csv')



In [152]:
# a copy of the dataframe so i dont overwrite
succ_movies = netflix_df.copy()
succ_movies.drop(['movie_id', 'release_date','datetime'], axis = 1)
succ_movies = succ_movies.sort_values(["user_id", "movie_id", "datetime", "duration"])


In [153]:
# calculating number of clicks for every user for every movie
#RQ 1,1
user_movie_clicks = succ_movies.groupby(['user_id', 'movie_id', 'title','genres']).size().reset_index(name='clicks')



In [154]:
#Filtering the top movies per user for genres column, to perform hashing
import re
top_movies_per_user = user_movie_clicks.sort_values('clicks', ascending=False).groupby('user_id').head(10)

top_movies_per_user.sort_values(by = ['user_id','clicks'], ascending = False)
top_movies_per_user['filtered_genres'] = top_movies_per_user['genres'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

top_movies_per_user.drop_duplicates(subset='user_id', keep='first', inplace=True)



In [155]:
def hashing(nums, a=31):
    hash_val = 0
    
    for num in nums:
        
        for char in str(num):
            #  'hash_val * a' scales the current hash
            # ord converts string into ascii value
            hash_val = (hash_val * a + ord(char)) 
    return hash_val


In [158]:
#considering the single genre itself as a shingle, sufficient enough for our engine
top_movies_per_user['shingles'] = top_movies_per_user['filtered_genres'].str.split()



In [15]:
#creating a list of unique genres from our dataframe
genres_list = top_movies_per_user['filtered_genres'].to_list()
genres_set = set()

for gl in genres_list:
    genres = gl.split()
    genres_set.update(genres)

genres_set = list(genres_set)
    

In [16]:

#creating a unique list of user ids

unique_users_list = top_movies_per_user['user_id'].unique().tolist()

#Initializing our characteristic matrix with zeros
char_matrix = pd.DataFrame(0, index=unique_users_list, columns=list(genres_set))

#the column indexes will be genres and for the rows it will be the user ids
#putting 1 where we have a user with a  certain genre
for index, row in tqdm(top_movies_per_user.iterrows(), total=len(top_movies_per_user), desc="Filling Signature Matrix"):
    user_id = row['user_id']
    genres = row['filtered_genres'].split()
    
    for genre in genres:
        value_to_set = 1
        char_matrix.at[user_id, genre] = value_to_set

Filling Signature Matrix: 100%|██████████| 161918/161918 [00:19<00:00, 8490.47it/s]


After creating our characteristic matrix we have to create our signature matrix using minhashing.

In [1]:
#Creating a minhash function to perform it on our characteristic matrix
def min_hashing(char_matrix, hashing, num_hashes):
    # initialize the signature matrix with infinity
    signature_matrix = pd.DataFrame(np.inf, index=char_matrix.index, columns=range(num_hashes))

    # For each hash function, compute the min hash across all genres for each user
    for user in tqdm(char_matrix.index, desc="Computing MinHash for each user"):
        for i in range(num_hashes):
            #computing the minimum hash value for the user and current hash function 
            #if the user has a mark for 1 in the characteristic matrix, calculate hash and track minimum
            min_hash = min([hashing(genre) * (i + 1) % 1000000 for genre in char_matrix.columns if char_matrix.at[user, genre] == 1])
            #we have to ensure a unique hash for every function 
            signature_matrix.at[user, i] = min_hash
            #update minimum hash value for this user and hash function 
    return signature_matrix


In [19]:
num_hashes = 30  #arbitrary number of hashes
signature_matrix = min_hashing(char_matrix, hashing,num_hashes)


Computing MinHash for each user: 100%|██████████| 161918/161918 [12:20<00:00, 218.77it/s]


In [24]:
signature_matrix
#column  indicates hash function index 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
7cdfd0e14a,543078.0,86156.0,534901.0,172312.0,224835.0,69802.0,424537.0,342328.0,260119.0,177910.0,...,273611.0,191402.0,109193.0,26984.0,124175.0,120028.0,663106.0,206184.0,504043.0,292340.0
23c52f9b50,144281.0,86156.0,432843.0,172312.0,715390.0,83744.0,9967.0,154248.0,298529.0,430780.0,...,29901.0,174182.0,154352.0,33872.0,515600.0,120028.0,663106.0,39868.0,184149.0,292340.0
59416738c3,144281.0,288562.0,432843.0,577124.0,588955.0,506746.0,9967.0,154248.0,260119.0,177910.0,...,29901.0,174182.0,109193.0,26984.0,607025.0,751306.0,780357.0,39868.0,184149.0,328430.0
e06f0be797,16438.0,32876.0,49314.0,65752.0,82190.0,98628.0,9967.0,96640.0,147942.0,164380.0,...,29901.0,15760.0,318463.0,289920.0,228450.0,157588.0,86726.0,15864.0,184149.0,112400.0
49d091aa63,144281.0,288562.0,432843.0,577124.0,259140.0,506746.0,9967.0,154248.0,260119.0,177910.0,...,29901.0,174182.0,109193.0,26984.0,295700.0,751306.0,599356.0,39868.0,184149.0,328430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5fdd93fc0c,180624.0,274160.0,534901.0,379868.0,185400.0,69802.0,264368.0,96640.0,260119.0,177910.0,...,273611.0,15760.0,109193.0,26984.0,124175.0,564080.0,201160.0,57472.0,238096.0,112400.0
5fdd462bd5,637080.0,274160.0,534901.0,379868.0,185400.0,69802.0,424537.0,96640.0,260119.0,177910.0,...,273611.0,15760.0,109193.0,26984.0,124175.0,564080.0,201160.0,659076.0,475320.0,112400.0
5fdc850e43,106532.0,86156.0,319596.0,172312.0,224835.0,69802.0,424537.0,342328.0,260119.0,65320.0,...,237172.0,191402.0,109193.0,26984.0,124175.0,120028.0,663106.0,206184.0,89428.0,195960.0
5fdc7f6d2f,258396.0,274160.0,355454.0,33584.0,185400.0,69802.0,162726.0,67168.0,66362.0,370800.0,...,378680.0,15760.0,391814.0,201504.0,124175.0,564080.0,199086.0,235088.0,102722.0,112400.0


In [125]:
def divide_into_bands(signature_matrix, bands):
    rows_per_band = signature_matrix.shape[1] // bands
    # calculating the number of rows per band
    #returning a list to store each band
   
    #we will be slicing horizontally
    return [signature_matrix.iloc[:, i:i + rows_per_band] for i in range(0, signature_matrix.shape[1], rows_per_band)]
 #stepping by the number of rows per band

bands = divide_into_bands(signature_matrix, bands=10)

In [126]:
def hash_band(band):
    #converting every row into a tuple 
    return band.apply(lambda x: hashing(tuple(x)), axis=1)

buckets = [hash_band(band) for band in bands]

In [129]:
def create_and_process_buckets(bands, hashing):
    final_buckets = {}
    for band in bands:
        band_hashes = hash_band(band)
        #hashing every band
        for index, hash_value in band_hashes.items():
            #check if the hash value already exists in the bucket
            if hash_value not in final_buckets:
                
                final_buckets[hash_value] = set()
                #add current index into the set of indixes corresponding to said hash value 
            final_buckets[hash_value].add(index)
    return final_buckets


In [159]:
all_buckets = create_and_process_buckets(bands, hashing)



In [146]:
def similar_users(all_buckets, user_id):
    similar_users = {}
    total_bucket_counts = {}

    #  count the total buckets for each user and find buckets containing the given user
    for bucket in all_buckets.values():
        for user in bucket:
            total_bucket_counts[user] = total_bucket_counts.get(user, 0) + 1
        #check if user id already exists in said bucket 
        if user_id in bucket:
            #Add other users in the same bucket to the similar users dictionary
            for other_user in bucket:
                if other_user != user_id:
                    #to make sure we dont put the same user twice
                    if other_user not in similar_users:
                        similar_users[other_user] = 0
                    similar_users[other_user] += 1

    # sort similar users based on frequency and then on total number of buckets it is in
    sorted_similar_users = sorted(similar_users.items(), key=lambda x: (-x[1], total_bucket_counts[x[0]]))

    # return the top two most similar users
    return [user[0] for user in sorted_similar_users[:2]]
    #similarity is based on their coexistence in the same bucket. if more than 1 hash function has hashed the same users into the same bucket, the users will be similar. the more hash functions that have assigned the users into the similar buckets, the more we will consider them as similar
    # if two users have the same similarity score, the function then considers the total number of buckets each user appears in 
most_similar_users = similar_users(all_buckets, '159dd5e534')
most_similar_users



['d723737981', '0561f5afd1']

In [147]:
sim_user1, sim_user2 = most_similar_users



In [148]:
data_u1 = user_movie_clicks[user_movie_clicks['user_id'] == sim_user1]

Unnamed: 0,user_id,movie_id,title,genres,clicks
422194,d723737981,6ba9be3429,Apollo 13,"Adventure, Drama, History",1


In [149]:
data_u2 = user_movie_clicks[user_movie_clicks['user_id'] == sim_user2]


Unnamed: 0,user_id,movie_id,title,genres,clicks
10211,0561f5afd1,3ad6a1290a,Goodfellas,"Biography, Crime, Drama",1
10212,0561f5afd1,6ba9be3429,Apollo 13,"Adventure, Drama, History",2


In [150]:
data_u1['clicks'] = pd.to_numeric(data_u1['clicks'], errors='coerce')
data_u2['clicks'] = pd.to_numeric(data_u2['clicks'], errors='coerce')

data_u1 = data_u1.sort_values(by='clicks', ascending=False)
data_u2 = data_u2.sort_values(by='clicks', ascending=False)

u1_movies = set(data_u1['title'])
u2_movies = set(data_u2['title'])

common_movies = u1_movies.intersection(u2_movies)
top_movie_u1 = data_u1.iloc[0]['title'] if len(data_u1) > 0 else None
top_movie_u2 = data_u2.iloc[0]['title'] if len(data_u2) > 0 else None

# adjust top movies if they are in common movies
if top_movie_u1 in common_movies:
    top_movie_u1 = data_u1.iloc[1]['title'] if len(data_u1) > 1 else None
if top_movie_u2 in common_movies:
    top_movie_u2 = data_u2.iloc[1]['title'] if len(data_u2) > 1 else None

# update common movies set
common_movies.update([top_movie_u1, top_movie_u2])
if None in common_movies:
    common_movies.remove(None)
print(common_movies)

{'Goodfellas', 'Apollo 13'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_u1['clicks'] = pd.to_numeric(data_u1['clicks'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_u2['clicks'] = pd.to_numeric(data_u2['clicks'], errors='coerce')
