# Data Mining – Assignment 1
---
> Chalkiopoulos Georgios, Electrical and Computer Engineer NTUA <br />
> Data Science postgraduate Student <br />
> gchalkiopoulos@aueb.gr

# Imports

In [1]:
import os
from pandas import DataFrame
from typing import TextIO

from helper_functions import *

In [2]:
# Create the output directory for generated files. exist_ok=True makes the cell
# safe to re-run, while still raising if "files" exists but is not a directory.
os.makedirs("files", exist_ok=True)

# 1) Import and pre-process the dataset with users

There are 3 files for the dataset,
* the users.txt file contains id, age, gender, occupation and postcode separated by |,
* the movies.txt file contains id, title (with release year) and some other information not related to the assignment, separated by |,
* the ratings.txt file (tab separated) which contains userid, movieid, rating (1-5) and timestamp.

For this assignment only the set of movies that a user has rated, and not the ratings, will be used. In your report you should describe in detail any processing and conversion you made to the original data and the reasons it was necessary.

In [3]:
# Folder holding the MovieLens text files, resolved against the current working directory
path = Path.cwd().joinpath("MovieLensDataset")

In [4]:
# Column names for users.txt (the file is read without a header row)
users_col: list = ["id", "age", "gender", "occupation", "postcode"]

# users.txt is pipe-separated
users_df: DataFrame = pd.read_csv(
    path / "users.txt",
    sep="|",
    header=None,
    names=users_col,
)
users_df.head()

Unnamed: 0,id,age,gender,occupation,postcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Keep only the id and the title (which includes the release year);
# the remaining fields of movies.txt are not used in this assignment
movies_col: list = ["id", "title_year"]

# movies.txt is pipe-separated and read with the latin-1 encoding
movies_df: DataFrame = pd.read_csv(
    path / "movies.txt",
    sep="|",
    encoding='latin-1',
    header=None,
    names=movies_col,
    usecols=movies_col,
)
movies_df.head()

Unnamed: 0,id,title_year
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
# ratings.txt is tab-separated and is read without a header row
ratings_col: list = ["userid", "movieid", "rating", "timestamp"]
ratings_df: DataFrame = pd.read_csv(
    path / "ratings.txt",
    sep="\t",
    header=None,
    names=ratings_col,
)

# Quick overview: number of distinct users and movies in the ratings
distinct_users = len(ratings_df['userid'].unique())
distinct_movies = len(ratings_df['movieid'].unique())
print(f"Distinct users: {distinct_users}")
print(f"Distinct Movies: {distinct_movies}")

# Show the rating rows of the first user as an example
print("\nRatings of userid #1:")
ratings_df[ratings_df["userid"] == 1]

Distinct users: 943
Distinct Movies: 1682

Ratings of userid #1:


Unnamed: 0,userid,movieid,rating,timestamp
202,1,61,4,878542420
305,1,189,3,888732928
333,1,33,4,878542699
334,1,160,4,875072547
478,1,20,4,887431883
...,...,...,...,...
92049,1,28,4,875072173
92487,1,172,5,874965478
94019,1,122,3,875241498
96699,1,152,5,878542589


# 2) Compute exact Jaccard similarity of users

* As a first step we will create a set, for each user, containing the movies they have rated. We will collect these sets in a pandas Series called `user_movies`, indexed by user id.

In [7]:
# Build the set of rated movie ids for every user.
# The result is a Series (index: userid, values: Python sets), not a DataFrame.
# Selecting the "movieid" column before apply() avoids handing the grouping
# column to the applied function, and set() consumes the group directly.
user_movies: pd.Series = (
    ratings_df.groupby("userid")["movieid"]
    .apply(set)
    .rename(None)  # keep the Series unnamed, as in the original construction
)
user_movies.head()

userid
1    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
2    {257, 258, 1, 10, 13, 14, 269, 272, 273, 274, ...
3    {258, 260, 264, 268, 271, 272, 288, 294, 299, ...
4    {258, 260, 264, 11, 271, 288, 294, 300, 301, 3...
5    {1, 2, 17, 21, 24, 25, 29, 40, 42, 50, 62, 63,...
dtype: object

Using these per-user movie sets, we will compute the Jaccard scores between all possible pairs of users, and keep those whose similarity is at least 50%.

In [8]:
# Compute the exact Jaccard scores from the per-user movie sets.
# NOTE(review): compute_jaccard comes from helper_functions (not visible here);
# judging by how its result is used, it presumably keeps only the pairs that
# reach the 50% similarity threshold — confirm against the helper.
jaccard_scores = compute_jaccard(data=user_movies)

In [9]:
# Print the similar pairs with their scores. The helper's return value is kept
# in similar_users because later cells pass it to the min-hash evaluation and
# test LSH candidate pairs for membership in it.
print("Similar Pairs: ")
similar_users = print_similar_pairs(jaccard_scores)

Similar Pairs: 
Pair: (408, 898) - Jaccard Score: 83.87%
Pair: (328, 788) - Jaccard Score: 67.30%
Pair: (489, 587) - Jaccard Score: 62.99%
Pair: (600, 826) - Jaccard Score: 54.55%
Pair: (451, 489) - Jaccard Score: 53.33%
Pair: (674, 879) - Jaccard Score: 52.17%
Pair: (554, 764) - Jaccard Score: 51.70%
Pair: (197, 826) - Jaccard Score: 51.30%
Pair: (197, 600) - Jaccard Score: 50.00%
Pair: (800, 879) - Jaccard Score: 50.00%


### Output the movie titles that the most similar pair of users has seen.
The most similar pair of users are 408 and 898 with a Jaccard score of 83.87%. The movies that these users have seen are presented below:

In [10]:
# Join the ratings of the most similar pair with the movies table so that each
# rating row carries the movie title.
joined: DataFrame = ratings_df.loc[ratings_df.userid.isin([408, 898])].merge(movies_df, left_on="movieid", right_on="id")

# Collect, once per title, the ids of the users who rated it. groupby returns
# the titles in sorted order, and np.sort makes the ids print in a consistent
# order (always "[408 898]", never "[898 408]").
viewers_by_title = joined.groupby("title_year")["userid"].apply(lambda ids: np.sort(ids.to_numpy()))

print("Movies seen by users 408 and/or 898: \n")
print(f"{'id':10}Movie")
for movie, viewers in viewers_by_title.items():
    print(f"{str(viewers):10}{movie}")

Movies seen by users 408 and/or 898: 

id        Movie
[408 898] Air Force One (1997)
[898]     Alien: Resurrection (1997)
[408 898] Apt Pupil (1998)
[898]     As Good As It Gets (1997)
[408 898] Conspiracy Theory (1997)
[408 898] Contact (1997)
[408 898] Cop Land (1997)
[898]     Deceiver (1997)
[408 898] English Patient, The (1996)
[408 898] Everyone Says I Love You (1996)
[408 898] Gattaca (1997)
[408 898] Good Will Hunting (1997)
[408 898] Indian Summer (1996)
[408 898] Jackal, The (1997)
[898]     Jungle2Jungle (1997)
[898 408] Kolya (1996)
[408 898] L.A. Confidential (1997)
[408]     Liar Liar (1997)
[408 898] Lost Highway (1997)
[408 898] Midnight in the Garden of Good and Evil (1997)
[408 898] Mouse Hunt (1997)
[408 898] Rainmaker, The (1997)
[408 898] Rocket Man (1997)
[408 898] Saint, The (1997)
[408 898] Scream (1996)
[408 898] Spawn (1997)
[408 898] Starship Troopers (1997)
[408 898] Titanic (1997)
[898 408] Tomorrow Never Dies (1997)
[408 898] U Turn (1997)
[408 898] Wag t

In [11]:
# Attach the demographic attributes from users_df to the rating rows of
# users 408 and 898, then collapse the result to one row per user.
pair_ratings = ratings_df.loc[ratings_df.userid.isin([408, 898])]
demographic: DataFrame = pair_ratings.merge(users_df, left_on="userid", right_on="id")

demographic_cols = ["userid", "age", "gender", "occupation", "postcode"]
demographic[demographic_cols].drop_duplicates()

Unnamed: 0,userid,age,gender,occupation,postcode
0,408,23,M,student,61755
27,898,23,M,homemaker,61755


# 3) Compute similarity using Min-hash signatures

<u>Description of hash functions</u>: use the following family of hash functions: $h_{a,b}(x) = (ax+b) \bmod R$, with $a, b$ random integers in the interval $(0, R)$ and $R$ a large enough prime number that you may want to finetune in your initial experimentation. Make sure that each hash function uses a different $(a, b)$ pair.

<u>Evaluation of Min-hashing</u>: Use 50, 100, and 200 hash functions. For each value, output the pair of users that have estimated similarity at least 0.5, and report the number of false positives and false negatives (against the exact Jaccard similarity) that you obtain. For the false positives and negatives, report the averages for 5 different runs using different functions. Comment on how the number of hash functions affects the false positive and negatives figures.

### Create user matrices

Before proceeding, we will create the "Document" of each user. This, in practice, is a list in which we assign the number 1 if the user has seen the movie with id i and 0 if he hasn't.

* Keep in mind that the movie ids start from one, while Python indices start from zero.

In [12]:
# Build the binary "document" of every user: position k (0-based) holds 1 if
# the user has rated the movie with id k+1 and 0 otherwise.
# The largest movie id is computed once here instead of once per user.
max_movie_id = ratings_df.movieid.max()
movies = user_movies.apply(
    lambda seen: [1 if movie_id in seen else 0 for movie_id in range(1, max_movie_id + 1)]
)
movies.head()

userid
1    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
2    [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
5    [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
dtype: object

* We need a value of R that is large enough to give at least 1682 different hash values (one per distinct movie). This way, when hashing the movie ids (rows), no two rows are mapped to the same value (hash collision).

In [13]:
# The largest movie id in the ratings is used as the number of movies
number_of_movies: int = ratings_df["movieid"].max()
number_of_movies

1682

In [14]:
# Row indices of the characteristic matrix: 1 .. length of a user's document
# (user 1's document is used to read off that length)
document_length = len(movies.loc[1])
rows: np.ndarray = np.arange(1, document_length + 1)

# Find the optimal R which avoids hash conflicts
R: int = find_optimal_R(rows=rows, number_of_movies=number_of_movies)
print(f"Optimal value for R is: {R}")

Optimal value for R is: 1693


Detailed results will be saved in the estimated_similarities.csv file, and during iterations, only a summary for each run will be printed.

In [15]:
# Open the CSV file that receives the detailed min-hash results.
# newline="" is what the csv module expects for file objects: the writer then
# controls the line endings itself ("\n" here) on every platform.
# The handle stays open on purpose; the evaluation cell closes it when done.
fw: TextIO = (Path("files") / "estimated_similarities.csv").open(mode="w", encoding="utf8", newline="")
writer = csv.writer(fw, lineterminator="\n")

# Header row; the trailing semicolon hides writerow's return value from the cell output
writer.writerow(["hash_functions","iteration", "user1", "user2", "estimated_similarity", "evaluation"]);

69

For 50, 100 and 200 hash functions, 5 runs will be performed. After the 5 runs for each number of hash functions, the average number of TP, FP and FN will be printed.

In [15]:
# Number of independent runs averaged for every number of hash functions.
n_runs: int = 5

# The try/finally guarantees the CSV handle opened in the previous cell is
# closed even if one of the helper calls raises.
try:
    for n in [50, 100, 200]:
        # Totals over the runs for this number of hash functions
        fn, fp, tp = 0, 0, 0
        for i in range(n_runs):
            print(f"Number of Hash functions: {n}\nIteration {i+1}")

            # create signatures
            signature: pd.DataFrame = generate_signatures(n=n, movies=movies, R=R, iteration=i)

            # compute jaccard scores on the signatures
            jaccard_scores_hash: Dict[int, list] = compute_jaccard_hash(data=signature, similar_users=similar_users)

            # Evaluate scores against the exact similar pairs
            # NOTE(review): evaluate_jaccard presumably writes the per-pair rows through
            # `writer` and prints the summary dict — confirm in helper_functions.
            approx_scores: dict = evaluate_jaccard(jaccard_scores=jaccard_scores_hash, similar_users=similar_users, n=n, iteration=i+1, writer=writer)

            # A category that did not occur is missing from the dict, hence the 0 default
            fn += approx_scores.get("False Negatives", 0)
            fp += approx_scores.get("False Positives", 0)
            tp += approx_scores.get("True Positives", 0)
            print("\n")

            # Save the signature built with 200 hash functions. The file is
            # overwritten on every run, so it ends up holding the last run's signature.
            # Fix: the original dumped `rows` (the movie row indices) instead of the signature.
            if n == 200:
                with (Path("files") / "signature.pkl").open("wb") as f:
                    pkl.dump(signature, f)

        print(f"Average for {n}: \nFalse Negatives - {fn/n_runs} \nFalse Positives - {fp/n_runs} \nTrue Positives - {tp/n_runs}")
        print("\n\n")
finally:
    fw.close()

Number of Hash functions: 50
Iteration 1
{'False Negatives': 2, 'False Positives': 79, 'True Positives': 8}


Number of Hash functions: 50
Iteration 2
{'False Negatives': 3, 'False Positives': 104, 'True Positives': 7}


Number of Hash functions: 50
Iteration 3
{'False Positives': 161, 'True Positives': 10}


Number of Hash functions: 50
Iteration 4
{'False Negatives': 3, 'False Positives': 108, 'True Positives': 7}


Number of Hash functions: 50
Iteration 5
{'False Negatives': 2, 'False Positives': 109, 'True Positives': 8}


Average for 50: 
False Negatives - 2.0 
False Positives - 112.2 
True Positives - 8.0



Number of Hash functions: 100
Iteration 1
{'False Positives': 20, 'True Positives': 10}


Number of Hash functions: 100
Iteration 2
{'False Negatives': 4, 'False Positives': 21, 'True Positives': 6}


Number of Hash functions: 100
Iteration 3
{'False Negatives': 2, 'False Positives': 43, 'True Positives': 8}


Number of Hash functions: 100
Iteration 4
{'False Negatives': 2, '

# 4) Locate similar users using LSH index

Using a set of 200 hash functions break up the signatures into b bands with r hash functions per band (b*r=200) and implement Locality Sensitive Hashing.
Recall that with LSH we first locate users that are similar (have the same mini-signatures) across at least one band and then assess their true similarity using their initial representations. Use the following two instances of LSH:
* LSH instance 1: b = 25, r = 8
* LSH instance 2: b = 40, r = 5

Using each instance find the pair of users with similarity at least 0.5 and report:
- The number of true pairs returned (true positives).
- The number of similarity evaluations performed using the initial representations.

* In order to run this part independently, we will load the signatures from the pickle file, in case the variable is not available.

In [16]:
# Per value of r: [average number of true positives, average number of evaluations]
results: dict = {"5" : [], "8": []}

for r in (5, 8):
    # Totals over the 5 runs; they are turned into averages after the loop
    tp_count: int = 0
    avg_evaluations: int = 0

    for i in range(5):
        # Candidate pairs returned by the LSH helper for this run
        similar_users_band = calculate_similar_users_LSH(r=r, iteration=i)

        print(f"True Positive Pairs for r = {r}, iteration {i}")
        for pair in similar_users_band:
            # Exact Jaccard evaluation of the candidate on the original movie sets
            jaccard_scores_LSH = compute_jaccard(data=user_movies.loc[list(pair)])

            # Skip candidates with an empty result or that are not true similar pairs
            if not jaccard_scores_LSH or pair not in similar_users:
                continue

            print_similar_pairs(jaccard_scores_LSH)
            tp_count += 1
        print()

        # Every candidate pair costs one similarity evaluation
        avg_evaluations += len(similar_users_band)

    results[str(r)].extend([tp_count / 5, avg_evaluations / 5])



True Positive Pairs for r = 5, iteration 0
Pair: (328, 788) - Jaccard Score: 67.30%
Pair: (408, 898) - Jaccard Score: 83.87%
Pair: (489, 587) - Jaccard Score: 62.99%

True Positive Pairs for r = 5, iteration 1
Pair: (600, 826) - Jaccard Score: 54.55%
Pair: (328, 788) - Jaccard Score: 67.30%
Pair: (489, 451) - Jaccard Score: 53.33%
Pair: (408, 898) - Jaccard Score: 83.87%
Pair: (554, 764) - Jaccard Score: 51.70%
Pair: (674, 879) - Jaccard Score: 52.17%
Pair: (489, 587) - Jaccard Score: 62.99%
Pair: (600, 197) - Jaccard Score: 50.00%

True Positive Pairs for r = 5, iteration 2
Pair: (489, 587) - Jaccard Score: 62.99%
Pair: (328, 788) - Jaccard Score: 67.30%
Pair: (554, 764) - Jaccard Score: 51.70%
Pair: (408, 898) - Jaccard Score: 83.87%
Pair: (826, 197) - Jaccard Score: 51.30%
Pair: (489, 451) - Jaccard Score: 53.33%

True Positive Pairs for r = 5, iteration 3
Pair: (408, 898) - Jaccard Score: 83.87%
Pair: (674, 879) - Jaccard Score: 52.17%
Pair: (489, 451) - Jaccard Score: 53.33%
Pair:

In [17]:
# Summarise the LSH runs: entry 0 is the average number of true positives,
# entry 1 the average number of similarity evaluations
for r, summary in results.items():
    avg_true_positives = summary[0]
    avg_num_evaluations = summary[1]
    print(f"Results for r={r}:")
    print(f"\tAverage number of True Positives: {avg_true_positives}")
    print(f"\tAverage number Evaluations: {avg_num_evaluations}\n")

Results for r=5:
	Average number of True Positives: 6.0
	Average number Evaluations: 1279.4

Results for r=8:
	Average number of True Positives: 3.0
	Average number Evaluations: 38.4

