# BLU10 - Exercises Notebook

In [1]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd

import scipy.sparse
from scipy.sparse import random, coo_matrix, lil_matrix, dok_matrix, csr_matrix, csc_matrix

from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

## Q0: Create the ratings matrix (ungraded)

In [2]:
def read_data():
    """Load the ratings CSV as a 2-D float array.

    Reads ``data/ml-latest-small/ratings.csv`` (relative to the current
    working directory), skips the header row and keeps only the first three
    comma-separated columns of every line.

    Returns
        ratings - (numpy.ndarray) one row per data line, holding the values of
                  columns 0, 1 and 2 (consumed by make_ratings as user id,
                  item id and rating).
    """
    ratings_csv = os.path.join('data', 'ml-latest-small', 'ratings.csv')
    return np.genfromtxt(
        ratings_csv,
        delimiter=',',
        skip_header=1,
        usecols=[0, 1, 2],
    )


data = read_data()
data

array([[1.000e+00, 3.100e+01, 2.500e+00],
       [1.000e+00, 1.029e+03, 3.000e+00],
       [1.000e+00, 1.061e+03, 3.000e+00],
       ...,
       [6.710e+02, 6.365e+03, 4.000e+00],
       [6.710e+02, 6.385e+03, 2.500e+00],
       [6.710e+02, 6.565e+03, 3.500e+00]])

In [3]:
def make_ratings(data):
    """Pivot (user, item, rating) rows into a dense user-by-item matrix.

    Parameters
        data - 2-D array whose columns 0, 1 and 2 hold the user id, the item
               id and the rating value.

    Returns
        R - (numpy.ndarray) float matrix with one row per distinct user id and
            one column per distinct item id, both in ascending id order.
            Cells for which no rating was given stay at 0.
    """
    # return_inverse maps every input row to the position of its id in the
    # sorted list of distinct ids, i.e. directly to a matrix row / column.
    user_ids, row_idx = np.unique(data[:, 0], return_inverse=True)
    item_ids, col_idx = np.unique(data[:, 1], return_inverse=True)

    ratings = np.zeros((user_ids.size, item_ids.size))
    ratings[row_idx, col_idx] = data[:, 2]
    return ratings


R = make_ratings(data)
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [4]:
f"We have {R.shape[0]} user and {R.shape[1]} items."

'We have 671 user and 9066 items.'

In [5]:
expected_hash = '3e68e59862d4b598351ba37b6103d22d125e0ec19f8c91a97188ffc78e156137'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

## Q1: Convert the Ratings Matrix to a Sparse Representation

In [6]:
def get_csr(R):
    """
    Build the Compressed Sparse Row form of the ratings matrix.

    Parameters
        R - Ratings Matrix

    Returns
        H_ - Compressed Sparse Row Matrix constructed from R
    """
    return csr_matrix(R)
    
    
H_ = get_csr(R)

In [7]:
expected_hash = '1d6952c195999c1f9de9e2e0f47e3c748cd784aa9578645e656f34ec6d546170'
assert hashlib.sha256(str(H_).encode()).hexdigest() == expected_hash

## Q2: What is the space efficiency of converting to a sparse representation?
As a percentage, rounded to 2 decimal places.

In [8]:
def get_sparsity_score(R):
    """
    Percentage of cells of the ratings matrix that are non-zero.

    Note that, despite the name, a low score means a very sparse matrix: the
    value is the share of filled cells, not of empty ones.

    Parameters
        R - Ratings Matrix
        
    Returns:
        sparsity_score - (float) non-zero entries of R as a percentage of all
                         entries, rounded to 2 decimal places. An empty matrix
                         yields 0.0.
    """
    # An empty matrix has no cells at all; report 0.0 instead of dividing by zero.
    if R.size == 0:
        return np.round(0.0, 2)

    # count_nonzero only counts; it does not copy every non-zero value into a
    # temporary array just to read that array's length.
    rated_cells = np.count_nonzero(R)
    return np.round(rated_cells * 100 / R.size, 2)
    
sparsity_score = get_sparsity_score(R)
sparsity_score

1.64

In [9]:
expected_hash = '41a05f83a0b06fc85074bed9caee5fcf753ffde064f211c8ff11f1a29edd2b72'
assert hashlib.sha256(str(sparsity_score).encode()).hexdigest() == expected_hash

In [10]:
f"The Sparsity Score is {sparsity_score}%."

'The Sparsity Score is 1.64%.'

In [11]:
def get_sparsity_representation_savings(H_, R):
    """
    Relate the size of the sparse matrix to the size of the dense one.

    Parameters
        H_ - CSR Matrix
        R  - Original ratings matrix
        
    Returns
        savings - (float) H_.size expressed as a percentage of R.size,
                  rounded to 2 decimal places.
    """
    sparse_size = H_.size
    dense_size = R.size
    size_ratio_pct = sparse_size * 100 / dense_size
    return np.round(size_ratio_pct, 2)

sparsity_representation_savings = get_sparsity_representation_savings(H_, R)
sparsity_representation_savings

1.64

In [12]:
expected_hash = '41a05f83a0b06fc85074bed9caee5fcf753ffde064f211c8ff11f1a29edd2b72'
assert hashlib.sha256(str(sparsity_representation_savings).encode()).hexdigest() == expected_hash

In [13]:
f"The Sparse Representation size is {sparsity_representation_savings}% of the original matrix."

'The Sparse Representation size is 1.64% of the original matrix.'

In [14]:
# Just to make sure that the dense representation of the sparse matches the original ratings matrix
assert H_.todense().shape == R.shape

## Q3: Popular Items - What are the Top-3 Most Rated items?
More ratings give us the current trends but not necessarily the best suggestions.

In [15]:
def top3items(R, n=3):
    """
    Rank items (columns of R) by how many positive ratings they received.

    Parameters
        R - Ratings Matrix
        n - Number of Top-n items to retrieve
        
    Returns
        most_rated - (numpy.ndarray) column indices of the n most rated items,
                     the most rated one first
    """
    ratings_per_item = (R > 0).sum(axis=0)
    # Sorting the negated counts ascending ranks the counts descending.
    ranking = np.argsort(-ratings_per_item)
    return ranking[:n]

most_wanted = top3items(R)

In [16]:
expected_hash = '41f87af66749260f2d7a04703f1069a37e58135d2e9f43e4c1f94e026cff1117'
assert hashlib.sha256(str(most_wanted).encode()).hexdigest() == expected_hash

## Q4: Influencers - What are the Top-5 Most Active Users?

In [17]:
def get_influencers(R, n=5):
    """
    Rank users (rows of R) by how many positive ratings they gave.

    Parameters
        R - Ratings Matrix
        n - number of top-n most active users
        
    Returns
        influencers - (numpy.ndarray) row indices of the n most active users,
                      the most active one first
    """
    ratings_per_user = (R > 0).sum(axis=1)
    # Sorting the negated counts ascending ranks the counts descending.
    ranking = np.argsort(-ratings_per_user)
    return ranking[:n]
influencers = get_influencers(R)

In [18]:
expected_hash = 'e2440e4dd3a85b37a5401b3d4e5c437aaa26de0620f19178bc782c82dd4c3432'
assert hashlib.sha256(str(influencers).encode()).hexdigest() == expected_hash

## Q5: Elite - What are the Top-7 Best Rated Items?
Since the mean rating can be biased by a low number of ratings, we only consider items with at least 10 ratings.

In [28]:
def remove_zeros(R):
    """
    Return a copy of R in which every zero entry is replaced by NaN.

    Parameters
        R - (numpy.ndarray) Ratings Matrix; it is not modified

    Returns
        R_ - (numpy.ndarray) copy of R with NaN wherever R equals 0. Integer
             and boolean inputs come back as float arrays because NaN cannot
             be stored in those dtypes.
    """
    R_ = R.copy()
    # NaN only exists for floating-point (and complex) dtypes, so promote
    # anything else before writing it.
    if not np.issubdtype(R_.dtype, np.inexact):
        R_ = R_.astype(float)
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    R_[R_ == 0] = np.nan
    
    return R_

def elite(R, n=7, k=10):
    """
    Rank the items that have enough ratings by their mean rating.

    Only items (columns of R) with at least k positive ratings take part in
    the ranking, so the result can hold fewer than n items when not enough of
    them qualify. Zero entries are treated as "not rated" and are left out of
    the mean. Items with the same mean are returned in ascending index order.

    Parameters
        R - Ratings Matrix
        n - Top-n items
        k - Minimum number of ratings
        
    Returns
        best_items - (numpy.ndarray) column indices of the top-n best mean
                     rated items, the best one first
    """
    ratings_per_item = (R > 0).sum(axis=0)
    # An item needs at least one rating to have a mean at all, whatever k is.
    eligible = np.flatnonzero(ratings_per_item >= max(k, 1))
    if eligible.size == 0:
        return eligible

    # Indexing with an index array already yields a fresh array, so R itself
    # is never modified; copy=False merely avoids a second copy.
    ratings = R[:, eligible].astype(float, copy=False)
    ratings[ratings == 0] = np.nan
    mean_rating = np.nanmean(ratings, axis=0)

    # Ascending sort of the negated means ranks the means descending; a
    # stable sort makes the order of tied items deterministic.
    order = np.argsort(-mean_rating, kind='stable')
    return eligible[order[:n]]
    
best_items = elite(R)
best_items

[-3.87246964 -3.40186916 -3.16101695 ...         nan         nan
         nan]




array([1501, 2770,  695,  284, 1510, 5328, 1507])

In [27]:
expected_hash = '25af7ec79f15bf252f96440f68995619f7b85bdfc1f89d69a227cd9d4cf80e7d'
assert hashlib.sha256(str(best_items).encode()).hexdigest() == expected_hash

AssertionError: 

## Q6: Apriori - What are the 5 most common 3-piece itemsets?
We define "common itemsets" as sets of at least 3 different items that are bought together by at least 20% of the population.
Show your results sorted by support in descending order.

In [63]:
def getBundlesSolution(R, n=None, min_support=None, top=None):
    """
    Find the most frequent itemsets among the items rated by each user.

    Parameters
        R: Ratings Matrix
        n: Minimum number of items an itemset must contain. None keeps
           itemsets of every size.
        min_support: Minimum percentage of users that contains the itemset.
           None leaves the threshold to apriori's own default.
        top: Number of most common itemsets. None returns all of them.
        
    Return
        df: the return dataframe should have two columns ["support", "itemsets"],
            with the support percentage and the itemsets, sorted by support
            in descending order.
    """
    # One boolean column per item: True where the entry of R is positive.
    baskets = pd.DataFrame(R > 0)

    # Forwarding None as the threshold would fail inside apriori, so only
    # pass min_support when the caller actually supplied one.
    if min_support is None:
        itemsets = apriori(baskets)
    else:
        itemsets = apriori(baskets, min_support=min_support)

    # Comparing a length with None raises TypeError, so skip the size filter
    # when n is not given. Mapping len over the column replaces the row-wise
    # DataFrame.apply and also behaves when no itemset was found.
    if n is not None:
        itemsets = itemsets[itemsets['itemsets'].map(len) >= n]

    ranked = itemsets.sort_values('support', ascending=False)
    # A stop of None in the slice keeps every row.
    return ranked.iloc[:top]

df = getBundlesSolution(R, n=3, min_support=0.2, top=5)
df

Unnamed: 0,support,itemsets
251,0.271237,"(321, 266, 525)"
244,0.254844,"(321, 266, 284)"
263,0.253353,"(321, 427, 525)"
248,0.251863,"(321, 266, 427)"
256,0.248882,"(321, 284, 525)"


In [64]:
expected_hash = 'f7441550a0ca5274581d023417c99540e3a8a4cca68824a87cbe6d95c07742ea'
assert hashlib.sha256(str(df.shape).encode()).hexdigest() == expected_hash

expected_hash = '62572327fb0a94f45a9bc8604ccb94f8925a6883b7fb5cd4786c736824ccb8b8'
assert hashlib.sha256(str(df.iloc[0,1]).encode()).hexdigest() == expected_hash

np.testing.assert_approx_equal(df.iloc[2, 0], 0.253353, significant=4)