## Chapter 15 Association Rules and Collaborative Filtering

**Original Code Credit:**: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R.. Data Mining for Business Analytics Wiley.

*Modifications* have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.

### Import Libraries

In [21]:
import os
import heapq
import random
from collections import defaultdict
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori, association_rules
import surprise
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

### 15.1 Association Rules

#### Example 1: Synthetic Data on Purchases of Phone Faceplates

In [4]:
# Load and preprocess data set 
fp_df = pd.read_csv(os.path.join('..', 'data', 'Faceplate.csv'))
fp_df.set_index('Transaction', inplace=True)
fp_df = fp_df.astype(bool, 0)

# create frequent itemsets
itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)

# convert into rules
rules = association_rules(itemsets, num_itemsets=len(fp_df), metric='confidence', min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift','leverage']].sort_values(by=['lift'], ascending=False).head(6)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
12,"(White, Red)",(Green),0.2,0.5,2.5,0.12
15,(Green),"(White, Red)",0.2,1.0,2.5,0.12
4,(Green),(Red),0.2,1.0,1.666667,0.08
13,"(White, Green)",(Red),0.2,1.0,1.666667,0.08
7,(Orange),(White),0.2,1.0,1.428571,0.06
8,(Green),(White),0.2,1.0,1.428571,0.06


#### Example 2: Rules for Similar Book Purchases

In [6]:
# load dataset
all_books_df = pd.read_csv(os.path.join('..', 'data', 'CharlesBookClub.csv'))
ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',
          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']
count_books = all_books_df.drop(columns=ignore)
count_books[count_books > 0] = 1
count_books = count_books.astype(bool, 0)
# create frequent itemsets and rules
itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)
rules = association_rules(itemsets, num_itemsets=len(count_books), metric='confidence', min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift','leverage']].sort_values(by=['lift'], ascending=False).head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage
64,"(RefBks, YouthBks)","(ChildBks, CookBks)",0.05525,0.68,2.809917,0.035588
73,"(DoItYBks, RefBks)","(ChildBks, CookBks)",0.06125,0.662162,2.736207,0.038865
60,"(DoItYBks, YouthBks)","(ChildBks, CookBks)",0.067,0.64891,2.681448,0.042014
80,"(RefBks, GeogBks)","(ChildBks, CookBks)",0.05025,0.614679,2.539995,0.030467
69,"(GeogBks, YouthBks)","(ChildBks, CookBks)",0.06325,0.605263,2.501087,0.037961
77,"(DoItYBks, GeogBks)","(ChildBks, CookBks)",0.0605,0.59901,2.475248,0.036058
67,"(GeogBks, ChildBks, CookBks)",(YouthBks),0.06325,0.577626,2.424452,0.037162
72,"(RefBks, ChildBks, CookBks)",(DoItYBks),0.06125,0.591787,2.323013,0.034883
48,"(DoItYBks, GeogBks)",(YouthBks),0.0545,0.539604,2.264864,0.030437
63,"(RefBks, ChildBks, CookBks)",(YouthBks),0.05525,0.533816,2.240573,0.030591


### 15.2 Collaborative Filtering

#### Example 3: Netflix Prize Contest

In [7]:
random.seed(0)
nratings = 5000
randomData = pd.DataFrame({
    'itemID': [random.randint(0,99) for _ in range(nratings)],
    'userID': [random.randint(0,999) for _ in range(nratings)],
    'rating': [random.randint(1,5) for _ in range(nratings)]
})
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    byUser = defaultdict(list)
    for p in predictions:
        byUser[p.uid].append(p)
    
    # For each user, reduce predictions to top-n
    for uid, userPredictions in byUser.items():
        byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)
    return byUser

In [8]:
# Convert the data set into the format required by the surprise package
# The columns must correspond to user id, item id, and ratings (in that order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)
# Split into training and test set
trainset, testset = train_test_split(data, test_size=.25, random_state=1)
## User-based filtering
# compute cosine similarity between users 
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset) 
# Print the recommended items for each user
top_n = get_top_n(predictions, n=4)
print('Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:5]:
    print('User {}'.format(uid))
    for prediction in user_ratings:
        print('  Item {0.iid} ({0.est:.2f})'.format(prediction), end='')
    print()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Top-4 recommended items for each user
User 6
  Item 6 (5.00)  Item 77 (2.50)  Item 60 (1.00)
User 222
  Item 77 (3.50)  Item 75 (2.78)
User 424
  Item 14 (3.50)  Item 45 (3.10)  Item 54 (2.34)
User 87
  Item 27 (3.00)  Item 54 (3.00)  Item 82 (3.00)  Item 32 (1.00)
User 121
  Item 98 (3.48)  Item 32 (2.83)


In [9]:
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
# Predict rating for user 383 and item 7
algo.predict(383, 7)

Computing the cosine similarity matrix...
Done computing similarity matrix.


Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})

#### Example 4: Predicting Movie Ratings with MovieLens Data

In [18]:
# download MovieLens data
ratings = pd.read_csv(os.path.join('..', 'data', 'MovieLensRatings.csv'))
movies = pd.read_csv(os.path.join('..', 'data', 'MovieLensMovies.csv'))
# create a dictionary of movie titles by movieId
moviesById = {movie['movieId']: movie['title'] for _, movie in movies.iterrows()}

# convert ratings into dataset suitable for scikit-surprise and split into
# training and holdout sets
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, holdoutset = train_test_split(data, test_size=0.2, random_state=1)

In [19]:
# UBCF model and prediction
sim_options = {'name': 'cosine', 'user_based': True}
ubcf = KNNBasic(sim_options=sim_options)
ubcf.fit(trainset)

# predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = ubcf.test(holdoutset)
top_n = get_top_n(predictions, n=4)
print('UBCF Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:3]:
 print(f'User {uid}')
 for prediction in user_ratings:
     print(f'  Est. {prediction.est:.2f} / act. {prediction.r_ui}:',
           f' {moviesById[prediction.iid]:40s}')

# IBCF model and prediction
sim_options = {'name': 'cosine', 'user_based': False}
ibcf = KNNBasic(sim_options=sim_options)
ibcf.fit(trainset)

# predictions
top_n = get_top_n(ibcf.test(holdoutset), n=4)
print('IBCF Top-4 recommended items for each user')
for uid, user_ratings in list(top_n.items())[:3]:
 print(f'User {uid}')
 for prediction in user_ratings:
     print(f'  Est. {prediction.est:.2f} / act. {prediction.r_ui}:',
           f' {moviesById[prediction.iid]:40s}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
UBCF Top-4 recommended items for each user
User 469
  Est. 4.45 / act. 4.0:  Usual Suspects, The (1995)              
  Est. 4.36 / act. 5.0:  Fargo (1996)                            
  Est. 4.36 / act. 3.0:  Star Wars: Episode VI - Return of the Jedi (1983)
  Est. 4.33 / act. 5.0:  Harold and Maude (1971)                 
User 187
  Est. 4.39 / act. 4.5:  Taxi Driver (1976)                      
  Est. 4.34 / act. 3.0:  Fight Club (1999)                       
  Est. 4.33 / act. 5.0:  Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
  Est. 4.32 / act. 4.5:  Aliens (1986)                           
User 399
  Est. 4.24 / act. 5.0:  Forrest Gump (1994)                     
  Est. 4.18 / act. 5.0:  Lord of the Rings: The Two Towers, The (2002)
  Est. 4.11 / act. 5.0:  Back to the Future (1985)               
  Est. 4.07 / act. 2.5:  Terminator 2: Judgment Day (1991)       
Computing the cosine similarity matrix...
D

In [22]:
ubcf_pred = ubcf.test(holdoutset)
ibcf_pred = ibcf.test(holdoutset)
random_pred = [surprise.Prediction(0, 0, random.randint(1, 5),
                                random.randint(1, 5), None)
            for _ in range(len(holdoutset))]

pd.DataFrame({
 'UBCF': {'RMSE': surprise.accuracy.rmse(ubcf_pred, verbose=False),
          'MSE': surprise.accuracy.mse(ubcf_pred, verbose=False),
          'MAE': surprise.accuracy.mae(ubcf_pred, verbose=False)},
 'IBCF': {'RMSE': surprise.accuracy.rmse(ibcf_pred, verbose=False),
          'MSE': surprise.accuracy.mse(ibcf_pred, verbose=False),
          'MAE': surprise.accuracy.mae(ibcf_pred, verbose=False)},
 'Random': {'RMSE': surprise.accuracy.rmse(random_pred, verbose=False),
            'MSE': surprise.accuracy.mse(random_pred, verbose=False),
            'MAE': surprise.accuracy.mae(random_pred, verbose=False)},
}).transpose().round(3)

Unnamed: 0,RMSE,MSE,MAE
UBCF,0.973,0.948,0.75
IBCF,0.978,0.957,0.762
Random,2.008,4.033,1.608
