In [1]:
import collections
import csv
import itertools

import IPython
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly as py
from plotly.offline import download_plotlyjs
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

np.random.seed(1337)

%matplotlib inline

IPython.core.display.display(IPython.core.display.HTML(
    "<style>.container { width:90% !important; }</style>"))

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

py.offline.init_notebook_mode(connected=True)

## Load the data

In [2]:
movies = pd.read_csv('data/movies_clean.csv')
ratings = pd.read_csv('data/ratings_train.csv')

## Association rules

In [3]:
# parameters of the association rules
min_item_support = 0.01
min_pair_support = 0.001
min_confidence = 0.5

In [4]:
# create transaction list
# in this case a transaction is a user's every rating
transactions = set()
for _, rating_group in ratings.groupby(['user_id']):
    transactions.add(frozenset(rating_group.movie_id.tolist()))
print(len(transactions))

138493


In [5]:
# find frequent items and their frequencies
def find_frequent_items(transactions, min_support):
    item_counter = collections.Counter([item_id for transaction in transactions for item_id in transaction])
    min_count = len(transactions) * min_support
    frequent_items = {}
    for item_id, item_count in item_counter.items():
        if item_count >= min_count:
            frequent_items[item_id] = item_count
    return frequent_items
frequent_movies = find_frequent_items(transactions, min_item_support)
len(frequent_movies)

2233

In [6]:
# find frequent item pairs and their frequencies
def find_frequent_pairs(transactions, frequent_items, min_support):
    frequent_item_ids = set(frequent_items.keys())
    pair_counts = collections.defaultdict(int)
    min_count = len(transactions) * min_support
    frequent_pairs = {}
    i = 1
    for transaction in transactions:
        if i % 1000 == 0:
            if i % 10000 == 0:
                print('{0:0.1f}%'.format(100.0 * i / len(transactions)), end='')
            else:
                print('.', end='')
        i += 1
        frequent_transaction_items = transaction.intersection(frequent_item_ids)
        for pair in itertools.combinations(frequent_transaction_items, 2):
            pair_counts[pair] += 1
    for pair, pair_count in pair_counts.items():
        if pair_count >= min_count:
            frequent_pairs[pair] = pair_count
    return frequent_pairs
frequent_movie_pairs = find_frequent_pairs(transactions, frequent_movies, min_pair_support)
len(frequent_movie_pairs)

.........7.2%.........14.4%.........21.7%.........28.9%.........36.1%.........43.3%.........50.5%.........57.8%.........65.0%.........72.2%.........79.4%.........86.6%.........93.9%........

2505759

In [7]:
# calculate association rules that meet the minimum confidence criteria based on the frequent item pairs
def calculate_association_rules(frequent_items, frequent_pairs, n_transactions):
    rules = []
    for source, source_freq in frequent_items.items():
        for pair, pair_freq in frequent_pairs.items():
            if source in pair:
                target = list(set(pair).difference(set([source])))[0]
                support = 1.0 * pair_freq / n_transactions
                confidence = 1.0 * pair_freq / source_freq
                if confidence > min_confidence:
                    rules.append((source, target, support, confidence))
    return rules
rules = calculate_association_rules(frequent_movies, frequent_movie_pairs, len(transactions))
len(rules)

35723

In [8]:
# create a data frame of the rules
rules_dict = []
movie_names_map = dict(zip(movies.movie_id.tolist(), movies.title.tolist()))
for r in rules:
    rules_dict.append({
        'source_id': r[0],
        'source_name': movie_names_map[r[0]],
        'target_id': r[1],
        'target_name': movie_names_map[r[1]],
        'support': r[2],
        'confidence': r[3]})
rules_df = pd.DataFrame(rules_dict)[['source_id', 'source_name', 'target_id', 'target_name', 'support', 'confidence']]
rules_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35723 entries, 0 to 35722
Data columns (total 6 columns):
source_id      35723 non-null int64
source_name    35723 non-null object
target_id      35723 non-null int64
target_name    35723 non-null object
support        35723 non-null float64
confidence     35723 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 1.6+ MB


In [9]:
rules_df.sample(20)

Unnamed: 0,source_id,source_name,target_id,target_name,support,confidence
33460,47200,Crank (2006),1198,Raiders of the Lost Ark (I...,0.006,0.511
25706,30822,In Good Company (2004),1,Toy Story (1995),0.006,0.598
32080,5452,Look Who's Talking Now (1993),260,Star Wars: Episode IV - A ...,0.007,0.573
29710,1082,"Candidate, The (1972)",1198,Raiders of the Lost Ark (I...,0.006,0.535
4122,4239,Blow (2001),110,Braveheart (1995),0.022,0.509
34913,2644,Dracula (1931),1219,Psycho (1960),0.007,0.548
18713,69481,"Hurt Locker, The (2008)",70286,District 9 (2009),0.012,0.518
31440,6565,Seabiscuit (2003),7153,Lord of the Rings: The Ret...,0.016,0.566
21730,44022,Ice Age 2: The Meltdown (2...,47,Seven (a.k.a. Se7en) (1995),0.007,0.518
12283,8914,Primer (2004),780,Independence Day (a.k.a. I...,0.008,0.535


In [10]:
len(rules_df.source_id.unique())

2209

In [11]:
len(rules_df.target_id.unique())

472

In [12]:
rules_df.sort_values(by='support', ascending=False).head(20)

Unnamed: 0,source_id,source_name,target_id,target_name,support,confidence
1717,318,"Shawshank Redemption, The ...",296,Pulp Fiction (1994),0.21,0.573
1680,296,Pulp Fiction (1994),318,"Shawshank Redemption, The ...",0.21,0.54
1681,296,Pulp Fiction (1994),356,Forrest Gump (1994),0.21,0.539
3092,356,Forrest Gump (1994),296,Pulp Fiction (1994),0.21,0.548
3091,356,Forrest Gump (1994),318,"Shawshank Redemption, The ...",0.192,0.501
1718,318,"Shawshank Redemption, The ...",356,Forrest Gump (1994),0.192,0.523
829,480,Jurassic Park (1993),296,Pulp Fiction (1994),0.187,0.544
1964,457,"Fugitive, The (1993)",296,Pulp Fiction (1994),0.171,0.596
1966,457,"Fugitive, The (1993)",480,Jurassic Park (1993),0.169,0.589
111,589,Terminator 2: Judgment Day...,593,"Silence of the Lambs, The ...",0.166,0.554


In [13]:
rules_df.sort_values(by='confidence', ascending=False).head(20)

Unnamed: 0,source_id,source_name,target_id,target_name,support,confidence
16385,2034,"Black Hole, The (1979)",260,Star Wars: Episode IV - A ...,0.01,0.789
30091,68791,Terminator Salvation (2009),2571,"Matrix, The (1999)",0.009,0.752
22925,259,Kiss of Death (1995),296,Pulp Fiction (1994),0.009,0.745
33690,544,Striking Distance (1993),589,Terminator 2: Judgment Day...,0.009,0.744
34416,159,Clockers (1995),296,Pulp Fiction (1994),0.01,0.743
10847,94864,Prometheus (2012),2571,"Matrix, The (1999)",0.009,0.743
5925,6934,"Matrix Revolutions, The (2...",2571,"Matrix, The (1999)",0.053,0.741
14857,548,Terminal Velocity (1994),589,Terminator 2: Judgment Day...,0.012,0.737
21528,5040,Conan the Destroyer (1984),2571,"Matrix, The (1999)",0.008,0.736
11146,7373,Hellboy (2004),2571,"Matrix, The (1999)",0.024,0.734


In [14]:
rules_df[rules_df.source_id == 52281].sort_values(by=['confidence', 'support'], ascending=False)

Unnamed: 0,source_id,source_name,target_id,target_name,support,confidence
13276,52281,Grindhouse (2007),2571,"Matrix, The (1999)",0.015,0.692
13291,52281,Grindhouse (2007),32587,Sin City (2005),0.013,0.623
13277,52281,Grindhouse (2007),47,Seven (a.k.a. Se7en) (1995),0.013,0.613
13290,52281,Grindhouse (2007),5952,Lord of the Rings: The Two...,0.013,0.594
13282,52281,Grindhouse (2007),1089,Reservoir Dogs (1992),0.012,0.58
13275,52281,Grindhouse (2007),33794,Batman Begins (2005),0.012,0.571
13287,52281,Grindhouse (2007),7438,Kill Bill: Vol. 2 (2004),0.012,0.567
13295,52281,Grindhouse (2007),3578,Gladiator (2000),0.012,0.555
13278,52281,Grindhouse (2007),58559,"Dark Knight, The (2008)",0.012,0.552
13297,52281,Grindhouse (2007),1210,Star Wars: Episode VI - Re...,0.012,0.549


In [15]:
rules_df.to_csv('data/association_rules.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)