In [1]:
import collections
import csv
import itertools

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

np.random.seed(1337)

%matplotlib inline

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

## Load the data

In [2]:
movies = pd.read_csv('data/movies_clean.csv')
ratings = pd.read_csv('data/ratings_train.csv')

## Association rules

In [3]:
min_item_support = 0.01
min_pair_support = 0.005
min_confidence = 0.5

In [4]:
transactions = set()
for _, rating_group in ratings.groupby(['user_id']):
    transactions.add(frozenset(rating_group.movie_id.tolist()))
print(len(transactions))

138493


In [5]:
def find_frequent_items(transactions, min_support):
    item_counter = collections.Counter([item_id for transaction in transactions for item_id in transaction])
    min_count = len(transactions) * min_support
    frequent_items = {}
    for item_id, item_count in item_counter.items():
        if item_count >= min_count:
            frequent_items[item_id] = item_count
    return frequent_items
frequent_movies = find_frequent_items(transactions, min_item_support)
len(frequent_movies)

2233

In [6]:
def find_frequent_pairs(transactions, frequent_items, min_support):
    frequent_item_ids = set(frequent_items.keys())
    pair_counts = collections.defaultdict(int)
    min_count = len(transactions) * min_support
    frequent_pairs = {}
    i = 1
    for transaction in transactions:
        if i % 1000 == 0:
            if i % 10000 == 0:
                print('{0:0.1f}%'.format(100.0 * i / len(transactions)), end='')
            else:
                print('.', end='')
        i += 1
        frequent_transaction_items = transaction.intersection(frequent_item_ids)
        for pair in itertools.combinations(frequent_transaction_items, 2):
            pair_counts[pair] += 1
    for pair, pair_count in pair_counts.items():
        if pair_count >= min_count:
            frequent_pairs[pair] = pair_count
    return frequent_pairs
frequent_movie_pairs = find_frequent_pairs(transactions, frequent_movies, min_pair_support)
len(frequent_movie_pairs)

.........7.2%.........14.4%.........21.7%.........28.9%.........36.1%.........43.3%.........50.5%.........57.8%.........65.0%.........72.2%.........79.4%.........86.6%.........93.9%........

669612

In [7]:
def calculate_association_rules(frequent_items, frequent_pairs, n_transactions):
    rules = []
    for source, source_freq in frequent_items.items():
        for pair, pair_freq in frequent_pairs.items():
            if source in pair:
                target = list(set(pair).difference(set([source])))[0]
                support = 1.0 * pair_freq / n_transactions
                confidence = 1.0 * pair_freq / source_freq
                if confidence > min_confidence:
                    rules.append((source, target, support, confidence))
    return rules
rules = calculate_association_rules(frequent_movies, frequent_movie_pairs, len(transactions))
len(rules)

35723

In [8]:
rules_dict = []
movie_names_map = dict(zip(movies.movie_id.tolist(), movies.title.tolist()))
for r in rules:
    rules_dict.append({
        'source_id': r[0],
        'source_name': movie_names_map[r[0]],
        'target_id': r[1],
        'target_name': movie_names_map[r[1]],
        'support': r[2],
        'confidence': r[3]})
rules_df = pd.DataFrame(rules_dict)[['source_id', 'source_name', 'target_id', 'target_name', 'support', 'confidence']]
rules_df.sample(30)

Unnamed: 0,source_id,source_name,target_id,target_name,support,confidence
33460,47200,Crank (2006),1198,Raiders of the Lost Ark (I...,0.006,0.511
25706,30822,In Good Company (2004),1,Toy Story (1995),0.006,0.598
32080,5452,Look Who's Talking Now (1993),260,Star Wars: Episode IV - A ...,0.007,0.573
29710,1082,"Candidate, The (1972)",1198,Raiders of the Lost Ark (I...,0.006,0.535
4122,4239,Blow (2001),110,Braveheart (1995),0.022,0.509
34913,2644,Dracula (1931),1219,Psycho (1960),0.007,0.548
18713,69481,"Hurt Locker, The (2008)",70286,District 9 (2009),0.012,0.518
31440,6565,Seabiscuit (2003),7153,Lord of the Rings: The Ret...,0.016,0.566
21730,44022,Ice Age 2: The Meltdown (2...,47,Seven (a.k.a. Se7en) (1995),0.007,0.518
12283,8914,Primer (2004),780,Independence Day (a.k.a. I...,0.008,0.535


In [9]:
rules_df.to_csv('data/association_rules.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)