In [1]:
import collections
import itertools

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

# Seed NumPy's global random number generator.
np.random.seed(1337)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn appearance: font scale 1.3, 'whitegrid' style, 'muted' palette.
sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

# pandas display: cap column width at 30 characters and format floats
# with a thousands separator and three decimals.
pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

## Load the data

In [2]:
# Movie metadata; later cells read its `movie_id` and `title` columns.
movies = pd.read_csv('data/movies_clean.csv')
# Ratings; later cells read its `user_id` and `movie_id` columns.
ratings = pd.read_csv('data/ratings_train.csv')

## Generate frequent item sets

In [3]:
# One transaction per user: the list of movie ids appearing in that
# user's rating rows. Grouping by the scalar key 'user_id' (rather than
# the one-element list ['user_id']) and selecting the column up front
# yields plain (key, Series) pairs; the group key itself is not needed.
transactions = [
    movie_ids.tolist()
    for _, movie_ids in ratings.groupby('user_id')['movie_id']
]
print(len(transactions))

138493


In [4]:
def calculate_itemsets_one(ts, min_sup_ratio):
    """Find the frequent 1-itemsets of a collection of transactions.

    Args:
        ts: Sized collection of transactions; each transaction is an
            iterable of hashable item ids.
        min_sup_ratio: Minimum support as a fraction of ``len(ts)``.

    Returns:
        dict mapping ``frozenset({item})`` to the number of transactions
        that contain the item, restricted to items whose count is at
        least ``len(ts) * min_sup_ratio``. Keys appear in first-seen order.
    """
    # The threshold does not depend on the candidate, so compute it once.
    min_count = len(ts) * min_sup_ratio
    counts = collections.defaultdict(int)
    for t in ts:
        # Support is counted per transaction: dict.fromkeys drops repeats
        # inside one transaction while keeping first-seen order, which
        # matches how pairs are counted (on set(t)) in the 2-itemset step.
        for movie_id in dict.fromkeys(t):
            counts[frozenset({movie_id})] += 1
    return {itemset: cnt for itemset, cnt in counts.items() if cnt >= min_count}
# Keep single movies whose count reaches 8% of the number of transactions.
frequent_items = calculate_itemsets_one(transactions, 0.08)
len(frequent_items)

293

In [5]:
def calculate_itemsets_two(ts, one_itemsets, min_sup_ratio):
    """Find the frequent 2-itemsets built from already-frequent single items.

    Args:
        ts: Sized collection of transactions; each transaction is an
            iterable of hashable item ids.
        one_itemsets: dict whose keys are single-element frozensets (the
            frequent 1-itemsets); only the keys are used.
        min_sup_ratio: Minimum support as a fraction of ``len(ts)``.

    Returns:
        dict mapping ``frozenset({a, b})`` to the number of transactions
        containing both items, restricted to pairs whose count is at least
        ``len(ts) * min_sup_ratio``.
    """
    # Built once, outside the transaction loop: the set of frequent items.
    frequent = set(next(iter(k)) for k in one_itemsets)
    min_count = len(ts) * min_sup_ratio
    counts = collections.defaultdict(int)
    for t in ts:
        # Only frequent items can form a frequent pair, so drop the rest
        # first. Every member of `kept` is frequent by construction, and
        # combinations() yields nothing when fewer than two items remain.
        kept = set(t).intersection(frequent)
        for pair in itertools.combinations(kept, 2):
            counts[frozenset(pair)] += 1
    return {pair: cnt for pair, cnt in counts.items() if cnt >= min_count}
# Keep pairs of frequent movies whose co-occurrence count reaches 8% of
# the number of transactions.
frequent_pairs = calculate_itemsets_two(transactions, frequent_items, 0.08)
len(frequent_pairs)

1744

In [6]:
def calculate_association_rules(one_itemsets, two_itemsets, n_transactions):
    """Derive single-item -> single-item rules from frequent itemsets.

    Args:
        one_itemsets: dict mapping a single-element frozenset to its count.
        two_itemsets: dict mapping a frozenset pair to its count.
        n_transactions: Total number of transactions.

    Returns:
        list of ``(source, target, confidence, support)`` tuples, where
        ``support = pair_count / n_transactions`` and
        ``confidence = pair_count / source_count``. Rules are ordered by
        source itemset, then by pair, following the dicts' iteration order.
    """
    found = []
    pair_counts = two_itemsets.items()
    for antecedent, antecedent_count in one_itemsets.items():
        for pair, pair_count in pair_counts:
            if not antecedent.issubset(pair):
                continue
            # The consequent is whatever is left of the pair once the
            # antecedent item is removed.
            consequent = pair.difference(antecedent)
            support = pair_count * 1.0 / n_transactions
            confidence = pair_count * 1.0 / antecedent_count
            found.append(
                (next(iter(antecedent)), next(iter(consequent)), confidence, support)
            )
    return found
# Build (source, target, confidence, support) rules from the frequent
# single movies and frequent pairs.
rules = calculate_association_rules(frequent_items, frequent_pairs, len(transactions))
len(rules)

3488

## People also watched

In [10]:
# Minimum confidence a rule needs in order to be printed.
MIN_CONFIDENCE = 0.65

# movie_id -> title lookup used to print readable rules.
movies_map = dict(zip(movies.movie_id.tolist(), movies.title.tolist()))
# Each rule is a (source_id, target_id, confidence, support) tuple.
for source_id, target_id, confidence, _support in rules:
    if confidence > MIN_CONFIDENCE:
        print('{0:40.40s}   --->  {1:40.40s}'.format(movies_map[source_id], movies_map[target_id]))

Jumanji (1995)                             --->  Jurassic Park (1993)                    
Seven (a.k.a. Se7en) (1995)                --->  Pulp Fiction (1994)                     
Indiana Jones and the Last Crusade (1989   --->  Raiders of the Lost Ark (Indiana Jones a
Full Metal Jacket (1987)                   --->  Pulp Fiction (1994)                     
Mask, The (1994)                           --->  Jurassic Park (1993)                    
Mask, The (1994)                           --->  Forrest Gump (1994)                     
Star Wars: Episode V - The Empire Strike   --->  Star Wars: Episode IV - A New Hope (1977
Aliens (1986)                              --->  Star Wars: Episode V - The Empire Strike
Aliens (1986)                              --->  Alien (1979)                            
Lord of the Rings: The Two Towers, The (   --->  Lord of the Rings: The Fellowship of the
Léon: The Professional (a.k.a. The Profe   --->  Pulp Fiction (1994)                     
Terminator