In [1]:
# import libraries and run a few checks
import re
import sys
assert sys.version_info >= (3, 5)

import sklearn
# Compare the version numerically. A plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass. Only the leading major/minor
# numbers are used, which also copes with suffixes such as "1.3.dev0".
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20)

import numpy as np
import pandas as pd
import os
from mlxtend.frequent_patterns import apriori, association_rules

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): the bare `rcParams` name is not referenced in the visible cells
# (the code below goes through `plt.rcParams` instead).
from matplotlib import rcParams
# Default font sizes for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Snapshot of the colour cycle that is active at this point, i.e. before the
# seaborn call below has run.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

import seaborn as sns
# NOTE(review): sns.set() presumably rewrites matplotlib rcParams too, which
# could override the label sizes set above and would mean `colors` does not
# reflect the 'Paired' palette - confirm the intended statement order.
sns.set(palette='Paired', style='white')

import warnings 
# NOTE(review): with no category given this silences every warning for the
# rest of the session, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

# Figures are written to an "images" folder beneath the project root.
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
# Create the folder up front; an already existing folder is not an error.
os.makedirs(name=IMAGES_PATH, exist_ok=True)

# utility function to save figures for presentation
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the IMAGES_PATH folder.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also handed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Handed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure:", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Load Dataset

In [43]:
# Raw movie metadata, read from a CSV that sits next to the notebook.
DATASET_CSV = 'TMBD Movie Dataset.csv'
dataset = pd.read_csv(DATASET_CSV)

In [3]:
# Preview the first five rows of the raw table.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,...,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,profit,popularity_level
0,0,135397,tt0369610,32.985763,150000000.0,1513529000.0,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,...,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,5562,6.5,2015,137999900.0,1392446000.0,1363529000.0,High
1,1,76341,tt1392190,28.419936,150000000.0,378436400.0,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,...,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,6185,7.1,2015,137999900.0,348161300.0,228436400.0,High
2,2,262500,tt2908446,13.112507,110000000.0,295238200.0,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,...,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015-03-18,2480,6.3,2015,101200000.0,271619000.0,185238200.0,High
3,3,140607,tt2488496,11.173104,200000000.0,2068178000.0,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,...,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015-12-15,5292,7.5,2015,183999900.0,1902723000.0,1868178000.0,High
4,4,168259,tt2820852,9.335014,190000000.0,1506249000.0,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,...,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015-04-01,2947,7.3,2015,174799900.0,1385749000.0,1316249000.0,High


### Data Preprocessing

In [4]:
def split_multivalue_column(frame, column, prefix, sep='|'):
    """Split a delimiter-separated text column into one column per value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the column to split.
    column : str
        Name of the column whose cells look like ``"a|b|c"``.
    prefix : str
        Stem for the generated column names (``prefix_1``, ``prefix_2``, ...).
    sep : str, default '|'
        Delimiter between the values inside one cell.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The split values, with slots that have no value filled with ``''``,
        and one generated column name per split column.
    """
    parts = frame[column].str.split(sep, expand=True).fillna('')
    names = [f'{prefix}_{i+1}' for i in range(parts.shape[1])]
    return parts, names


# The same split is applied to each multi-valued column, so it lives in one
# helper instead of four copy-pasted stanzas.
cast_split, cast_cols = split_multivalue_column(dataset, 'cast', 'cast')
director_split, director_cols = split_multivalue_column(dataset, 'director', 'director')
genres_split, genres_cols = split_multivalue_column(dataset, 'genres', 'genres')
companies_split, companies_cols = split_multivalue_column(
    dataset, 'production_companies', 'production_company'
)

# Place the four blocks side by side and give every column a readable name.
processed_dataset = pd.concat([cast_split, director_split, genres_split, companies_split], axis=1)
processed_dataset.columns = cast_cols + director_cols + genres_cols + companies_cols

# Invariant: widening the table must not add or drop movies.
assert len(processed_dataset) == len(dataset)

# Persist the flattened table; a later cell reloads this file.
processed_dataset.to_csv("processed_dataset2.csv", index=False)

In [6]:
# Reload the flattened table written by the preprocessing cell.
# dtype=str together with keep_default_na=False keeps every cell as the text
# that was written: with the reader's defaults, names such as "NA" or "None"
# would be parsed as missing values and silently dropped, and a column holding
# only digit-like names would be converted to numbers.
df = pd.read_csv("processed_dataset2.csv", dtype=str, keep_default_na=False)

# One transaction (tuple of items) per movie. Padding slots were written as
# empty strings and are filtered out here; the not-null check is kept as a
# safeguard in case a missing value still slips through.
movie_information = [
    tuple(cell for cell in row if pd.notnull(cell) and cell != '')
    for row in df.itertuples(index=False, name=None)
]

In [14]:
# Show the first five transactions to sanity-check the flattening step.
preview = movie_information[:5]
print(preview)

[('Chris Pratt', 'Bryce Dallas Howard', 'Irrfan Khan', "Vincent D'Onofrio", 'Nick Robinson', 'Colin Trevorrow', 'Action', 'Adventure', 'Science Fiction', 'Thriller', 'Universal Studios', 'Amblin Entertainment', 'Legendary Pictures', 'Fuji Television Network', 'Dentsu'), ('Tom Hardy', 'Charlize Theron', 'Hugh Keays-Byrne', 'Nicholas Hoult', 'Josh Helman', 'George Miller', 'Action', 'Adventure', 'Science Fiction', 'Thriller', 'Village Roadshow Pictures', 'Kennedy Miller Productions'), ('Shailene Woodley', 'Theo James', 'Kate Winslet', 'Ansel Elgort', 'Miles Teller', 'Robert Schwentke', 'Adventure', 'Science Fiction', 'Thriller', 'Summit Entertainment', 'Mandeville Films', 'Red Wagon Entertainment', 'NeoReel'), ('Harrison Ford', 'Mark Hamill', 'Carrie Fisher', 'Adam Driver', 'Daisy Ridley', 'J.J. Abrams', 'Action', 'Adventure', 'Science Fiction', 'Fantasy', 'Lucasfilm', 'Truenorth Productions', 'Bad Robot'), ('Vin Diesel', 'Paul Walker', 'Jason Statham', 'Michelle Rodriguez', 'Dwayne John

In [44]:
from efficient_apriori import apriori
from src.utils import support, confidence, show_top_rules

# NOTE(review): the import above rebinds `apriori`, which the setup cell
# already imported from mlxtend.frequent_patterns; from here on the name refers
# to the efficient_apriori implementation.

# One output line per rule: both sides of the rule plus its three metrics.
RULE_TEMPLATE = 'Rule [{} => {}] (support: {}, confidence: {}, lift: {})'

# Run Apriori over the per-movie transactions with a 2% support floor and a
# 50% confidence floor.
items, rules = apriori(movie_information, min_support=0.02, min_confidence=0.5)

for rule in rules:
    print(RULE_TEMPLATE.format(rule.lhs, rule.rhs, rule.support, rule.confidence, rule.lift))


Rule [('Adventure',) => ('Action',)] (support: 0.14374514374514374, confidence: 0.622895622895623, lift: 1.931726907630522)
Rule [('Science Fiction',) => ('Action',)] (support: 0.10955710955710955, confidence: 0.6746411483253588, lift: 2.0922003804692455)
Rule [('Thriller',) => ('Action',)] (support: 0.16006216006216006, confidence: 0.5162907268170426, lift: 1.6011232901530934)
Rule [('Twentieth Century Fox Film Corporation',) => ('Action',)] (support: 0.022533022533022532, confidence: 0.5471698113207547, lift: 1.6968856558308707)
Rule [('Animation',) => ('Adventure',)] (support: 0.042735042735042736, confidence: 0.5092592592592593, lift: 2.2067901234567904)
Rule [('Fantasy',) => ('Adventure',)] (support: 0.06682206682206682, confidence: 0.5584415584415584, lift: 2.41991341991342)
Rule [('Animation',) => ('Comedy',)] (support: 0.04895104895104895, confidence: 0.5833333333333334, lift: 1.8134057971014494)
Rule [('Family',) => ('Animation',)] (support: 0.07459207459207459, confidence: 0.

In [37]:
# Run A: high support floor (0.15), low confidence floor (0.2).
# show_top_rules comes from src.utils; presumably it lists up to k rules -
# verify its ordering there.
show_top_rules(
    movie_information,
    min_support=0.15,
    min_confidence=0.2,
    k=10,
    id_map=None,
)

=== Total Number of Rules: 2 ===
(Thriller) => (Action)  [s: 0.16, c: 0.52, l: 1.60]
(Action) => (Thriller)  [s: 0.16, c: 0.50, l: 1.60]



In [33]:
# Run B: lower support floor (0.08), same low confidence floor (0.2) as Run A.
show_top_rules(
    movie_information,
    min_support=0.08,
    min_confidence=0.2,
    k=10,
    id_map=None,
)

=== Total Number of Rules: 18 ===
(Science Fiction) => (Action)  [s: 0.11, c: 0.67, l: 2.09]
(Action) => (Science Fiction)  [s: 0.11, c: 0.34, l: 2.09]
(Thriller) => (Crime)  [s: 0.09, c: 0.31, l: 2.04]
(Crime) => (Thriller)  [s: 0.09, c: 0.63, l: 2.04]
(Adventure) => (Action)  [s: 0.14, c: 0.62, l: 1.93]
(Action) => (Adventure)  [s: 0.14, c: 0.45, l: 1.93]
(Romance) => (Comedy)  [s: 0.08, c: 0.55, l: 1.71]
(Comedy) => (Romance)  [s: 0.08, c: 0.26, l: 1.71]
(Romance) => (Drama)  [s: 0.11, c: 0.70, l: 1.69]
(Drama) => (Romance)  [s: 0.11, c: 0.26, l: 1.69]
(Thriller) => (Action)  [s: 0.16, c: 0.52, l: 1.60]



In [34]:
# Run C: high support floor (0.15) with a strict confidence floor (0.8).
# Unlike Runs A and B this call also passes reverse=True.
show_top_rules(
    movie_information,
    min_support=0.15,
    min_confidence=0.8,
    k=10,
    reverse=True,
    id_map=None,
)

=== Total Number of Rules: 0 ===



In [35]:
# Run D: lower support floor (0.08) with the strict confidence floor (0.8),
# again with reverse=True.
show_top_rules(
    movie_information,
    min_support=0.08,
    min_confidence=0.8,
    k=10,
    reverse=True,
    id_map=None,
)

=== Total Number of Rules: 0 ===

