In [1]:
!git clone https://github.com/ellkrauze/association_rule_mining

fatal: destination path 'association_rule_mining' already exists and is not an empty directory.


In [2]:
!pip install mlxtend --upgrade



In [3]:
!pip install memory-profiler



In [4]:
!pip install pandas



## Association Rule Mining

In [62]:
import pandas as pd
import numpy as np
import os
import csv
import argparse
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import timeit
from memory_profiler import memory_usage
import time

In [20]:
# Map for converting data to 0 and 1
# x is element of map to be converted
def encode_data(x):
    if x<=0:
        return 0
    if x>=1:
        return 1

In [21]:
# Read csv file as list of strings
# path is directory of file
# sep is csv separator (default=',')
def load_csv(path, sep=","):
    data = []
    with open(os.path.join(path), 'r') as file:
        reader = csv.reader(file, delimiter=sep)
        data = list(reader)
    # Remove empty strings from list
    for row in data:
        while('' in row):
            row.remove('')
    return data

In [22]:
# Transform the input dataset into a one-hot encoded NumPy boolean array
# data is input dataset in list-format
def transform_onehot(data):
    te = TransactionEncoder()
    data = pd.DataFrame(te.fit(data).transform(data), columns=te.columns_)
    return data

In [24]:
# Load and convert datasets to format 
# which is needed for Apriori algorithm
def load_data():
    ratings_df = pd.read_csv(os.path.join('association_rule_mining', 'datasets','ratings_small.csv'))
    movies_df = pd.read_csv(os.path.join('association_rule_mining', 'datasets','movies_metadata.csv'))

    # Clean data
    title_mask = movies_df['title'].isna()
    movies_df = movies_df.loc[title_mask == False]

    # Convert the string datatype of id column 
    # of movies dataframe to int as that in the 
    # ratings dataframe
    movies_df = movies_df.astype({'id': 'int64'})

    # Merge movies and ratings dataframes
    df = pd.merge(ratings_df, movies_df[['id', 'title']], left_on='movieId', right_on='id')

    # Id column is repeated and the timestamp is 
    # not important for this problem.
    # Drop the two
    df.drop(['timestamp', 'id'], axis=1, inplace=True)

    # Make sure there are no duplicate records 
    # for the combination of userId and title
    df = df.drop_duplicates(['userId','title'])

    # The apriori model needs data in a format 
    # such that the userId forms the index
    df_pivot = df.pivot(index='userId', columns='title', values='rating').fillna(0)

    # You need to convert the ratings to 0 or 1 
    # and also convert all float values to int
    df_pivot = df_pivot.astype('int64')
    df_pivot = df_pivot.applymap(encode_data)
    return df_pivot

In [61]:
# Get frequent itemsets from single dataset 
# with Apriori algorithm
#
# data is input dataset
# minsup is minimal support
# mode is optional way to order the resulting list of itemsets
## @profile
def find_frequent_itemset(data, minsup, mode="None"):
    # by passing minsup we mean the number of times in total number
    # of transactions the item should be present
    frequent_itemsets = apriori(data, min_support=minsup, use_colnames=True)
    if mode == "descending-support":
      return frequent_itemsets.sort_values(by=['support'], ascending=False)
    elif mode == "itemsets":
      return frequent_itemsets.sort_values(by=['itemsets'], ascending=True)
    else:
      return frequent_itemsets

In [59]:
# Get association rules from frequent itemsets
# with Apriori algorithm
#
# data is dataset
# minsup is minimal support value
# minconf is minimal confidence value 
# mode is optional way to order the resulting list of rules
## @profile
def find_association_rules(freq_itemset, minconf, mode="None"):
    result = association_rules(freq_itemset, metric="confidence", min_threshold=minconf)
    if mode == "descending-support":
        return result.sort_values(by=['support'], ascending=False)
    elif mode == "antecedents":
        result['len_ant'] = result['antecedents'].str.len()
        result['len_cons'] = result['consequents'].str.len()
        return result.sort_values(by=['lift', 'len_ant','len_cons'], ascending=[False, True, True]).drop(columns=['len_ant','len_cons'])
    else:
        return result

In [26]:
# For movies recommendation
df = load_data()

# For retail
# df = load_csv('datasets/retail.dat', sep=' ')
# df = transform_onehot(df)


  


In [57]:
minsup = 0.1
minconf = 0.95
# mode = "antecedents"

# Measure time
frequent_itemset = find_frequent_itemset(df, minsup, mode="None")
origin_time = time.time()
association_rules = find_association_rules(frequent_itemset, minconf, "antecedents")
current_spent_time = time.time() - origin_time
print(f'Execution time: {current_spent_time} seconds')

Execution time: 0.10109972953796387 seconds


In [32]:
association_rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
163,"(Cockles and Muscles, A Nightmare on Elm Stree...","(Monsoon Wedding, Sissi)",0.105812,0.23696,0.101341,0.957746,4.041811,0.076268,18.058619
57,"(Titanic, Back to the Future Part II, Rain Man)",(A Nightmare on Elm Street),0.105812,0.266766,0.101341,0.957746,3.590212,0.073114,17.353204
94,"(Lost in Translation, Sissi, Bang, Boom, Bang)",(The Conversation),0.104322,0.292101,0.101341,0.971429,3.325656,0.070869,24.776453
93,"(Lost in Translation, Bang, Boom, Bang, Rain Man)",(The Conversation),0.116244,0.292101,0.111773,0.961538,3.291797,0.077818,18.405365
10,"(Lost in Translation, Grill Point)",(The Conversation),0.114754,0.292101,0.110283,0.961039,3.290087,0.076763,18.169399
33,"(Lost in Translation, 48 Hrs., Rain Man)",(The Conversation),0.108793,0.292101,0.104322,0.958904,3.282779,0.072543,17.225534
88,"(Lost in Translation, Back to the Future Part ...",(The Conversation),0.108793,0.292101,0.104322,0.958904,3.282779,0.072543,17.225534
89,"(Terminator 3: Rise of the Machines, Lost in T...",(The Conversation),0.108793,0.292101,0.104322,0.958904,3.282779,0.072543,17.225534
87,"(Cockles and Muscles, Terminator 3: Rise of th...",(Rain Man),0.107303,0.293592,0.102832,0.958333,3.264171,0.071328,16.9538
8,"(Beetlejuice, The Million Dollar Hotel)",(License to Wed),0.107303,0.298063,0.104322,0.972222,3.261806,0.072339,25.269747


In [63]:
minsup = 0.1
minconf = 0.7
# mode = "antecedents"

df = load_data()

# To use this code, reload following functions' declarations and imports: 
# find_frequent_itemset and association_rules
frequent_itemset = find_frequent_itemset(df, minsup, mode="None")
res = association_rules(frequent_itemset, metric="confidence", min_threshold=minconf)

  """


In [65]:
res.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Back to the Future Part II),(48 Hrs.),0.210134,0.298063,0.149031,0.70922,2.379433,0.086398,2.41398
1,(Cockles and Muscles),(48 Hrs.),0.171386,0.298063,0.129657,0.756522,2.53813,0.078573,2.882957
2,(Dave Chappelle's Block Party),(48 Hrs.),0.162444,0.298063,0.119225,0.733945,2.462385,0.070807,2.638316
3,(48 Hrs.),(Monsoon Wedding),0.298063,0.406855,0.213115,0.715,1.757381,0.091846,2.081209
4,(Ocean's Eleven),(48 Hrs.),0.137109,0.298063,0.101341,0.73913,2.479783,0.060474,2.69076


In [78]:
df_MIB = res[res['antecedents'].apply(lambda x: len(x) ==1 and next(iter(x)) == 'Back to the Future Part II')]

In [79]:
df_MIB = df_MIB[df_MIB['lift'] > 2]

In [80]:
df_MIB.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Back to the Future Part II),(48 Hrs.),0.210134,0.298063,0.149031,0.70922,2.379433,0.086398,2.41398
50,(Back to the Future Part II),(Rain Man),0.210134,0.293592,0.166915,0.794326,2.705548,0.105221,3.434606
51,(Back to the Future Part II),(Sissi),0.210134,0.315946,0.165425,0.787234,2.49167,0.099034,3.215052
54,(Back to the Future Part II),(The Conversation),0.210134,0.292101,0.159463,0.758865,2.597952,0.098083,2.935697
55,(Back to the Future Part II),(The Hours),0.210134,0.301043,0.152012,0.723404,2.402991,0.088752,2.526998
