In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from collections import Counter

In [39]:
#Data Preprocessing
# Load the per-user ratings and the movie metadata (both CSVs are read from
# the current working directory).
rating = pd.read_csv('ratings.csv')
movie = pd.read_csv('movies.csv')

# Merge the two tables on movieId so every rating row carries its movie title,
# then order the rows by user.
data = pd.merge(rating, movie, on='movieId')
data = data.sort_values('userId')

# Keep only the ratings strictly above 2.
data = data[data['rating'] > 2]

# Build the transactional data set: userId -> list of titles that user rated
# above 2, in the row order of `data`.
# The two needed columns are walked directly instead of DataFrame.iterrows(),
# which materialises a full Series for every row just to read two fields.
data_dict = {}
for user_id, title in zip(data['userId'], data['title']):
    data_dict.setdefault(user_id, []).append(title)

# Keep only the users with more than 10 titles in their transaction list.
# The users to drop are collected first so the dict is never resized while it
# is being iterated.
sparse_users = [uid for uid, titles in data_dict.items() if len(titles) <= 10]
for uid in sparse_users:
    data_dict.pop(uid)

# Creating the train and test datasets.
# For every user the first int(0.8 * n) titles (by position in the user's
# list) go to the training set and the remaining titles go to the test set.
trainset = {}
testset = {}
for user_id, titles in data_dict.items():
    cutoff = int(0.8 * len(titles))
    trainset[user_id] = titles[:cutoff]
    testset[user_id] = titles[cutoff:]

In [43]:
#Association rules by apriori
#Creating the individual movie items list from train set
# movieList holds every distinct title that occurs in the training set, in
# first-seen order. dict.fromkeys de-duplicates with constant-time lookups
# while preserving insertion order, replacing the quadratic
# "if title not in list" scan.
movieList = list(dict.fromkeys(
    title for titles in trainset.values() for title in titles
))
print(len(movieList))

8201


In [53]:
#Choosing support value
# sp is the relative minimum support; s is the absolute count threshold that
# the frequent-itemset cells compare itemset counts against (count >= s).
# NOTE(review): s is scaled by the number of distinct movies in the training
# data (len(movieList)), not by the number of user transactions. Relative
# support is conventionally a fraction of the transactions (len(trainset)) --
# confirm this scaling is intended before reading sp as "share of users".
sp =  0.01
s = int(sp*len(movieList))

#Choosing confidence value
# Minimum confidence a rule must exceed to be reported by the rule-generation
# cell.
# NOTE(review): confirm whether conf is meant as a fraction (0.1 == 10%) or as
# a percentage wherever it is compared.
conf=0.1

In [45]:
#APriori
#Generating frequent 1-itemset
# c maps each title to the number of users whose training list contains it.
# A single pass over the transactions replaces the former
# "for every movie, scan every user's list" loop (movies x users x list length).
c = Counter()
for titles in trainset.values():
    # dict.fromkeys drops repeated titles inside one user's list, so a user
    # contributes at most 1 to a title's count, and keeps first-seen order.
    for title in dict.fromkeys(titles):
        c[title] += 1

# Only the header is printed; the per-title listing is intentionally left out
# because C1 contains every distinct training title.
print("C1:")


C1:


In [46]:
#frequent 1-itemset satisfying minimum support
# l maps frozenset({title}) -> user count for every title whose count reaches
# the absolute support threshold s. Keys are frozensets so that later levels
# can union them into larger itemsets.
l = Counter()
for title, n_users in c.items():
    if n_users < s:
        continue
    l[frozenset((title,))] = n_users

# Header only; the per-itemset listing is not printed for this level.
print("L1:")
print()

L1:



In [47]:
#Generating subsequent itemsets by pruning
# Level-wise Apriori: starting from the frequent 1-itemsets in l, repeatedly
# build size-`count` candidates (C_count), count how many users' training
# lists contain each one, and keep those reaching the support threshold s
# (L_count). Stops at the first empty level; pl / pos keep the last non-empty
# level and its itemset size.

# One set per user, built once. The original rebuilt set(val) for every
# (candidate, user) pair, which dominated the running time.
transactions = [set(val) for val in trainset.values()]

pl = l      # last non-empty frequent level found so far (initially L1)
pos = 1     # itemset size of the level stored in pl
for count in range (2,1000):
    # Candidate generation: union every pair of frequent (count-1)-itemsets
    # and keep the unions that have exactly `count` items. The set removes
    # candidates produced by more than one pair.
    prev = list(l)
    nc = set()
    for i in range(0,len(prev)):
        for j in range(i+1,len(prev)):
            t = prev[i].union(prev[j])
            if(len(t) == count):
                nc.add(t)
    nc = list(nc)

    # Support counting: number of users whose title set contains the candidate.
    c = Counter()
    for i in nc:
        c[i] = 0
        for basket in transactions:
            if(i.issubset(basket)):
                c[i]+=1
    print("C"+str(count)+":")
    for i in c:
        print(str(list(i))+": "+str(c[i]))
    print()

    # Pruning the candidates that do not satisfy minimum support
    l = Counter()
    for i in c:
        if(c[i] >= s):
            l[i]+=c[i]
    print("L"+str(count)+":")
    for i in l:
        print(str(list(i))+": "+str(l[i]))
    print()
    if(len(l) == 0):
        break
    pl = l
    pos = count

# Report the largest non-empty frequent level.
print("Result: ")
print("L"+str(pos)+":")
for i in pl:
    print(str(list(i))+": "+str(pl[i]))
print()

C2:
['Green Mile, The (1999)', 'Clockwork Orange, A (1971)']: 36
['Beautiful Mind, A (2001)', 'Catch Me If You Can (2002)']: 47
['Die Hard (1988)', 'Jurassic Park (1993)']: 56
['Godfather, The (1972)', 'American History X (1998)']: 49
['Pirates of the Caribbean: The Curse of the Black Pearl (2003)', 'Willy Wonka & the Chocolate Factory (1971)']: 31
['Ghost (1990)', 'Truman Show, The (1998)']: 18
['Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Truman Show, The (1998)']: 48
["Ocean's Eleven (2001)", 'Seven (a.k.a. Se7en) (1995)']: 41
['Green Mile, The (1999)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']: 39
['Apollo 13 (1995)', 'Fight Club (1999)']: 37
['Léon: The Professional (a.k.a. The Professional) (Léon) (1994)', 'X-Men (2000)']: 28
['Dances with Wolves (1990)', 'American History X (1998)']: 29
['Indiana Jones and the Last Crusade (1989)', 'Princess Bride, The (1987)']: 53
['Men in Black (a.k.a. MIB) (1997)', 'Titanic (1997)']: 45
['Alien (1979)', 'GoldenEye (1995)']

In [56]:
#Generating association rules
from itertools import combinations

# One set per user, built once instead of once per candidate rule.
transactions = [set(titles) for titles in trainset.values()]


def support_count(itemset, baskets):
    """Return how many of the given title sets contain every title in itemset."""
    return sum(1 for basket in baskets if itemset.issubset(basket))


# For every frequent itemset in pl, split it into two non-empty disjoint parts
# (a, b) and report the rules a -> b and b -> a whose confidence
#     confidence(x -> y) = support(x | y) / support(x)
# exceeds conf. conf is treated as a fraction (0.1 == 10%); the confidence is
# only converted to a percentage for display. Previously the percentage value
# was compared against conf directly, making the effective threshold 0.1%.
for itemset in pl:
    sab = support_count(itemset, transactions)
    seen_splits = set()
    # Enumerate the larger side of each split, biggest first, so the
    # (k-1)-subsets come first as before. Going no lower than half the itemset
    # size covers every split exactly once; seen_splits removes the mirror
    # image that appears when both sides have equal size (e.g. 2-itemsets,
    # which used to print every rule twice). A 1-itemset yields no rules.
    for size in range(len(itemset) - 1, 0, -1):
        if 2 * size < len(itemset):
            break
        for subset in combinations(itemset, size):
            a = frozenset(subset)
            b = itemset - a
            split = frozenset((a, b))
            if split in seen_splits:
                continue
            seen_splits.add(split)
            for lhs, rhs in ((a, b), (b, a)):
                lhs_count = support_count(lhs, transactions)
                if lhs_count == 0:
                    # Antecedent never occurs: confidence is undefined.
                    continue
                confidence = sab / lhs_count
                if confidence > conf:
                    print(str(list(lhs))+" -> "+str(list(rhs))+" = "+str(sab/lhs_count*100)+"%")
                    print()
    
   

['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)'] -> ['Matrix, The (1999)'] = 65.35433070866141%

['Matrix, The (1999)'] -> ['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)'] = 40.29126213592233%

['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Matrix, The (1999)'] -> ['Star Wars: Episode IV - A New Hope (1977)'] = 79.8076923076923%

['Star Wars: Episode IV - A New Hope (1977)'] -> ['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Matrix, The (1999)'] = 40.88669950738916%

['Star Wars: Episode IV - A New Hope (1977)', 'Matrix, The (1999)'] -> ['Star Wars: Episode V - The Empire Strikes Back (1980)'] = 73.45132743362832%

['Star Wars: Episode V - The Empire Strikes Back (1980)'] -> ['Star Wars: Episode IV - A New Hope (1977)', 'Matrix, The (1999)'] = 50.0%

['Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)'] -> ['Forrest Gump (1994)'] = 66.41221374045801%
