In [4]:
import os
data_folder = os.path.join(os.path.expanduser("~"), "Data", "ml-100k")
ratings_filename = os.path.join(data_folder, "ratings.csv")


In [5]:
import pandas as pd
all_ratings = pd.read_csv(ratings_filename, header=0, names = ["UserID", "MovieID", "Rating", "Datetime"])
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5]


Unnamed: 0,UserID,MovieID,Rating,Datetime
0,1,31,2.5,2009-12-14 02:52:24
1,1,1029,3.0,2009-12-14 02:52:59
2,1,1061,3.0,2009-12-14 02:53:02
3,1,1129,2.0,2009-12-14 02:53:05
4,1,1172,4.0,2009-12-14 02:53:25


Реализация Априори:

In [6]:
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[10:15]


Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,1,1371,2.5,2009-12-14 02:52:15,False
11,1,1405,1.0,2009-12-14 02:53:23,False
12,1,1953,4.0,2009-12-14 02:53:11,True
13,1,2105,4.0,2009-12-14 02:52:19,True
14,1,2150,3.0,2009-12-14 02:53:14,False


In [7]:
all_ratings[all_ratings["UserID"] == 1][:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,1,31,2.5,2009-12-14 02:52:24,False
1,1,1029,3.0,2009-12-14 02:52:59,False
2,1,1061,3.0,2009-12-14 02:53:02,False
3,1,1129,2.0,2009-12-14 02:53:05,False
4,1,1172,4.0,2009-12-14 02:53:25,True


In [8]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))] 

In [9]:
# We start by creating a dataset of each user's favourable reviews
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
4,1,1172,4.0,2009-12-14 02:53:25,True
8,1,1339,3.5,2009-12-14 02:52:05,True
12,1,1953,4.0,2009-12-14 02:53:11,True
13,1,2105,4.0,2009-12-14 02:52:19,True
20,2,10,4.0,1996-06-21 11:11:33,True


In [10]:
# We are only interested in the reviewers who have more than one review
#favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])
#len(favorable_reviews_by_users)
favorable_reviews_by_users={}
for k,v in favorable_ratings.groupby("UserID")["MovieID"]:
    favorable_reviews_by_users[k]=frozenset(v.values)


In [11]:
# Find out how many movies have favourable ratings
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie.sort_values(by="Favorable",ascending=False)[:5]


Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
296,88.0
318,84.0
356,84.0
593,72.0
527,70.0


In [12]:
from collections import defaultdict
# Мы реализуем вторые и третьи шаги вместе для повышения эффективности , создавая функцию которая принимет свежеобнаруженные  ЧН
# создает супернаборы, и далее проверяет являются ли они ЧН. Вначале, мы задаем функцию и перечсляемый словарь 

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int) 
    
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [13]:
import sys
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50 # define minimum support level, try to var this , to get the most relevans results

#Для реализации первого шага алгоритма, мы создаем набор с каждым фльмом в отдельности и проверяем является  ли 
# набор ЧН . Мы используем frozenset, так как это позволяет нам выполнять набор операций позже, и они могут также использоваться 
# как  ключи в нашем словаре (обычные множества так не могут ).


frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

There are 21 movies with more than 50 favorable reviews
I found 157 frequent itemsets of length 2
I found 590 frequent itemsets of length 3
I found 1250 frequent itemsets of length 4
I found 1596 frequent itemsets of length 5
I found 1279 frequent itemsets of length 6
I found 650 frequent itemsets of length 7
I found 199 frequent itemsets of length 8
I found 32 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


In [14]:
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 5755 frequent itemsets


In [15]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

There are 29188 candidate rules


In [16]:
print(candidate_rules[:5])

[(frozenset({1196}), 2858), (frozenset({2858}), 1196), (frozenset({2571}), 4993), (frozenset({4993}), 2571), (frozenset({1270}), 1210)]


In [17]:
# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

In [18]:
min_confidence = 0.9

In [19]:
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

9245


In [20]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

In [21]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends frozenset({2858, 1196, 1198, 593, 1270, 1210}) they will also recommend 260
 - Confidence: 1.000

Rule #2
Rule: If a person recommends frozenset({318, 1210, 260, 589, 1198}) they will also recommend 1196
 - Confidence: 1.000

Rule #3
Rule: If a person recommends frozenset({4993, 356, 260, 2571, 527, 1270}) they will also recommend 318
 - Confidence: 1.000

Rule #4
Rule: If a person recommends frozenset({260, 2571, 356, 589}) they will also recommend 1196
 - Confidence: 1.000

Rule #5
Rule: If a person recommends frozenset({260, 296, 2858, 2571, 1198, 2959}) they will also recommend 1196
 - Confidence: 1.000



In [22]:
# Even better, we can get the movie titles themselves from the dataset
movie_name_filename = os.path.join(data_folder, "movies.csv")
movie_name_data = pd.read_csv(movie_name_filename, header=0, encoding = "mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Genres"]
movie_name_data[:5]

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [24]:
get_movie_name(4)

'Waiting to Exhale (1995)'

In [25]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends American Beauty (1999), Star Wars: Episode V - The Empire Strikes Back (1980), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Silence of the Lambs, The (1991), Back to the Future (1985), Star Wars: Episode VI - Return of the Jedi (1983) they will also recommend Star Wars: Episode IV - A New Hope (1977)
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Shawshank Redemption, The (1994), Star Wars: Episode VI - Return of the Jedi (1983), Star Wars: Episode IV - A New Hope (1977), Terminator 2: Judgment Day (1991), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Lord of the Rings: The Fellowship of the Ring, The (2001), Forrest Gump (1994), Star Wars: Episode IV - A New Hope (1977), Matrix, The (1999), Schindler's List (1993), Back to the Future 

In [26]:
# Evaluation using test data
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
#test_not_favourable = test_dataset[~test_dataset["Favourable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("UserID")["MovieID"])
#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby("UserID")["MovieID"])
#test_users = test_dataset["UserID"].unique()

In [27]:
test_dataset[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
27425,200,1,3.0,2015-07-26 17:45:19,False
27426,200,2,3.5,2016-03-11 18:31:48,True
27427,200,32,4.0,2015-07-26 18:16:24,True
27428,200,110,3.5,2015-07-27 17:46:39,True
27429,200,145,4.5,2015-07-26 17:52:34,True


In [28]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [29]:
test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
print(len(test_confidence))

9245


In [30]:
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_test_confidence[:5])

[((frozenset({1210, 50, 2571, 260, 1270}), 1196), 1.0), ((frozenset({296, 593, 1198, 589, 1270}), 260), 1.0), ((frozenset({356, 2571, 589, 1198, 1270, 1210, 318}), 260), 1.0), ((frozenset({296, 2571, 1198, 2959, 1270, 1210}), 1196), 1.0), ((frozenset({4993, 1210, 2571, 2858, 1198}), 1196), 1.0)]


In [31]:
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")

Rule #1
Rule: If a person recommends American Beauty (1999), Star Wars: Episode V - The Empire Strikes Back (1980), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), Silence of the Lambs, The (1991), Back to the Future (1985), Star Wars: Episode VI - Return of the Jedi (1983) they will also recommend Star Wars: Episode IV - A New Hope (1977)
 - Train Confidence: 1.000
 - Test Confidence: 1.000

Rule #2
Rule: If a person recommends Shawshank Redemption, The (1994), Star Wars: Episode VI - Return of the Jedi (1983), Star Wars: Episode IV - A New Hope (1977), Terminator 2: Judgment Day (1991), Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Train Confidence: 1.000
 - Test Confidence: 0.968

Rule #3
Rule: If a person recommends Lord of the Rings: The Fellowship of the Ring, The (2001), Forrest Gump (1994), Star Wars: Episode IV - A New Hope (1977), 