In [5]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# https://matplotlib.org/examples/
%matplotlib inline

"""
List of Imports
"""

from csv import reader
from math import sqrt
import numpy as np
import math
import json #json viewer: http://jsonviewer.stack.hu 
from __future__ import division
from itertools import imap
import matplotlib.pyplot as plt
import copy

"""
List of shared global objects
"""
movies = {} # Maps movie id (string, exactly as read from movies.csv) -> movie title; filled by load_movies()
prefs = {} # Maps user id (string) -> {movie title: rating as float}; filled by load_movie_ratings()

In [6]:
#%run 'Handouts/2ControlAndLooping.ipynb'

In [7]:
class Similarity:
    """
    Collection of similarity measures between two rating dictionaries.

    A rating dictionary maps an item to a numeric rating. The measure used
    by calc() is selected by number through the `formula` dispatch table:
    1 = Euclidean distance score, 2 = Pearson correlation,
    3 = Spearman rank correlation.
    """

    # Class-level defaults; __init__ overrides both on every instance.
    score = 0.0
    sim_method = 1

    def __init__(self, i):
        """
        Input: i, the key into `formula` of the measure calc() should use.
        """
        self.score = 0.0
        self.sim_method = i
        self.formula = {
            1: self.Euclidean_Distance_Score,
            2: self.Pearson_Correlation_Score,
            3: self.Spearman_Coefficient_Score,
        }

    @staticmethod
    def Normalised_Correlation(p, q):
        """
        Placeholder: not implemented yet, always returns None.
        Declared static because the signature never took `self`.
        """
        return

    def Euclidean_Distance_Score(self, p1, p2):
        """
        Returns a distance-based similarity score for two rating dictionaries.
        Formula: 1 / (1 + sqrt(sum of squared rating differences)),
        e.g. 1/(1+sqrt(pow(5-4,2)+pow(4-1,2))).
        Input: p1, p2 rating dictionaries ({item: rating})
        Output: a value in (0, 1] where 1 means identical ratings on the
                shared items, or 0 when there are no shared items.
        """
        si = self.get_shared_items(p1, p2)

        # Nothing rated by both sides: no evidence of similarity.
        if len(si) == 0:
            return 0

        sum_of_squares = sum([pow(p1[item] - p2[item], 2) for item in si])
        return 1 / (1 + sqrt(sum_of_squares))

    @staticmethod
    def _correlation(xs, ys):
        """
        Pearson product-moment correlation of two equally long number lists.
        Returns 0 for empty input or when either list has no spread.
        """
        n = float(len(xs))
        if n == 0:
            return 0

        sum1 = sum(xs)
        sum2 = sum(ys)
        sum1Sq = sum([v ** 2 for v in xs])
        sum2Sq = sum([v ** 2 for v in ys])
        pSum = sum([a * b for a, b in zip(xs, ys)])

        num = pSum - sum1 * sum2 / n
        spread = (sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n)

        # spread == 0 means one side is constant. Rounding can also push the
        # product marginally below zero, which would make sqrt() raise.
        if spread <= 0:
            return 0
        return num / sqrt(spread)

    @staticmethod
    def _average_ranks(values):
        """
        Returns the 1-based rank of every value; tied values share the mean
        of the rank positions they occupy.
        """
        order = sorted(range(len(values)), key=lambda idx: values[idx])
        ranks = [0.0] * len(values)
        start = 0
        while start < len(order):
            end = start
            while end + 1 < len(order) and values[order[end + 1]] == values[order[start]]:
                end += 1
            shared_rank = (start + end) / 2.0 + 1.0
            for pos in range(start, end + 1):
                ranks[order[pos]] = shared_rank
            start = end + 1
        return ranks

    def Pearson_Correlation_Score(self, p1, p2):
        """
        Returns the Pearson Correlation Coefficient over the items rated in
        both dictionaries.
        Input: p1, p2 rating dictionaries ({item: rating})
        Output: a value between -1 and 1 (1 = perfectly linearly related
                ratings). 0 when there are no shared items or when either
                side gave the same rating to every shared item.
        """
        items = list(self.get_shared_items(p1, p2))
        return self._correlation([p1[it] for it in items],
                                 [p2[it] for it in items])

    #Spearman coefficient
    def Spearman_Coefficient_Score(self, p1, p2):
        """
        Returns the Spearman rank correlation over the items rated in both
        dictionaries: the ratings of each side are replaced by their ranks
        (ties get the average rank) and the Pearson correlation of the two
        rank lists is returned.
        Input: p1, p2 rating dictionaries ({item: rating})
        Output: a value between -1 and 1 (1 = both sides order the shared
                items identically). 0 when there are no shared items or when
                either side gave the same rating to every shared item.
        """
        items = list(self.get_shared_items(p1, p2))
        ranks1 = self._average_ranks([p1[it] for it in items])
        ranks2 = self._average_ranks([p2[it] for it in items])
        return self._correlation(ranks1, ranks2)

    #Kendall coefficient (not implemented)

    #Tanimoto coefficient
    def Tanimoto_Coefficient(self, v1, v2):
        """
        Returns the Tanimoto distance between two equally indexed vectors,
        treating every non-zero entry as "present":
        1 - |present in both| / |present in either|.
        Input: v1, v2 indexable sequences; v2 must be at least as long as v1
        Output: a value between 0 and 1. 0 means the same positions are
                non-zero in both vectors, 1 means no position is shared.
                Two all-zero vectors share nothing and yield 1.0.
        """
        (c1, c2, shr) = (0, 0, 0)
        for i in range(len(v1)):
            if v1[i] != 0:  # in v1
                c1 += 1
            if v2[i] != 0:  # in v2
                c2 += 1
            if v1[i] != 0 and v2[i] != 0:  # in both
                shr += 1

        union = c1 + c2 - shr
        if union == 0:
            # Both vectors are entirely zero: avoid dividing by zero.
            return 1.0
        return 1.0 - float(shr) / union

    def get_shared_items(self, d1, d2):
        """
        Returns a dictionary {item: 1} holding every key of d1 that is also
        in d2.
        """
        return {item: 1 for item in d1 if item in d2}

    def get_score(self):
        """Returns the `score` attribute."""
        return self.score

    def set_sim_method(self, x):
        """Selects the measure (key of `formula`) that calc() will use."""
        self.sim_method = x

    def calc(self, prefs, p1, p2):
        """
        Scores prefs[p1] against prefs[p2] with the currently selected measure.
        Raises KeyError if sim_method is not a key of `formula`.
        """
        return self.formula[self.sim_method](prefs[p1], prefs[p2])

    def mean(self, x):
        """Arithmetic mean of x. Raises ZeroDivisionError when x is empty."""
        return sum(x) / float(len(x))

    def standard_deviation(self, x):
        """
        Sample standard deviation of x (divides by len(x) - 1).
        Raises ZeroDivisionError when x has fewer than two elements.
        """
        centre = self.mean(x)  # computed once instead of once per element
        sumv = 0.0
        for i in x:
            sumv += (i - centre) ** 2
        return math.sqrt(sumv / (len(x) - 1))

    def pearson(self, x, y):
        """
        Pearson correlation of two equally long lists, computed from
        standard scores using mean() and standard_deviation(), e.g.
        x=[prefs[p1][item] for item in si]
        y=[prefs[p2][item] for item in si]
        Output: a value between -1 and 1; 0 when fewer than two points are
                given or when either list has zero spread.
        Raises ValueError when the lists differ in length.
        """
        if len(x) != len(y):
            raise ValueError("x and y must have the same length")
        n = len(x)
        if n < 2:
            return 0

        mean_x, mean_y = self.mean(x), self.mean(y)
        sd_x, sd_y = self.standard_deviation(x), self.standard_deviation(y)
        if sd_x == 0 or sd_y == 0:
            return 0

        scorex = [(i - mean_x) / sd_x for i in x]
        scorey = [(j - mean_y) / sd_y for j in y]

        # Sum of the pairwise products of the standard scores.
        return sum(i * j for i, j in zip(scorex, scorey)) / (n - 1)

    @staticmethod
    def draw2d(data, labels, jpeg='mds2d.jpg'):
        """
        Draws each label at the 2-D position given by the matching row of
        `data` on a 2000x2000 white canvas and saves it as a JPEG.
        Declared static because the signature never took `self`.
        NOTE(review): relies on the names Image and ImageDraw (presumably
        from PIL), which are not imported anywhere in the visible notebook;
        confirm they are provided before calling.
        """
        img = Image.new('RGB', (2000, 2000), (255, 255, 255))
        draw = ImageDraw.Draw(img)
        for i in range(len(data)):
            x = (data[i][0] + 0.5) * 1000
            y = (data[i][1] + 0.5) * 1000
            draw.text((x, y), labels[i], (0, 0, 0))
        img.save(jpeg, 'JPEG')

    def to_json(self, obj, file_name): #to_json(movies, "movies")
        """
        Dumps obj as JSON into '<file_name>.json' (overwriting it).
        """
        with open(file_name + '.json', 'w') as fp:
            json.dump(obj, fp)

    def to_csv(self, dic, filename="file.csv"):
        """
        Appends one '"key",value' line per dictionary entry to filename.
        The file is opened in append mode and closed when done.
        """
        with open(filename, 'a') as out:
            for key in dic.keys():
                row = '"{}"'.format(str(key)) + "," + str(dic[key]) + "\n"
                out.write(row)

similarity = Similarity(1) # Shared module-level scorer used by the functions below; starts with method 1 (Euclidean distance score)

In [8]:
def load_movies(path='ml-latest-small/movies.csv'):
    """
    Load the list of movies into the global `movies` dictionary.

    Reads the CSV file at `path`, skips its header row and stores the first
    two columns of every remaining row as movies[movie_id] = title. Ids are
    kept as the strings read from the file.
    Input: path of the movies CSV file
    Output: None (the global `movies` is updated in place)
    """
    # The with-block guarantees the file is closed, even if a row is malformed.
    with open(path, 'r') as movies_file:
        rows = reader(movies_file)
        next(rows, None) # skip header; the default keeps an empty file from raising StopIteration
        for row in rows:
            (movie_id, title) = row[0:2]
            movies[movie_id] = title

def load_movie_ratings(path='ml-latest-small/ratings.csv'):
    """
    Load all movie ratings into the global `prefs` dictionary.

    Reads the CSV file at `path`, skips its header row and expects exactly
    four columns per row: user id, movie id, rating, timestamp. Each rating
    is stored as prefs[user_id][movie title] = float(rating), so the global
    `movies` dictionary must already hold every referenced movie id
    (call load_movies() first).
    Input: path of the ratings CSV file
    Output: None (the global `prefs` is updated in place)
    Raises: KeyError for a movie id missing from `movies`,
            ValueError for a row without exactly four columns.
    """
    # The with-block guarantees the file is closed, even if a row is malformed.
    with open(path, 'r') as ratings_file:
        rows = reader(ratings_file)
        next(rows, None) # skip header; the default keeps an empty file from raising StopIteration
        for row in rows:
            (user_id, movie_id, rating, timestamp) = row
            # Ratings are keyed by title, not by movie id.
            prefs.setdefault(user_id, {})[movies[movie_id]] = float(rating)
    
def transform(prefs): #prefs = transform(prefs)
    """
    Invert a two-level rating mapping.

    Turns {person: {item: rating}} into {item: {person: rating}} so that
    items, rather than persons, become the outer keys. The input is not
    modified; a new dictionary is returned.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, value in ratings.items():
            # Create the per-item bucket on first sight, then file the rating under the person
            if item not in flipped:
                flipped[item] = {}
            flipped[item][person] = value
    return flipped
    
#print sim.calc(prefs, "10", "500", 1)
#for i in sim.get_shared_items(prefs, "10", "500"): print prefs["10"][i], prefs["500"][i]

def plot_top_matches(prefs, person, n=5):
    """
    Draws a bar chart of similarity scores between person and other users.

    Every other key of prefs is scored against person with the global
    `similarity` object. The scores are NOT sorted: the entries at
    positions 1 to 199 of that unsorted list are plotted, one bar per user,
    with the user key as tick label. The parameter n is currently unused
    and nothing is returned.
    """
    pairs = []
    for other in prefs:
        if other == person:
            continue
        pairs.append((similarity.calc(prefs, person, other), other))

    # Keep the fixed slice and re-key it as {user: score}
    score_by_user = {}
    for (value, user) in pairs[1:200]:
        score_by_user[user] = value

    positions = range(len(score_by_user))
    plt.bar(positions, score_by_user.values(), align='center')
    plt.xticks(positions, score_by_user.keys())

# Rank the users most similar to a given user, using the currently selected similarity method
def top_similar_matches(prefs, person, n=500):
    """
    Returns the n best matches for person among the other keys of prefs.

    Each candidate is scored with the global `similarity` object (using
    whichever method is currently selected on it). The result is a list of
    (score, key) tuples ordered from highest to lowest, cut to at most n
    entries.
    """
    candidates = (other for other in prefs if other != person)
    ranked = sorted((similarity.calc(prefs, person, other), other) for other in candidates)
    ranked.reverse()  # ascending -> best first
    return ranked[:n]

#collaborative filtering methods: user-based
def get_recommendations_user_based_filtering(prefs, person): #recommend movies
    """
    Recommends items to person from the ratings of similar users.

    For every item that person has not rated (or rated 0), the predicted
    rating is the similarity-weighted average of the other users' ratings,
    rounded to one decimal. Only users whose similarity to person is
    strictly positive contribute. Similarity comes from the global
    `similarity` object.
    Output: list of (predicted rating, item) tuples, highest rating first.
    """
    weighted_totals = {}    # item -> sum of (rating * similarity)
    similarity_totals = {}  # item -> sum of the similarities that contributed
    for other in prefs:
        # Never compare the person with themselves
        if other == person:
            continue
        sim = similarity.calc(prefs, person, other)
        # Users with zero or negative similarity carry no weight
        if sim <= 0:
            continue
        for item in prefs[other]:
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue
            weighted_totals[item] = weighted_totals.get(item, 0) + prefs[other][item] * sim
            similarity_totals[item] = similarity_totals.get(item, 0) + sim

    # Normalise each total by the similarity mass behind it
    rankings = []
    for (item, total) in weighted_totals.items():
        rankings.append((round(total / similarity_totals[item], 1), item))

    # Highest predicted rating first
    rankings.sort()
    return rankings[::-1]


# Populate the global `movies` dict first: load_movie_ratings() looks up every rating's movie id in it.
load_movies() # debugging aids: print movies.items()[0] / print movies["29"]
load_movie_ratings() # optional dump: similarity.to_json(prefs, "prefs")

#report ground truth vs recommended movies
def get_report(current_person = "671", sim_method_id=1):
    """
    Scores how well the recommender predicts a user's held-out ratings.

    The user's rated titles are sorted alphabetically; the last 20%
    (rounded down) are held out as ground truth and the rest is used as the
    training set. Recommendations are computed from the training set with
    the requested similarity method, then compared with the ground truth
    using Similarity.Euclidean_Distance_Score.

    Input: current_person, key of the user in the global `prefs`;
           sim_method_id, similarity method key passed to the global
           `similarity` object (which stays set to it afterwards).
    Output: the comparison score, which is also printed. It lies between 0
            and 1: 1 means the predictions equal the held-out ratings, 0
            means no held-out title was recommended at all (HIGHER is
            better).
    """
    current_dict = prefs[current_person]
    titles = sorted(current_dict.keys())
    ground_truth_len = int(len(titles) * 0.2)

    # Split by index: a negative slice ([-0:]) would return the whole list
    # whenever the hold-out size rounds down to zero.
    split_at = len(titles) - ground_truth_len
    Ground_Truth = {k: current_dict[k] for k in titles[split_at:]}
    current_training_set = {k: current_dict[k] for k in titles[:split_at]}

    similarity.set_sim_method(sim_method_id)

    # Temporarily hide the held-out ratings from the recommender. The
    # finally-clause restores the full ratings even if scoring fails.
    prefs[current_person] = current_training_set
    try:
        movie_recommendations = {name : rating for (rating, name) in get_recommendations_user_based_filtering(prefs, current_person)}
    finally:
        prefs[current_person] = current_dict #restore

    variance = similarity.Euclidean_Distance_Score(Ground_Truth, movie_recommendations)
    print(variance)
    return variance
    
#lambda is powerful 
#for index, key in enumerate(sorted(prefs.keys(),key=lambda x: float(x))): print index+1, key

# Evaluate user-based filtering with the Euclidean score (method 1) for every user.
# Users are visited in numeric id order; keys of the result are 1-based positions in that order.
# get_report() prints each user's score as a side effect.
ecludian = {index+1 : get_report(key, 1) for index, key in enumerate(sorted(prefs.keys(),key=lambda x: float(x)))}
#print ecludian
#similarity.to_json(ecludian,"ecludian")
#similarity.to_csv(ecludian, "ecludian.csv")
# Same evaluation with the Pearson score (method 2).
pearsons = {index+1 : get_report(key, 2) for index, key in enumerate(sorted(prefs.keys(),key=lambda x: float(x)))}
# Compare the two per-user score dictionaries with the Euclidean distance score (1 = identical results).
print similarity.Euclidean_Distance_Score(ecludian, pearsons)
    
# Switch the shared scorer to Pearson, flip prefs to {title: {user: rating}}
# and look up the 10 titles rated most similarly to "Matrix, The (1999)".
similarity.set_sim_method(2)
prefs_movies = transform(prefs)
sss = top_similar_matches(prefs_movies, "Matrix, The (1999)", 10)
#print "\n".join([str(v)+": "+str(i) for i,v in sss])
#print "\nMovies 2:\n"
#print get_recommendations_user_based_filtering(prefs_movies, "Matrix, The (1999)")[0:10]

#collaborative filtering methods: item-based

0.257898202471
0.19970072921
0.334732437775
0.134175844536
0.266183295518
0.285225912354
0.198615719919
0.21167403962
0.299508526332
0.296864177107
0.242665595751
0.146135922245
0.292510788555
0.440750416467
0.0361664266316
0.38323754336
0.0960360422443
0.334450971018
0.143164307955
0.210422982252
0.231098406348
0.144221031267
0.0946003188875
0.521780381305
0.436984655564
0.122366543038
0.461639941581
0.205868898011
0.425699702833
0.0808542257497
0.357853984006
0.279577969033
0.146172486311
0.164013529822
0.177405329158
0.210837313946
0.25539679299
0.296307848579
0.229042259046
0.436984655564
0.160275038146
0.368769055369
0.204617884133
0.374485647402
0.539257872105
0.250208623152
0.31751892984
0.13594160997
0.207651681834
0.37037037037
0.288536535667
0.199900081174
0.144945339689
0.323326909363
0.360018001603
0.0862349545575
0.145088017351
0.139154148948
0.138811174919
0.335297870362
0.137001196981
0.34084232138
0.163331988508
0.43624596766
0.261809271363
0.288031588818
0.244190851761

0.174529407401
0.327672415224
0.219390169377
0.130465985287
0.36448998893
0.250208623152
0.256865024192
0.12919599458
0.134950067454
0.182968464221
0.265926699149
0.147962438419
0.168705927745
0.260963027308
0.043596466265
0.23871796552
0.286864976628
0.311178041569
0.228190316576
0.192971784916
0.114270970718
0.275581706535
0.273022231714
0.276745603615
0.16136542038
0.0726222400945
0.230938073446
0.124589184789
0.147715970725
0.13968951086
0.0803981687349
0.173162135529
0.135666283771
0.459826348545
0.130786459563
0.151699983879
0.164172103714
0.134921666436
0.305049439136
0.446235860369
0.313841563732
0.247239113607
0.132067075616
0.220825829945
0.225547446031
0.128532755016
0.125937271142
0.1557328097
0.129619160483
0.121673364299
0.323326909363
0.340542426583
0.193733480964
0.208219217561
0.271910140648
0.0902431736754
0.223648053533
0.124281101993
0.147697065588
0.0793379436204
0.222151712256
0.254066095955
0.295755070655
0.300666818577
0.129282761411
0.126721851185
0.34235586291

0.168937480625
0.203127800499
0.115705339169
0.121951219512
0.244577171229
0.192087966984
0.158706395287
0.377035355021
0.230300392666
0.359291490832
0.462557153892
0.11124252642
0.350296597991
0.156772701165
0.145141636362
0.243615155006
0.431203507875
0.559471612102
0.348982181701
0.253518431925
0.381877410879
0.463481441997
0.178919120749
0.187046549045
0.0660918283844
0.172320977952
0.430500874043
0.315661713208
0.386011354287
0.0593219206714
0.282354030298
0.389341585025
0.124346918226
0.0727257171149
0.26303432777
0.109394193854
0.292865278164
0.22257587851
0.168045832318
0.212949741115
0.0708204593485
0.355039222808
0.286041452174
0.187897995305
0.0804264399418
0.174465042811
0.292687850149
0.0891065135278
0.381427898173
0.220343854392
0.276307196372
0.138609760109
0.103883981955
0.123866792048
0.275437379661
0.165268972651
0.383694835709
0.471143138585
0.163175990592
0.176932235458
0.355039222808
0.294478841802
0.200250509
0.155243171764
0.195564871874
0.172693203955
0.26892886