In [38]:
def loadMovieLens():
  # Get movie titles
    movies = {}
    for line in open('u2.item'):
        (id, title) = line.split('|')[0:2]
        movies[id] = title
  # Load data
    prefs = {}
    for line in open('u2.data'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    print(type(prefs))
    return prefs

critics = loadMovieLens()

<class 'dict'>


In [36]:
from math import sqrt

def sim_distance(prefs, p1, p2):
    '''
    Returns a distance-based similarity score for person1 and person2.
    '''

    # Get the list of shared_items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1

    # If they have no ratings in common, return 0
    if len(si) == 0:
        return 0

    # Add up the squares of all the differences
    sum_of_squares = sum([pow(prefs[p1][item] - prefs[p2][item], 2)
                          for item in si])

    return 1 / (1 + sqrt(sum_of_squares))

In [37]:
def sim_pearson(prefs, p1, p2):
    '''
    Returns the Pearson correlation coefficient for p1 and p2.
    '''

    # Get the list of mutually rated items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    
    # If they are no ratings in common, return 0
    if len(si) == 0:
        return 0
    
    # Sum calculations
    n = len(si)
    
    # Sums of all the preferences
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    
    # Sums of the squares
    sum1Sq = sum([pow(prefs[p1][it], 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it], 2) for it in si])
    
    # Sum of the products
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])
    
    # Calculate r (Pearson score)
    num = pSum - sum1 * sum2 / n
    den = sqrt((sum1Sq - pow(sum1, 2) / n) * (sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    r = num / den
    return r

In [39]:
sim_pearson(critics, '196', '6')

0.10772099461715749

In [40]:
def topMatches(
    prefs,
    person,
    n=5,
    similarity=sim_pearson,
):
    '''
    Returns the best matches for person from the prefs dictionary. 
    Number of results and similarity function are optional params.
    '''

    scores = [(similarity(prefs, person, other), other) for other in prefs
              if other != person]
    scores.sort()
    scores.reverse()
    return scores[0:n]

In [41]:
topMatches(critics, "319", n=5)

[(1.000000000000004, '78'),
 (1.000000000000004, '641'),
 (1.0, '921'),
 (1.0, '92'),
 (1.0, '903')]

In [42]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    '''
    Gets recommendations for a person by using a weighted average
    of every other user's rankings
    '''

    totals = {}
    simSums = {}
    for other in prefs:
    # Don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # Ignore scores of zero or lower
        if sim <= 0:
            continue
        for item in prefs[other]:
            # Only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * Score
                totals.setdefault(item, 0)
                # The final score is calculated by multiplying each item by the
                #   similarity and adding these products together
                totals[item] += prefs[other][item] * sim
                # Sum of similarities
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # Create the normalized list
    rankings = [(total / simSums[item], item) for (item, total) in
                totals.items()]
    # Return the sorted list
    rankings.sort()
    rankings.reverse()
    return rankings

In [43]:
getRecommendations(critics, "319")

[(5.000000000000001, 'Star Kid (1997)'),
 (5.0, 'Tough and Deadly (1995)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Hearts and Minds (1996)'),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (4.999999999999999, 'Prefontaine (1997)'),
 (4.868994195797423, 'Stripes (1981)'),
 (4.854800799872227, 'Leading Man, The (1996)'),
 (4.818048370464886, 'Faust (1994)'),
 (4.76359740729648, 'Nico Icon (1995)'),
 (4.628325951461959, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.575376376972293, 'Everest (1998)'),
 (4.554323903443424, 'Ayn Rand: A Sense of Life (1997)'),
 (4.54847383764658, 'Pather Panchali (1955)'),
 (4.535729675425557, 'Crossfire (1947)'),
 (4.439481913922448, 'Wrong Trousers, The (1993)'),
 (4.429381817477084, 'Casablanca (1942)'),
 (4.407206952708314, 'Lamerica (1994)'),
 (4.400476935347094, 'Star Wars (1977)'),
 (4.393238837421909, "Schindler's List 

In [44]:
def transformPrefs(prefs):
    '''
    Transform the recommendations into a mapping where persons are described
    with interest scores for a given title e.g. {title: person} instead of
    {person: title}.
    '''

    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            # Flip item and person
            result[item][person] = prefs[person][item]
    return result

In [45]:
movies = transformPrefs(critics)

In [47]:
topMatches(movies, "Bad Boys (1995)")

[(1.0000000000000018, 'Grace of My Heart (1996)'),
 (1.0, 'Wonderland (1997)'),
 (1.0, 'Wedding Gift, The (1994)'),
 (1.0, 'Walking and Talking (1996)'),
 (1.0, 'Waiting to Exhale (1995)')]

In [48]:
topMatches(movies, "Tombstone (1993)")

[(1.000000000000004, 'Great Expectations (1998)'),
 (1.0000000000000027, 'Murder, My Sweet (1944)'),
 (1.0, 'Wonderful, Horrible Life of Leni Riefenstahl, The (1993)'),
 (1.0, 'Withnail and I (1987)'),
 (1.0, 'What Happened Was... (1994)')]