In [1]:
critics = {'lias':{'lady':2.5 , 'snakes':3,'just':2.5,'superman':2.0,'you':4.5,'the':2.0},
          'gene':{'lady':1.0,'snakes':1.5,'just':2.5,'superman':3.0,'the':5.0,'you':3.5},
          'michael':{'lady':1.5,'snakes':2.5,'superman':3.0,'the':4.5},
          'claudia':{'snakes':2.5,'just':3.5,'the':3.0,'superman':1.5,'you':1.0},
          'mick':{'lady':0.5,'snakes':2.5,'just':2.5,'superman':3.0,'the':3.5,'you':4.0},
          'jack':{'lady':1.5,'snakes':2.0,'the':2.5,'superman':1.0,'you':4.5},
          'toby':{'snakes':5.0,'you':1.5,'superman':2.0}}

In [2]:
critics['lias']['lady']

2.5

In [3]:
critics['toby']['snakes']

5.0

In [4]:
critics['toby']

{'snakes': 5.0, 'superman': 2.0, 'you': 1.5}

In [5]:
from math import sqrt

In [6]:
#欧氏距离相似度
def sim_distance(prefs , person1 , person2):
    si = {}
    
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1
    #两个用户没有公共之处    
    if len(si) == 0:
        return 0
    
    sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item] , 2)
                         for item in prefs[person1] if item in prefs[person2]])
    
    return 1/(1+sqrt(sum_of_squares))

In [7]:
sim_distance(critics , 'lias' , 'gene')

0.2025519956555797

In [8]:
#皮尔逊相似度
def sim_pearson(prefs , p1 , p2):
    si = {}
    
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1

    n = len(si)
    
    #没有共同之处
    if n == 0:
        return 1
    
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    
    sum1Sq = sum([pow(prefs[p1][it] , 2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it] , 2) for it in si])    
    
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])
    
    num = pSum - (sum1*sum2/n)
    
    den = sqrt((sum1Sq - pow(sum1 , 2)/n)*(sum2Sq-pow(sum2,2)/n))
    
    if den == 0:
        return 0
    
    r = num/den
    
    return r

In [9]:
sim_pearson(critics , 'lias' , 'gene')

-0.05566063880844833

In [10]:
def topMaches(prefs , person , n=5 , similarity = sim_pearson):
    scores = [(similarity(prefs , person , other) , other)
             for other in prefs if other!=person]
    
    scores.sort()
    scores.reverse()
    
    return scores[:n]

In [11]:
topMaches(critics , 'toby' , n=3)

[(0.9798637100971997, 'claudia'),
 (-0.24489899998946202, 'lias'),
 (-0.36628971787912823, 'jack')]

In [12]:
#37 推荐物品

In [13]:
0.99*3


2.9699999999999998

In [14]:
def getRecommendations(prefs , person , similarity = sim_pearson):
    totals = {}
    simSums = {}
    
    for other in prefs:
        if other == person:
            continue
        
        sim = similarity(prefs , person , other)
        
        if sim<=0:
            continue #两个用户不相似
        
        for item in prefs[other]:
            if item not in prefs[person] or prefs[person][item] == 0:
                #多自己没看过的电影评价
                totals.setdefault(item , 0)
                totals[item]+=prefs[other][item] * sim
                
                simSums.setdefault(item , 0)
                simSums[item] += sim
            
    rankings = [(total/simSums[item] , item) for item , total in totals.items()]
    rankings.sort()
    rankings.reverse()
    
    return rankings
        

In [15]:
critics

{'claudia': {'just': 3.5,
  'snakes': 2.5,
  'superman': 1.5,
  'the': 3.0,
  'you': 1.0},
 'gene': {'just': 2.5,
  'lady': 1.0,
  'snakes': 1.5,
  'superman': 3.0,
  'the': 5.0,
  'you': 3.5},
 'jack': {'lady': 1.5, 'snakes': 2.0, 'superman': 1.0, 'the': 2.5, 'you': 4.5},
 'lias': {'just': 2.5,
  'lady': 2.5,
  'snakes': 3,
  'superman': 2.0,
  'the': 2.0,
  'you': 4.5},
 'michael': {'lady': 1.5, 'snakes': 2.5, 'superman': 3.0, 'the': 4.5},
 'mick': {'just': 2.5,
  'lady': 0.5,
  'snakes': 2.5,
  'superman': 3.0,
  'the': 3.5,
  'you': 4.0},
 'toby': {'snakes': 5.0, 'superman': 2.0, 'you': 1.5}}

In [16]:
getRecommendations(critics , 'toby' , similarity=sim_distance)

[(3.434305876801753, 'the'),
 (2.807765960287094, 'just'),
 (1.4133198166133716, 'lady')]

### 上述的方法是userCF
### 下面的方法是itemCF

In [17]:
def transformPrefs(prefs):
    result = {}
    
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item , {})
            
            result[item][person] = prefs[person][item]
    
    return result

In [18]:
movies = transformPrefs(critics)
topMaches(movies , 'superman')
#和Superman类型相似的电影

[(0.7600261518874173, 'the'),
 (0.14877095408944582, 'you'),
 (-0.15205469203753377, 'snakes'),
 (-0.48995593493886624, 'lady'),
 (-0.7777777777777778, 'just')]

In [19]:
getRecommendations(movies , 'just')
#返回的是用户 即用户对这个电影的喜欢程度

[(5.0, 'toby'), (2.5, 'michael'), (2.0, 'jack')]

In [20]:
import pydelicious

In [21]:
pydelicious.get_popular(tag = 'programming')

<urlopen error [Errno 11001] getaddrinfo failed>, 4 tries left.
<urlopen error [Errno 11001] getaddrinfo failed>, 3 tries left.
<urlopen error [Errno 11001] getaddrinfo failed>, 2 tries left.
<urlopen error [Errno 11001] getaddrinfo failed>, 1 tries left.


PyDeliciousException: Unable to retrieve data at 'http://feeds.delicious.com/rss/popular/programming', <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
from pydelicious import get_popular , get_userposts , get_urlposts
import time

In [None]:
def initiaizeUserDict(tag , count = 5):
    user_dict = {}
    
    for p1 in get_popular(tag=tag)[0:count]:
        for p2 in get_urlposts(p1['href']):
            user = p2['user']
            user_dict[user] = {}
    
    return user_dict

In [None]:
def fillItems(user_dict):
    all_items = {}
    
    for user in user_dict:
        for i in range(3):
            try:
                posts = get_userposts(user)
                break
            except:
                print 'failed user'+user+',retrying'
                time.sleep(4)
        
        for post in posts:
            url = post['href']
            user_dict[user][url] = 1.0
            all_items[url] = 1
    
    for ratings in user_dict.values():
        for item in all_items:
            if item not in ratings:
                ratings[item] = 0.0

In [None]:
initiaizeUserDict('programming')
delusers['tsegran'] = {}
fillItems(delusers)

### item based CF

In [25]:
def calculateSimilarItems(prefs , n=10):
    result = {}
    
    itemPrefs = transformPrefs(prefs)
    
    c= 0
    
    for item  in itemPrefs:
        c+=1
        
        if c%100 == 0:
            print '%d / %d' % (c , len(itemPrefs))
        
        scores = topMaches(itemPrefs , item , n=n )#, similarity= sim_distance)
        
        result[item] = scores
    
    return result

In [26]:
itemsim = calculateSimilarItems(critics)
itemsim
#频繁执行 使物品的相似度不会过期

{'just': [(0.13245323570650439, 'snakes'),
  (0, 'lady'),
  (-0.2, 'the'),
  (-0.7777777777777778, 'superman'),
  (-0.9649012813540153, 'you')],
 'lady': [(0.6625413488689132, 'you'),
  (0.5321811563901734, 'snakes'),
  (0, 'just'),
  (-0.48995593493886624, 'superman'),
  (-0.5949966210164384, 'the')],
 'snakes': [(0.5321811563901734, 'lady'),
  (0.13245323570650439, 'just'),
  (-0.15205469203753377, 'superman'),
  (-0.4817014485235311, 'you'),
  (-0.6130060338328885, 'the')],
 'superman': [(0.7600261518874173, 'the'),
  (0.14877095408944582, 'you'),
  (-0.15205469203753377, 'snakes'),
  (-0.48995593493886624, 'lady'),
  (-0.7777777777777778, 'just')],
 'the': [(0.7600261518874173, 'superman'),
  (-0.18623556264761196, 'you'),
  (-0.2, 'just'),
  (-0.5949966210164384, 'lady'),
  (-0.6130060338328885, 'snakes')],
 'you': [(0.6625413488689132, 'lady'),
  (0.14877095408944582, 'superman'),
  (-0.18623556264761196, 'the'),
  (-0.4817014485235311, 'snakes'),
  (-0.9649012813540153, 'just')]

In [33]:
def getRecommendedItems(prefs , itemMatch , user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    
    for (item , rating) in userRatings.items():
        for (similarity , item2) in itemMatch[item]:
            
            if item2 in userRatings:
                continue
            scores.setdefault(item2 , 0)
            scores[item2] += similarity * rating
            
            totalSim.setdefault(item2 , 0)
            totalSim[item2] += similarity
    
    ranking = [(score/totalSim[item] , item) for item , score in scores.items()]
    
    ranking.sort()
    ranking.reverse()
    
    return ranking

In [34]:
getRecommendedItems(critics , itemsim , 'toby')

[(46.520732540229794, 'the'),
 (3.7953076210200622, 'lady'),
 (1.4536105836851014, 'just')]

In [36]:
critics['toby']

{'snakes': 5.0, 'superman': 2.0, 'you': 1.5}

In [37]:
itemsim['snakes']

[(0.5321811563901734, 'lady'),
 (0.13245323570650439, 'just'),
 (-0.15205469203753377, 'superman'),
 (-0.4817014485235311, 'you'),
 (-0.6130060338328885, 'the')]

In [44]:
def loadMovieLens(path = 'ml-100k/'):
    movies = {}
    for line in open(path+'u.item'):
        (id_ , title) = line.split('|')[0:2]
        movies[id_] = title
    
    
    prefs = {}
    
    for line in open(path+'u.data'):
        (user , movieid , rating , ts) = line.split('\t')
        prefs.setdefault(user , {})
        prefs[user][movies[movieid]] = float(rating)
        
    return prefs

In [46]:
prefs = loadMovieLens()
prefs['87']

{'2001: A Space Odyssey (1968)': 5.0,
 'Ace Ventura: Pet Detective (1994)': 4.0,
 'Addams Family Values (1993)': 2.0,
 'Addicted to Love (1997)': 4.0,
 'Adventures of Priscilla, Queen of the Desert, The (1994)': 3.0,
 'Adventures of Robin Hood, The (1938)': 5.0,
 'Air Force One (1997)': 3.0,
 'Air Up There, The (1994)': 3.0,
 'Alien (1979)': 4.0,
 'American President, The (1995)': 5.0,
 'Annie Hall (1977)': 4.0,
 'Apocalypse Now (1979)': 4.0,
 'Babe (1995)': 5.0,
 'Baby-Sitters Club, The (1995)': 2.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 4.0,
 'Bananas (1971)': 5.0,
 'Barcelona (1994)': 3.0,
 'Batman & Robin (1997)': 4.0,
 'Batman (1989)': 3.0,
 'Batman Returns (1992)': 3.0,
 'Big Green, The (1995)': 3.0,
 'Big Squeeze, The (1996)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 4.0,
 'Blues Brothers, The (1980)': 5.0,
 'Boomerang (1992)': 3.0,
 'Boot, Das (1981)': 4.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Bridge on the River

In [47]:
getRecommendations(prefs , '87')[0:30]

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.800260666069043, 'Mrs. Dalloway (1997)'),
 (4.771240079753504, 'Leading Man, The (1996)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.707354190896574, 'Dangerous Beauty (1998)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747079, 'Casablanca (1942)'),
 (4.512903125553784, 'Four Days in September (1997)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.485151301801342, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.463287461

In [48]:
itemsim = calculateSimilarItems(prefs , n=50)
#50个推荐的电影

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


In [51]:
getRecommendedItems(prefs , itemsim , '87')[0:30]

[(5.0, 'U Turn (1997)'),
 (5.0, 'Robocop 3 (1993)'),
 (5.0, 'Ponette (1996)'),
 (5.0, 'Pather Panchali (1955)'),
 (5.0, 'Of Human Bondage (1934)'),
 (5.0, 'N\xe9nette et Boni (1996)'),
 (5.0, 'Nina Takes a Lover (1994)'),
 (5.0, 'New Jersey Drive (1995)'),
 (5.0, 'Neon Bible, The (1995)'),
 (5.0, 'Nelly & Monsieur Arnaud (1995)'),
 (5.0, 'Murder, My Sweet (1944)'),
 (5.0, 'Midnight Dancers (Sibak) (1994)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Manny & Lo (1996)'),
 (5.0, 'Man of the Year (1995)'),
 (5.0, 'Mad Dog Time (1996)'),
 (5.0, 'Love and Death on Long Island (1997)'),
 (5.0, 'Loch Ness (1995)'),
 (5.0, 'Little City (1998)'),
 (5.0, 'Line King: Al Hirschfeld, The (1996)'),
 (5.0, 'Life Less Ordinary, A (1997)'),
 (5.0, 'Letter From Death Row, A (1998)'),
 (5.0, 'Legal Deceit (1997)'),
 (5.0, 'Lay of the Land, The (1997)'),
 (5.0, 'Late Bloomers (1996)'),
 (5.0, 'Last Summer in the Hamptons (1995)'),
 (5.0, 'Johnny 100 Pesos (1993)'),
 (5.0, 'Jaws 3-D (198