In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

In [2]:
# Ratings matrix: rows are movies, columns are users; NaN marks "not rated".
movie_names = ['Avatar', 'Lord of the Rings', 'Saw', 'Star Wars', 'Titanic', 'The Hangover']
user_names = ['A', 'B', 'C', 'D', 'E']
raw_ratings = [
    [np.nan, np.nan, 7, np.nan, 10],
    [9, np.nan, 6, np.nan, np.nan],
    [1, 10, np.nan, 3, np.nan],
    [9, np.nan, np.nan, 5, 10],
    [np.nan, 4, 10, np.nan, 3],
    [5, 2, np.nan, 9, 6],
]
# Movie x user layout (as displayed below) ...
ratingsT = pd.DataFrame(raw_ratings, index=movie_names, columns=user_names, dtype=np.float64)
# ... and the user x movie layout used for the user-to-user comparisons.
ratings = ratingsT.transpose()
ratingsT

Unnamed: 0,A,B,C,D,E
Avatar,,,7.0,,10.0
Lord of the Rings,9.0,,6.0,,
Saw,1.0,10.0,,3.0,
Star Wars,9.0,,,5.0,10.0
Titanic,,4.0,10.0,,3.0
The Hangover,5.0,2.0,,9.0,6.0


In [3]:
# Jaccard similarity between all users, based only on *which* movies each
# user has rated (the rating values themselves are ignored).
# notna() is used instead of a "> 0" threshold so that a rating is counted as
# present whatever its value.
# NOTE: squareform() leaves the diagonal at 0, so each user's similarity to
# themselves is displayed as 0 rather than 1.
has_rated = ratings.notna()
jaccard_similarities = squareform(1 - pdist(has_rated, 'jaccard'))
pd.DataFrame(jaccard_similarities.round(2), index=user_names, columns=user_names)

Unnamed: 0,A,B,C,D,E
A,0.0,0.4,0.17,0.75,0.33
B,0.4,0.0,0.2,0.5,0.4
C,0.17,0.2,0.0,0.0,0.4
D,0.75,0.5,0.0,0.0,0.4
E,0.33,0.4,0.4,0.4,0.0


In [4]:
# Cosine similarity between all users; missing ratings are treated as 0.
# NOTE: squareform() leaves the diagonal at 0, so self-similarity shows as 0.
cosine_distances = pdist(ratings.fillna(0), 'cosine')
cosine_similarities = squareform(1 - cosine_distances)
pd.DataFrame(cosine_similarities.round(2), index=user_names, columns=user_names)

Unnamed: 0,A,B,C,D,E
A,0.0,0.13,0.29,0.63,0.56
B,0.13,0.0,0.27,0.41,0.14
C,0.29,0.27,0.0,0.0,0.47
D,0.63,0.41,0.0,0.0,0.62
E,0.56,0.14,0.47,0.62,0.0


In [5]:
def pearson_correlation(a, b):
    """Pearson correlation of two rating vectors that may contain NaN.

    Each vector is centred on the mean of *all* of its own non-NaN entries,
    while the sums in the numerator and denominator run only over the
    positions where both vectors have a value (co-rated entries).

    Parameters
    ----------
    a, b : array-like of float
        1-D vectors of equal length; NaN marks a missing rating.

    Returns
    -------
    float
        The correlation, or ``nan`` when it is undefined: no co-rated
        entries, or zero spread around the mean on the co-rated entries.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    co_rated = ~(np.isnan(a) | np.isnan(b))
    if not co_rated.any():
        # Nothing to compare; return a plain NaN instead of a masked value so
        # callers such as pdist can store the result without a warning.
        return np.nan

    # Deviations from each vector's overall mean, restricted to co-rated entries.
    dev_a = a[co_rated] - np.nanmean(a)
    dev_b = b[co_rated] - np.nanmean(b)

    numerator = np.dot(dev_a, dev_b)
    denominator = np.sqrt(np.dot(dev_a, dev_a)) * np.sqrt(np.dot(dev_b, dev_b))
    if denominator == 0:
        # Avoid a division by zero when one side has no spread.
        return np.nan
    return numerator / denominator

In [6]:
# Pearson correlation between all users.
# pdist evaluates the custom metric once per pair of rows (users) and
# squareform expands the condensed result into a symmetric matrix whose
# diagonal is 0.
user_correlations = pdist(ratings, pearson_correlation)
pearson_correlations = squareform(user_correlations)
user_sim = pd.DataFrame(
    pearson_correlations.round(2),
    index=user_names,
    columns=user_names,
)
user_sim

  dm[k] = metric(X[i], X[j], **kwargs)


Unnamed: 0,A,B,C,D,E
A,0.0,-0.68,-1.0,0.31,0.99
B,-0.68,0.0,-1.0,-0.96,0.62
C,-1.0,-1.0,0.0,,-0.96
D,0.31,-0.96,,0.0,-0.58
E,0.99,0.62,-0.96,-0.58,0.0


In [7]:
# Pearson correlation between all items (movies).
# Same approach as for users, applied to the movie x user matrix; the
# diagonal of the resulting matrix is 0.
item_correlations = squareform(pdist(ratingsT, pearson_correlation))
item_sim = pd.DataFrame(
    item_correlations.round(2),
    index=movie_names,
    columns=movie_names,
)
item_sim

  dm[k] = metric(X[i], X[j], **kwargs)


Unnamed: 0,Avatar,Lord of the Rings,Saw,Star Wars,Titanic,The Hangover
Avatar,0.0,1.0,,1.0,-0.97,1.0
Lord of the Rings,1.0,0.0,-1.0,1.0,-1.0,-1.0
Saw,,-1.0,0.0,0.1,-1.0,-0.68
Star Wars,1.0,1.0,0.1,0.0,-1.0,-0.75
Titanic,-0.97,-1.0,-1.0,-1.0,0.0,0.4
The Hangover,1.0,-1.0,-0.68,-0.75,0.4,0.0


In [8]:
# Predict the rating of 'Star Wars' for user 'C', based on the 2 movies
# most similar to Star Wars.
user, item = 'C', 'Star Wars'

In [9]:
# Determine the most similar items and their similarity to the target item.
# Only items the user has actually rated can contribute to a prediction, so
# the candidates are limited to those; the target item itself and pairs with
# an undefined (NaN) similarity are excluded explicitly.
rated_items = ratingsT[user].dropna().index
candidate_similarities = (
    item_sim.loc[rated_items, item]
    .drop(labels=item, errors='ignore')
    .dropna()
)
most_similar_items = candidate_similarities.sort_values(ascending=False).head(2)
most_similar_items

Avatar               1.0
Lord of the Rings    1.0
Name: Star Wars, dtype: float64

In [10]:
# The user's ratings for the similar items
ratingsT.loc[most_similar_items.index, user]

Avatar               7.0
Lord of the Rings    6.0
Name: C, dtype: float64

In [11]:
# Weighted average of the ratings, using the similarities as weights
neighbor_ratings = ratingsT.loc[most_similar_items.index, user]
np.average(neighbor_ratings, weights=most_similar_items)

6.5