In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

## Film features

In [2]:
# Genres used to describe a film; each one becomes a binary column.
movie_features = ['romance', 'horror', 'fantasy', 'comedy']

# Genre flags per film, listed in the same order as movie_features
# (1 = film belongs to the genre, 0 = it does not).
genre_rows = {
    'Avatar':            [1, 0, 1, 0],
    'Lord of the Rings': [0, 0, 1, 0],
    'Saw':               [0, 1, 0, 0],
    'Star Wars':         [0, 0, 1, 0],
    'Titanic':           [1, 0, 0, 0],
    'The Hangover':      [0, 0, 0, 1],
}
movie_names = list(genre_rows)

# Films as rows, genres as columns.
movies = pd.DataFrame(
    list(genre_rows.values()),
    index=movie_names,
    columns=movie_features,
)
movies

Unnamed: 0,romance,horror,fantasy,comedy
Avatar,1,0,1,0
Lord of the Rings,0,0,1,0
Saw,0,1,0,0
Star Wars,0,0,1,0
Titanic,1,0,0,0
The Hangover,0,0,0,1


## Ähnlichkeitsmatrix

In [3]:
# Hamming similarity between films: 1 minus the Hamming distance, i.e. the
# share of genre flags on which two films agree.
genre_flags_bool = movies.astype(bool).to_numpy()
hamming_distances = pairwise_distances(genre_flags_bool, metric="hamming")
hamming_similarities = 1 - hamming_distances

# Label both axes with the film titles for display.
pd.DataFrame(data=hamming_similarities, index=movie_names, columns=movie_names)

Unnamed: 0,Avatar,Lord of the Rings,Saw,Star Wars,Titanic,The Hangover
Avatar,1.0,0.75,0.25,0.75,0.75,0.25
Lord of the Rings,0.75,1.0,0.5,1.0,0.5,0.5
Saw,0.25,0.5,1.0,0.5,0.5,0.5
Star Wars,0.75,1.0,0.5,1.0,0.5,0.5
Titanic,0.75,0.5,0.5,0.5,1.0,0.5
The Hangover,0.25,0.5,0.5,0.5,0.5,1.0


In [4]:
# Jaccard similarity between films: 1 minus the Jaccard distance computed on
# the boolean genre flags.
genre_membership = movies.astype(bool).to_numpy()
jaccard_distances = pairwise_distances(genre_membership, metric="jaccard")
jaccard_similarities = 1 - jaccard_distances

# Label both axes with the film titles for display.
pd.DataFrame(data=jaccard_similarities, index=movie_names, columns=movie_names)

Unnamed: 0,Avatar,Lord of the Rings,Saw,Star Wars,Titanic,The Hangover
Avatar,1.0,0.5,0.0,0.5,0.5,0.0
Lord of the Rings,0.5,1.0,0.0,1.0,0.0,0.0
Saw,0.0,0.0,1.0,0.0,0.0,0.0
Star Wars,0.5,1.0,0.0,1.0,0.0,0.0
Titanic,0.5,0.0,0.0,0.0,1.0,0.0
The Hangover,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# Cosine similarity between every pair of films' genre vectors.
# With a single argument, cosine_similarity compares the rows of `movies`
# against themselves.
cosine_similarities = cosine_similarity(movies)

# Round only for display; the unrounded matrix stays in cosine_similarities.
pd.DataFrame(
    data=np.round(cosine_similarities, 2),
    index=movie_names,
    columns=movie_names,
)

Unnamed: 0,Avatar,Lord of the Rings,Saw,Star Wars,Titanic,The Hangover
Avatar,1.0,0.71,0.0,0.71,0.71,0.0
Lord of the Rings,0.71,1.0,0.0,1.0,0.0,0.0
Saw,0.0,0.0,1.0,0.0,0.0,0.0
Star Wars,0.71,1.0,0.0,1.0,0.0,0.0
Titanic,0.71,0.0,0.0,0.0,1.0,0.0
The Hangover,0.0,0.0,0.0,0.0,0.0,1.0


## Ratings Matrix

In [6]:
user_names = ['A', 'B', 'C', 'D', 'E']

# Known ratings per user, one entry per film in movie_names order.
# np.nan marks a film the user has not rated.
ratings_by_user = {
    'A': [np.nan, 9, 1, 9, np.nan, 5],
    'B': [np.nan, np.nan, 10, np.nan, np.nan, np.nan],
    'C': [7, np.nan, np.nan, np.nan, 10, np.nan],
    'D': [np.nan, np.nan, 3, np.nan, np.nan, 9],
    'E': [10, np.nan, np.nan, 10, np.nan, np.nan],
}

# Films as rows, users as columns.
ratings = pd.DataFrame(ratings_by_user, index=movie_names, columns=user_names)
ratings

Unnamed: 0,A,B,C,D,E
Avatar,,,7.0,,10.0
Lord of the Rings,9.0,,,,
Saw,1.0,10.0,,3.0,
Star Wars,9.0,,,,10.0
Titanic,,,10.0,,
The Hangover,5.0,,,9.0,


In [7]:
# Build one profile per user: the rating-weighted average of the genre
# vectors of all films, where unrated films (NaN) get weight 0.
feature_matrix = movies.to_numpy()
user_profiles = []
for user in ratings.columns:
    weights = ratings[user].fillna(0).to_numpy()
    if weights.sum() == 0:
        # np.average raises ZeroDivisionError when the weights sum to zero
        # (e.g. a user without any rating); use an all-zero profile instead.
        profile = np.zeros(feature_matrix.shape[1])
    else:
        profile = np.average(feature_matrix, weights=weights, axis=0)
    # NOTE: profiles are stored rounded to two decimals, so every later
    # computation that reads `users` works with the rounded values.
    user_profiles.append(profile.round(2))

# Take the labels from the frames the profiles were computed from, so row and
# column labels cannot drift from the loop order above.
users = pd.DataFrame(
    data=user_profiles,
    index=ratings.columns,
    columns=movies.columns,
)
users

Unnamed: 0,romance,horror,fantasy,comedy
A,0.0,0.04,0.75,0.21
B,0.0,1.0,0.0,0.0
C,1.0,0.0,0.41,0.0
D,0.0,0.25,0.0,0.75
E,0.5,0.0,1.0,0.0


In [8]:
# Approximate the ratings: cosine similarity between each film's genre vector
# and each user's profile, multiplied by 10 and rounded to two decimals.
film_user_similarity = cosine_similarity(movies, users)
predicted_ratings = np.round(film_user_similarity * 10, 2)

# Films as rows, users as columns, matching the layout of `ratings`.
pd.DataFrame(data=predicted_ratings, index=movie_names, columns=user_names)

Unnamed: 0,A,B,C,D,E
Avatar,6.8,0.0,9.22,0.0,9.49
Lord of the Rings,9.62,0.0,3.79,0.0,8.94
Saw,0.51,10.0,0.0,3.16,0.0
Star Wars,9.62,0.0,3.79,0.0,8.94
Titanic,0.0,0.0,9.25,0.0,4.47
The Hangover,2.69,0.0,0.0,9.49,0.0


In [9]:
# Root-mean-square error between predicted and known ratings.
# Unrated entries are NaN in `ratings`, so their squared errors are NaN and
# np.nanmean leaves them out of the average.
squared_errors = np.power(predicted_ratings - ratings.to_numpy(), 2)
mse = np.nanmean(squared_errors)
rmse = np.sqrt(mse)
rmse

1.1072201062283704