<a href="https://colab.research.google.com/github/dawidstajszczyk/Recommender-system/blob/main/recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Wczytanie danych

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the catalogue of available movies from the tutorial bucket
movies = pd.read_csv(
    'https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv'
)
movies.head(5)



Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Load the available ratings, one row per rating
ratings = pd.read_csv(
    'https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv'
)
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Implementacja *user-item matrix*

In [None]:
from scipy.sparse import csr_matrix

def user_item_matrix(df):
  """Build a sparse user-item rating matrix from a long-format ratings table.

  Args:
    df: DataFrame with 'userId', 'movieId' and 'rating' columns.

  Returns:
    A tuple (X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper):
      X: csr_matrix with one row per distinct userId and one column per
        distinct movieId, filled with the values of the 'rating' column.
      user_mapper: dict userId -> row index of X.
      movie_mapper: dict movieId -> column index of X.
      user_inv_mapper: dict row index -> userId.
      movie_inv_mapper: dict column index -> movieId.
  """
  # Matrix dimensions: one row per user, one column per movie
  n_users = df['userId'].nunique()
  n_movies = df['movieId'].nunique()

  # Sorted distinct ids; an id's position in these arrays is its row/column in X
  user_ids = np.unique(df["userId"])
  movie_ids = np.unique(df["movieId"])

  # Forward lookups: id -> matrix index
  user_mapper = dict(zip(user_ids, range(len(user_ids))))
  movie_mapper = dict(zip(movie_ids, range(len(movie_ids))))

  # Inverse lookups: matrix index -> id
  user_inv_mapper = dict(enumerate(user_ids))
  movie_inv_mapper = dict(enumerate(movie_ids))

  # Translate the ids of every rating row into matrix coordinates
  row_coords = list(map(user_mapper.__getitem__, df['userId']))
  col_coords = list(map(movie_mapper.__getitem__, df['movieId']))

  # NOTE(review): scipy is documented to sum duplicate coordinates - confirm
  # df holds at most one rating per (userId, movieId) pair.
  X = csr_matrix(
      (df["rating"], (row_coords, col_coords)),
      shape=(n_users, n_movies),
  )

  return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper



Podgląd *user-item matrix*

In [None]:
# Build the user-item matrix X together with its id <-> index lookup tables
(X,
 user_mapper,
 movie_mapper,
 user_inv_mapper,
 movie_inv_mapper) = user_item_matrix(ratings)

# Densify only the top-left 5x5 corner of X for inspection
data = X[:5, :5].toarray()

# Wrap the corner in a DataFrame and label both axes
df = pd.DataFrame(data).rename_axis(index='user', columns='item')

# Show the labelled corner of the matrix
print(df)

item    0    1    2    3    4
user                         
0     4.0  0.0  4.0  0.0  0.0
1     0.0  0.0  0.0  0.0  0.0
2     0.0  0.0  0.0  0.0  0.0
3     0.0  0.0  0.0  0.0  0.0
4     4.0  0.0  0.0  0.0  0.0


**Collaborative Filtering**

In [None]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, movie_mapper, movie_inv_mapper, X, k, metric = 'cosine'):
  """Return the ids of the movies whose rating vectors are closest to `movie_id`.

  Args:
    movie_id: id of the movie to find neighbours for; must be a key of
      `movie_mapper`.
    movie_mapper: dict movieId -> column index of X.
    movie_inv_mapper: dict column index of X -> movieId.
    X: user-item matrix (rows are users, columns are movies).
    k: number of similar movies to return.
    metric: distance metric passed on to NearestNeighbors.

  Returns:
    A list of at most k movieIds ordered from nearest to farthest. The query
    movie itself is never included. Fewer than k ids are returned only when X
    has fewer than k other movies; an empty list is returned for k < 1.

  Raises:
    KeyError: if `movie_id` is not present in `movie_mapper`.
  """
  if k < 1:
    return []

  # Column index of the query movie in X
  movie_index = movie_mapper[movie_id]

  # Transpose so that every row is one movie's vector of ratings
  item_vectors = X.T
  movie_vector = item_vectors[movie_index]

  # Indexing a dense array yields a 1-D vector; kneighbors expects a 2-D
  # query with a single row
  if isinstance(movie_vector, np.ndarray):
    movie_vector = movie_vector.reshape(1, -1)

  # The query movie is part of the fitted data, so request one extra
  # neighbour to make room for it - but never more than there are movies
  n_neighbors = min(k + 1, item_vectors.shape[0])
  kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
  kNN.fit(item_vectors)

  neighbour = kNN.kneighbors(movie_vector, return_distance=False)

  # Map matrix indices back to movieIds. The query movie is dropped by id
  # instead of by position: a movie with an identical rating vector ties
  # with it and may be listed first.
  neighbour_indices = []
  for index in np.asarray(neighbour).ravel():
    candidate_id = movie_inv_mapper[int(index)]
    if candidate_id != movie_id:
      neighbour_indices.append(candidate_id)

  # If the query movie was not among the neighbours there is one id too many
  return neighbour_indices[:k]



Dokonanie rekomendacji

In [None]:
# Look up movies similar to movieId 1, asking for k=10 neighbours
similar_movies = find_similar_movies(
    movie_id=1,
    movie_mapper=movie_mapper,
    movie_inv_mapper=movie_inv_mapper,
    X=X,
    k=10,
)
similar_movies

[3114, 480, 780, 260, 356, 364, 1210, 648, 1265]

In [None]:
# Columns holding the id and the title of every movie
movie_ids = movies['movieId']
titles = movies['title']

# Lookup table movieId -> title, used to print readable recommendations
movie_titles = dict(zip(movie_ids, titles))

# Find the movies similar to the movie identified by movie_id
movie_id = 100
similar_movies = find_similar_movies(movie_id, movie_mapper, movie_inv_mapper, X, k=10)


print(f"Na podstawie filmu {movie_titles[movie_id]}:")
# A dedicated loop variable keeps the query `movie_id` intact for later cells
for similar_id in similar_movies:
    print(movie_titles[similar_id])

Na podstawie filmu City Hall (1996):
Othello (1995)
Cobb (1994)
Chamber, The (1996)
Eye for an Eye (1996)
Bloodsport 2 (a.k.a. Bloodsport II: The Next Kumite) (1996)
Bed of Roses (1996)
Dangerous Ground (1997)
Candidate, The (1972)
First Kid (1996)


In [None]:
# Peek at the raw pipe-separated genre strings
movies.loc[:, "genres"].head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

**Content-based filtering**

Utworzenie macierzy *movie_genres*

In [None]:
# One-hot encode the pipe-separated 'genres' strings: one 0/1 column per genre.
# str.get_dummies splits on '|' and matches whole genre names, so a genre whose
# name is contained in another genre's name cannot produce a false positive.
# The columns come back in sorted order, so the layout is identical on every run.
movie_genres = movies['genres'].str.get_dummies(sep='|')

# Set of every genre name that occurs in the catalogue
genres = set(movie_genres.columns)

# Also expose the indicator columns on `movies` itself:
# 1 if the movie belongs to the genre, 0 otherwise.
for genre in movie_genres.columns:
    movies[genre] = movie_genres[genre]

# Preview the first rows of the genre matrix
movie_genres.head()


Unnamed: 0,(no genres listed),Mystery,Fantasy,Adventure,Film-Noir,Action,Documentary,Romance,Crime,Comedy,Thriller,Drama,War,Children,Animation,Musical,IMAX,Sci-Fi,Western,Horror
0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


Utworzenie *Cosine Similarity Matrix*

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise cosine similarity between the genre vectors of all movies.
# cosine_sim[i, j] is the cosine similarity between the i-th and the j-th movie.
# With a single argument the rows of movie_genres are compared with each other.
cosine_sim = cosine_similarity(movie_genres)

print("Cosine similarity matrix")

# Wrap the numpy array in a DataFrame and display it
df = pd.DataFrame(data=cosine_sim)
df

Cosine similarity matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
0,1.000000,0.774597,0.316228,0.258199,0.447214,0.000000,0.316228,0.632456,0.000000,0.258199,...,0.447214,0.316228,0.316228,0.447214,0.0,0.670820,0.774597,0.00000,0.316228,0.447214
1,0.774597,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.816497,0.000000,0.333333,...,0.000000,0.000000,0.000000,0.000000,0.0,0.288675,0.333333,0.00000,0.000000,0.000000
2,0.316228,0.000000,1.000000,0.816497,0.707107,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.353553,0.000000,0.500000,0.000000,0.0,0.353553,0.408248,0.00000,0.000000,0.707107
3,0.258199,0.000000,0.816497,1.000000,0.577350,0.000000,0.816497,0.000000,0.000000,0.000000,...,0.288675,0.408248,0.816497,0.000000,0.0,0.288675,0.333333,0.57735,0.000000,0.577350
4,0.447214,0.000000,0.707107,0.577350,1.000000,0.000000,0.707107,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.707107,0.000000,0.0,0.500000,0.577350,0.00000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.670820,0.288675,0.353553,0.288675,0.500000,0.288675,0.353553,0.000000,0.500000,0.288675,...,0.750000,0.353553,0.353553,0.500000,0.0,1.000000,0.866025,0.00000,0.707107,0.500000
9738,0.774597,0.333333,0.408248,0.333333,0.577350,0.000000,0.408248,0.000000,0.000000,0.000000,...,0.577350,0.408248,0.408248,0.577350,0.0,0.866025,1.000000,0.00000,0.408248,0.577350
9739,0.000000,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.707107,0.707107,0.000000,0.0,0.000000,0.000000,1.00000,0.000000,0.000000
9740,0.316228,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.000000,0.707107,0.408248,...,0.707107,0.500000,0.000000,0.707107,0.0,0.707107,0.408248,0.00000,1.000000,0.000000


In [None]:
def get_content_based_recommendations(title, n_recommendations = 10):
  """Print the movies whose genre profile is most similar to `title`.

  Reads the module-level `movies` DataFrame (column 'title') and the
  `cosine_sim` matrix, whose row/column i is expected to describe the i-th
  row of `movies`.

  Args:
    title: exact title of the movie to base the recommendations on. If the
      title occurs more than once in `movies`, the last occurrence is used.
    n_recommendations: maximum number of movies to print; values below 1
      print an empty selection.

  Returns:
    None. The header line and the selected titles are printed.

  Raises:
    KeyError: if `title` is not present in movies['title'].
  """
  # Map every title to its row position (not its index label), because
  # cosine_sim and .iloc below are both addressed by position
  movie_idx = {movie_title: position for position, movie_title in enumerate(movies['title'])}

  # Row position of the selected movie
  idx = movie_idx[title]

  # (position, similarity) pairs ordered from most to least similar;
  # sorted() is stable, so ties keep their catalogue order
  sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

  # Drop the selected movie by position instead of assuming it sorts first:
  # movies with an identical genre set tie with it at the top score
  similar_movies = [position for position, _ in sim_scores if position != idx]
  similar_movies = similar_movies[:max(n_recommendations, 0)]

  # Print the recommended titles together with their row labels
  print(f"Because you watched {title}:")
  print(movies['title'].iloc[similar_movies])


In [None]:
get_content_based_recommendations('Toy Story (1995)', 10)

Because you watched Toy Story (1995):
1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object
