## Similaridade entre itens

In [1]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=027b4c775c4766d5a8cb91f1756141488c8e0cfee3e3c86463804e21fd2cbbca
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# distancia
from scipy.spatial.distance import pdist, hamming, cosine
from sklearn.feature_extraction.text import CountVectorizer
import wikipedia

In [3]:
# Shared vectorizer instance; feature_extraction() calls cv.fit_transform on the page texts.
cv = CountVectorizer()

In [4]:
def cosine_similarity(x: np.ndarray, y: np.ndarray) -> float:
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of the same length (NumPy arrays or pandas Series).

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. When either vector has zero norm the
        ratio is undefined, so 0.0 is returned instead of nan plus a warning.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

In [5]:
# Titles looked up on Wikipedia by feature_extraction(); the list order is the row order of `features`.
# NOTE(review): 'The Story' may have been meant as 'Toy Story' (the title used in the
# user-profile section further down) — confirm before changing.
movies = [
    "Avatar",
    "The Matrix",
    "The Story",
    "The Dark Knight",
    "Frozen",
    "The Incredibles",
]

In [6]:
def feature_extraction(movie: list):
  """Build a term-count matrix from Portuguese Wikipedia pages.

  Parameters
  ----------
  movie : list of str
      Titles to look up. The parameter keeps its singular name so that
      keyword callers keep working.

  Returns
  -------
  np.ndarray
      Dense array with one row per title, in input order, and one column per
      vocabulary term learned by the module-level ``cv`` vectorizer.
  """
  wikipedia.set_lang('pt')
  contents = []
  # Iterate over the argument itself, not the notebook-level `movies` list,
  # so the function honours whatever titles the caller passes in.
  for title in movie:
    print('busca', title)
    page = wikipedia.page(title)
    # Log the resolved page so unexpected matches are visible in the output.
    print(page.url, page.title)
    contents.append(page.content)
  # fit_transform returns a sparse matrix; toarray() yields a plain ndarray directly.
  features = cv.fit_transform(contents).toarray()

  return features

In [7]:
# Feature extraction: fetch the pages and build the term-count matrix
features =feature_extraction(movies)

busca Avatar
https://pt.wikipedia.org/wiki/Avatar_(filme) Avatar (filme)
busca The Matrix
https://pt.wikipedia.org/wiki/Matrix Matrix
busca The Story
https://pt.wikipedia.org/wiki/The_Story The Story
busca The Dark Knight
https://pt.wikipedia.org/wiki/The_Dark_Knight The Dark Knight
busca Frozen
https://pt.wikipedia.org/wiki/Frozen_(2013) Frozen (2013)
busca The Incredibles
https://pt.wikipedia.org/wiki/The_Incredibles The Incredibles


In [8]:
# Display the term-count matrix returned by feature_extraction
features

array([[ 1,  0,  0, ...,  0,  0,  0],
       [ 2,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 3,  2,  0, ...,  0,  0,  0],
       [13,  0,  1, ...,  2,  1,  2],
       [ 0,  0,  0, ...,  2,  0,  0]])

In [9]:
# (number of pages, vocabulary size)
features.shape

(6, 7463)

In [10]:
# Compare one reference title against every other title in a loop instead of
# repeating the same print call once per hard-coded index.
reference = 2  # position of the reference title in `movies`
for other in range(len(movies)):
    if other == reference:
        continue
    print(
        f"{movies[reference]} - {movies[other]}:",
        cosine_similarity(features[reference], features[other]),
    )

The Story - Avatar: 0.23346408024439286
The Story - The Matrix: 0.2655312280971472
The Story - The Dark Knight: 0.2630322362219663
The Story - Frozen: 0.2591457200729022
The Story - The Incredibles: 0.26297817855661765


# Mapeamento de perfis de usuários

In [11]:
# Row labels for the genre table; position i here matches position i in each genre list.
movies = ["Avatar", "Matrix", "Toy Story", "Batman", "Pulp Fiction"]

# Binary genre flags, one list per genre (1 = the movie at that position has the genre).
movies_genres = {
    "Action": [1, 1, 0, 1, 0],
    "Adventure": [1, 0, 1, 1, 0],
    "Comedy": [0, 0, 1, 0, 0],
    "Drama": [0, 0, 0, 0, 1],
}

In [12]:
# User ratings, keyed by movie title
users_ratings = {
    "Avatar": 1,
    "Matrix": 4,
    "Toy Story": 3,
    "Batman": 5,
    "Pulp Fiction": 2,
}

In [13]:
# DataFrame with one row per movie (index=movies) and one column per genre key
movies_profile = pd.DataFrame(movies_genres, index=movies)
movies_profile

Unnamed: 0,Action,Adventure,Comedy,Drama
Avatar,1,1,0,0
Matrix,1,0,0,0
Toy Story,0,1,1,0
Batman,1,1,0,0
Pulp Fiction,0,0,0,1


In [14]:
# Column-wise mean of the genre flags over all movies; users_ratings is not used at this step.
user_profile = movies_profile.mean(axis=0)
user_profile

Unnamed: 0,0
Action,0.6
Adventure,0.6
Comedy,0.2
Drama,0.2


In [15]:
# Cosine similarity between each movie row (axis=1) and user_profile
similaritary_teste = movies_profile.apply(lambda movie: cosine_similarity(movie, user_profile), axis=1)
similaritary_teste

Unnamed: 0,0
Avatar,0.948683
Matrix,0.67082
Toy Story,0.632456
Batman,0.948683
Pulp Fiction,0.223607


In [16]:
# Dot product of every movie row with user_profile (no division by the norms);
# reshape(-1,1) turns the profile into a column, so the result is a column vector.
similaritary = movies_profile.values @ user_profile.values.reshape(-1,1)
similaritary

array([[1.2],
       [0.6],
       [0.8],
       [1.2],
       [0.2]])

In [17]:
# Sort the similarities: negating before argsort puts the largest values first
i_sort = np.argsort(-similaritary.ravel())
i_sort

array([0, 3, 2, 1, 4])

In [18]:
# Ordered recommendations: movie titles reordered by i_sort, as a single-column array
recomendations = np.array(movies)[i_sort].reshape(-1,1)
recomendations

array([['Avatar'],
       ['Batman'],
       ['Toy Story'],
       ['Matrix'],
       ['Pulp Fiction']], dtype='<U12')

Melhorar as indicações

In [19]:
# Build the profile from the mean of the normalized ratings.
# Ratings are looked up in the row order of movies_profile so that position i lines up
# with row i of the genre matrix, instead of relying on the dict's insertion order.
ratings_array = np.array([users_ratings[title] for title in movies_profile.index])
ratings_array

array([1, 4, 3, 5, 2])

In [20]:
# Center the ratings on their mean, so below-average ratings become negative
ratings_array_n = ratings_array - np.mean(ratings_array)
ratings_array_n

array([-2.,  1.,  0.,  2., -1.])

In [21]:
# NumPy array of the genre flags taken from movies_profile (rows = movies, columns = genres)
profile_matrix = movies_profile.values
profile_matrix

array([[1, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 1, 0],
       [1, 1, 0, 0],
       [0, 0, 0, 1]])

In [22]:
# Build the rating-weighted genre profile.
# Sum of the mean-centered ratings per genre; named rating_sums so the builtin
# sum() is not shadowed for the rest of the kernel session.
rating_sums = profile_matrix.T @ ratings_array_n.reshape(-1, 1)
# Number of movies flagged for each genre.
non_zero = np.sum(profile_matrix > 0, axis=0)
# Average per genre; a genre with no flagged movie gets 0 instead of a
# division-by-zero nan/inf.
# The name user_profle (sic) is kept so it does not overwrite the earlier user_profile.
user_profle = np.divide(
    rating_sums.ravel(),
    non_zero,
    out=np.zeros(non_zero.shape, dtype=float),
    where=non_zero > 0,
)
user_profle

array([ 0.33333333,  0.        ,  0.        , -1.        ])