## Importing modules

In [1]:
import pandas as pd
import numpy as np
from math import sqrt

#### 0 - Explorando os dados

In [3]:
filmes = pd.read_csv('movies.csv')
filmes.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
filmes = filmes[['movieId', 'title']]
filmes.columns = ['id_filme', 'titulo']
filmes

Unnamed: 0,id_filme,titulo
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [5]:
# alguns filmes com ids diferente
filmes.titulo.value_counts() 

Eros (2004)                               2
Emma (1996)                               2
Confessions of a Dangerous Mind (2002)    2
War of the Worlds (2005)                  2
Saturn 3 (1980)                           2
                                         ..
Topo, El (1970)                           1
The Great Hypnotist (2014)                1
Verdict, The (1982)                       1
Jackass Number Two (2006)                 1
Quantum of Solace (2008)                  1
Name: titulo, Length: 9737, dtype: int64

In [6]:
# numero de filmes na base
filmes.titulo.unique().shape[0]

9737

In [9]:
lista_filmes = ['Toy Story (1995)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)', 'Mortal Kombat (1995)','Pocahontas (1995)','Star Wars: Episode VI - Return of the Jedi (1983)','Brazil (1985)', 'Star Wars: Episode V - The Empire Strikes Back (1980)']

In [10]:
filmes = filmes.loc[filmes.titulo.isin(lista_filmes)]
filmes

Unnamed: 0,id_filme,titulo
0,1,Toy Story (1995)
40,44,Mortal Kombat (1995)
44,48,Pocahontas (1995)
898,1196,Star Wars: Episode V - The Empire Strikes Back...
901,1199,Brazil (1985)
911,1210,Star Wars: Episode VI - Return of the Jedi (1983)
3638,4993,"Lord of the Rings: The Fellowship of the Ring,..."


In [12]:
avaliacoes = pd.read_csv('ratings.csv')
avaliacoes.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
avaliacoes = avaliacoes[['userId', 'movieId', 'rating']]
avaliacoes.columns = ['id_usuario', 'id_filme', 'nota_filme']
avaliacoes.head()

Unnamed: 0,id_usuario,id_filme,nota_filme
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [14]:
avaliacoes = avaliacoes.merge(filmes, on='id_filme')
avaliacoes.head()

Unnamed: 0,id_usuario,id_filme,nota_filme,titulo
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [15]:
# tamanho da tabela
avaliacoes.shape

(993, 4)

In [16]:
# numero de usuario
avaliacoes.id_usuario.unique().shape[0]

421

In [17]:
# número de filmes
avaliacoes.id_filme.unique().shape[0]

7

In [19]:
# número de classificacao por usuário
avaliacoes.id_usuario.value_counts()

380    7
414    7
274    7
534    6
599    6
      ..
225    1
224    1
223    1
222    1
511    1
Name: id_usuario, Length: 421, dtype: int64

In [20]:
# nota mínima e máxima
avaliacoes.nota_filme.min(), avaliacoes.nota_filme.max()

(0.5, 5.0)

In [21]:
# redução da quantidade de usuário
avaliacoes = avaliacoes.loc[avaliacoes.id_usuario.isin([1,7,371, 608,610])]
avaliacoes

Unnamed: 0,id_usuario,id_filme,nota_filme,titulo
0,1,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
212,608,1,2.5,Toy Story (1995)
214,610,1,5.0,Toy Story (1995)
215,1,1196,5.0,Star Wars: Episode V - The Empire Strikes Back...
217,7,1196,4.0,Star Wars: Episode V - The Empire Strikes Back...
424,608,1196,4.0,Star Wars: Episode V - The Empire Strikes Back...
425,610,1196,5.0,Star Wars: Episode V - The Empire Strikes Back...
426,1,1210,5.0,Star Wars: Episode VI - Return of the Jedi (1983)
427,7,1210,4.0,Star Wars: Episode VI - Return of the Jedi (1983)


In [23]:
# guardando a informação de lista de usuários
lista_usuario = list(avaliacoes.id_usuario.unique())
lista_usuario

[1, 7, 608, 610, 371]

#### 1 - Matriz de Score (usuário x item)

In [24]:
# matriz de avaliações usuário x itens
scores = avaliacoes.pivot_table(index = ['id_usuario'], values = ['nota_filme'], columns = ['titulo'])
scores.columns = scores.columns.droplevel(0)
scores = scores.rename_axis(None, axis=1)
scores.rename(columns={'':'id_usuario'}, inplace=True)
scores

Unnamed: 0_level_0,Brazil (1985),"Lord of the Rings: The Fellowship of the Ring, The (2001)",Mortal Kombat (1995),Pocahontas (1995),Star Wars: Episode V - The Empire Strikes Back (1980),Star Wars: Episode VI - Return of the Jedi (1983),Toy Story (1995)
id_usuario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,,5.0,5.0,4.0
7,,4.5,,,4.0,4.0,4.5
371,5.0,4.5,,4.5,,,
608,,4.5,0.5,0.5,4.0,4.0,2.5
610,4.5,5.0,,,5.0,5.0,5.0


In [25]:
scores = scores.fillna(0)
scores

Unnamed: 0_level_0,Brazil (1985),"Lord of the Rings: The Fellowship of the Ring, The (2001)",Mortal Kombat (1995),Pocahontas (1995),Star Wars: Episode V - The Empire Strikes Back (1980),Star Wars: Episode VI - Return of the Jedi (1983),Toy Story (1995)
id_usuario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.0,0.0,0.0,0.0,5.0,5.0,4.0
7,0.0,4.5,0.0,0.0,4.0,4.0,4.5
371,5.0,4.5,0.0,4.5,0.0,0.0,0.0
608,0.0,4.5,0.5,0.5,4.0,4.0,2.5
610,4.5,5.0,0.0,0.0,5.0,5.0,5.0


In [26]:
# guardando a informação de lista de itens
lista_itens = list(scores.columns)
lista_itens

['Brazil (1985)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Mortal Kombat (1995)',
 'Pocahontas (1995)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Toy Story (1995)']

In [27]:
matriz_scores = scores.to_numpy()
matriz_scores

array([[0. , 0. , 0. , 0. , 5. , 5. , 4. ],
       [0. , 4.5, 0. , 0. , 4. , 4. , 4.5],
       [5. , 4.5, 0. , 4.5, 0. , 0. , 0. ],
       [0. , 4.5, 0.5, 0.5, 4. , 4. , 2.5],
       [4.5, 5. , 0. , 0. , 5. , 5. , 5. ]])

#### 2 - Matriz de similaridade (usuário x usuário)

In [28]:
def distancia(a, b):
    zip_b = zip(*b)
    resultado = []
    quadrado_diferenca = 0

    for row_a in a:
        for col_b in zip_b:
            soma = 0
            for nota_a, nota_b in zip(row_a, col_b):
                if (nota_a !=0) & (nota_b !=0):                   
                    quadrado_diferenca = pow(nota_b-nota_a, 2)
                    soma += quadrado_diferenca
            if soma > 0:
                d = 1/(1 + sqrt(soma))
            if soma == 0:
                d = 1
            resultado.append(d)
    return resultado                                                       

In [29]:
#teste com um usuário
score_usuario_1 = scores.loc[[scores.index[0]]].to_numpy()

sim_usu1_demais = distancia(score_usuario_1, matriz_scores.T)     
sim_usu1_demais

[1, 0.4, 1, 0.3266316347104093, 0.5]

In [30]:
def similaridade(scores):
    i = 0
    for usuario in list(scores.index):
        scores_usuario = scores.loc[[usuario]].to_numpy()
        
        matriz_scores = scores.to_numpy()
        
        resultado = distancia(scores_usuario, matriz_scores.T)     
        if i == 0:
            base_similaridade = pd.DataFrame(resultado, index = list(scores.index), columns = [usuario])
        else:
            temp = pd.DataFrame(resultado, index = list(scores.index), columns = [usuario])
            base_similaridade = pd.concat([base_similaridade, temp], axis=1)
        i = i + 1
    return base_similaridade

In [31]:
base_similaridade = similaridade(scores)
base_similaridade

Unnamed: 0,1,7,371,608,610
1,1.0,0.4,1.0,0.326632,0.5
7,0.4,1.0,1.0,0.333333,0.387426
371,1.0,1.0,1.0,0.2,0.585786
608,0.326632,0.333333,0.2,1.0,0.255397
610,0.5,0.387426,0.585786,0.255397,1.0


In [32]:
matriz_similaridade = base_similaridade.to_numpy()
# resolvendo o problema da diagonal principal
np.fill_diagonal(matriz_similaridade, 0)
matriz_similaridade

array([[0.        , 0.4       , 1.        , 0.32663163, 0.5       ],
       [0.4       , 0.        , 1.        , 0.33333333, 0.38742589],
       [1.        , 1.        , 0.        , 0.2       , 0.58578644],
       [0.32663163, 0.33333333, 0.2       , 0.        , 0.25539679],
       [0.5       , 0.38742589, 0.58578644, 0.25539679, 0.        ]])

#### 3 - Recomendação

In [33]:
matriz_scores

array([[0. , 0. , 0. , 0. , 5. , 5. , 4. ],
       [0. , 4.5, 0. , 0. , 4. , 4. , 4.5],
       [5. , 4.5, 0. , 4.5, 0. , 0. , 0. ],
       [0. , 4.5, 0.5, 0.5, 4. , 4. , 2.5],
       [4.5, 5. , 0. , 0. , 5. , 5. , 5. ]])

In [34]:
soma_similares = np.dot(matriz_similaridade, matriz_scores)
soma_similares

array([[ 7.25      , 10.26984236,  0.16331582,  4.66331582,  5.40652654,
         5.40652654,  5.11657909],
       [ 6.74341649,  7.93712943,  0.16666667,  4.66666667,  5.27046277,
         5.27046277,  4.37046277],
       [ 2.63603897,  8.32893219,  0.1       ,  0.1       , 12.72893219,
        12.72893219, 11.92893219],
       [ 2.14928557,  3.67698396,  0.        ,  0.9       ,  4.24347547,
         4.24347547,  4.0835105 ],
       [ 2.92893219,  5.52874103,  0.1276984 ,  2.76373737,  5.07129072,
         5.07129072,  4.38190847]])

In [35]:
matriz_scores_aux = np.copy(matriz_scores)    
matriz_scores_aux[matriz_scores > 0] = 1
matriz_scores_aux

array([[0., 0., 0., 0., 1., 1., 1.],
       [0., 1., 0., 0., 1., 1., 1.],
       [1., 1., 0., 1., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 0., 1., 1., 1.]])

In [36]:
soma_pesos = np.dot(matriz_similaridade, matriz_scores_aux)
soma_pesos

array([[1.5       , 2.22663163, 0.32663163, 1.32663163, 1.22663163,
        1.22663163, 1.22663163],
       [1.38742589, 1.72075922, 0.33333333, 1.33333333, 1.12075922,
        1.12075922, 1.12075922],
       [0.58578644, 1.78578644, 0.2       , 0.2       , 2.78578644,
        2.78578644, 2.78578644],
       [0.45539679, 0.78873013, 0.        , 0.2       , 0.91536176,
        0.91536176, 0.91536176],
       [0.58578644, 1.22860912, 0.25539679, 0.84118323, 1.14282268,
        1.14282268, 1.14282268]])

In [37]:
recomendacao = np.around(np.divide(soma_similares, soma_pesos), decimals=2)
recomendacao

  """Entry point for launching an IPython kernel.


array([[4.83, 4.61, 0.5 , 3.52, 4.41, 4.41, 4.17],
       [4.86, 4.61, 0.5 , 3.5 , 4.7 , 4.7 , 3.9 ],
       [4.5 , 4.66, 0.5 , 0.5 , 4.57, 4.57, 4.28],
       [4.72, 4.66,  nan, 4.5 , 4.64, 4.64, 4.46],
       [5.  , 4.5 , 0.5 , 3.29, 4.44, 4.44, 3.83]])

In [38]:
# selecionar somente as notas q foram preditas
recomendacao[matriz_scores_aux == 1] = 0
recomendacao

array([[4.83, 4.61, 0.5 , 3.52, 0.  , 0.  , 0.  ],
       [4.86, 0.  , 0.5 , 3.5 , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.5 , 0.  , 4.57, 4.57, 4.28],
       [4.72, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.5 , 3.29, 0.  , 0.  , 0.  ]])

In [39]:
recomendacao = pd.DataFrame(recomendacao, index = lista_usuario, columns = lista_itens)
recomendacao

Unnamed: 0,Brazil (1985),"Lord of the Rings: The Fellowship of the Ring, The (2001)",Mortal Kombat (1995),Pocahontas (1995),Star Wars: Episode V - The Empire Strikes Back (1980),Star Wars: Episode VI - Return of the Jedi (1983),Toy Story (1995)
1,4.83,4.61,0.5,3.52,0.0,0.0,0.0
7,4.86,0.0,0.5,3.5,0.0,0.0,0.0
608,0.0,0.0,0.5,0.0,4.57,4.57,4.28
610,4.72,0.0,0.0,0.0,0.0,0.0,0.0
371,0.0,0.0,0.5,3.29,0.0,0.0,0.0


In [40]:
# trazendo as notas ja classificadas
recomendacao[matriz_scores_aux == 1] = matriz_scores
recomendacao

Unnamed: 0,Brazil (1985),"Lord of the Rings: The Fellowship of the Ring, The (2001)",Mortal Kombat (1995),Pocahontas (1995),Star Wars: Episode V - The Empire Strikes Back (1980),Star Wars: Episode VI - Return of the Jedi (1983),Toy Story (1995)
1,4.83,4.61,0.5,3.52,5.0,5.0,4.0
7,4.86,4.5,0.5,3.5,4.0,4.0,4.5
608,5.0,4.5,0.5,4.5,4.57,4.57,4.28
610,4.72,4.5,0.5,0.5,4.0,4.0,2.5
371,4.5,5.0,0.5,3.29,5.0,5.0,5.0


#### Método alternativo do Passo 2 : Matriz de similaridade (para gerar similaridade entre todos os usuários da base)

In [41]:
#metodo alternativo para rodar todo o conjunto de dados
from sklearn.metrics.pairwise import pairwise_distances 
matriz_similaridade = pairwise_distances(matriz_scores, metric='cosine')
matriz_similaridade

array([[0.        , 0.16153098, 1.        , 0.19874264, 0.21425156],
       [0.16153098, 0.        , 0.70614345, 0.02909058, 0.08965213],
       [1.        , 0.70614345, 0.        , 0.6380606 , 0.49295171],
       [0.19874264, 0.02909058, 0.6380606 , 0.        , 0.10958449],
       [0.21425156, 0.08965213, 0.49295171, 0.10958449, 0.        ]])