In [1]:
#Playing with coo matrix
import pandas as pd
from scipy.sparse import coo_matrix
import numpy as np


In [2]:
def normalize(x: pd.Series):
    """Mean-centre a series of ratings and scale it by its value range.

    The mean is taken over the non-zero entries only (a zero counts as
    "no rating"), but it is subtracted from every entry, zeros included.
    The centred values are then divided by ``x.max() - x.min()``.

    Parameters
    ----------
    x : pd.Series
        Ratings of one group; cast to float before any arithmetic.

    Returns
    -------
    pd.Series or float
        The normalized series, or the scalar ``0.0`` when normalization is
        undefined: at most one non-zero rating, or all values identical.
    """
    x = x.astype(float)
    # Number of non-zero ratings.
    rated_count = int(x.astype(bool).sum())
    # Guard BEFORE dividing: zero rated entries would make the mean 0/0, and a
    # zero standard deviation means max == min, i.e. a zero denominator below.
    if rated_count <= 1 or x.std() == 0:
        return 0.0
    rated_mean = x.sum() / rated_count
    return (x - rated_mean) / (x.max() - x.min())

def normalize_canonical(y: pd.Series):
    """Mean-centre the non-zero ratings of a series, leaving zeros untouched.

    The mean is computed over the non-zero entries only (a zero counts as
    "no rating") and is subtracted from those same entries; zero entries stay
    at zero.

    Parameters
    ----------
    y : pd.Series
        Ratings of one group. The input series is never modified.

    Returns
    -------
    numpy.ndarray or float
        A new float array of the same length as ``y``, or the scalar ``0.0``
        when centring is meaningless: at most one non-zero rating, or all
        values identical.
    """
    # Explicit writable copy: the array is modified in place below, and a plain
    # ``to_numpy()`` may hand back a read-only view under pandas Copy-on-Write.
    x = y.to_numpy(dtype=float, copy=True)
    # Number of non-zero ratings.
    rated_count = int(x.astype(bool).sum())
    # Guard BEFORE dividing so a group without ratings never computes 0/0.
    if rated_count <= 1 or x.std() == 0:
        return 0.0
    rated_mean = x.sum() / rated_count
    x[x != 0] -= rated_mean
    return x

In [39]:
# Small demo of pandas categoricals: `categories` holds the distinct labels and
# `codes` holds, for each element, the integer position of its label.
ser = pd.Series([
    '0-zero',
    '1-um', '1-um',
    '2-dois', '2-dois', '2-dois',
    '3-três', '3-três',
    '0-zero',
    '4-quatro',
]).astype('category')
print(ser.cat.categories)
print(ser.cat.codes)

Index(['0-zero', '1-um', '2-dois', '3-três', '4-quatro'], dtype='object')
0    0
1    1
2    1
3    2
4    2
5    2
6    3
7    3
8    0
9    4
dtype: int8


In [22]:
# Implementing our cosine similarity
def calc_adjusted_cos_sim_nozero(normalized_ratings_item_a, normalized_ratings_item_b):
    '''Cosine similarity of two items, using only the co-rated positions.

    A position is co-rated when both vectors hold a non-zero value there; a
    zero is treated as "no rating" and is left out of the numerator and of
    both norms.

    Parameters
    ----------
    normalized_ratings_item_a, normalized_ratings_item_b : array-like
        Equal-length vectors of normalized ratings (assumed 1-D), one entry
        per rater.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)`` restricted to the co-rated positions,
        or ``0.0`` when there is no co-rated position (the ratio would
        otherwise be 0/0, i.e. NaN).
    '''
    ratings_a = np.asarray(normalized_ratings_item_a, dtype=float)
    ratings_b = np.asarray(normalized_ratings_item_b, dtype=float)

    # Keep only positions where BOTH items have a (non-zero) rating.
    co_rated = (ratings_a != 0) & (ratings_b != 0)
    if not co_rated.any():
        return 0.0

    shared_a = ratings_a[co_rated]
    shared_b = ratings_b[co_rated]

    norm_a = np.sqrt(np.dot(shared_a, shared_a))
    norm_b = np.sqrt(np.dot(shared_b, shared_b))
    denominator = norm_a * norm_b
    # Only reachable if squaring underflows to zero; avoid dividing by it.
    if denominator == 0:
        return 0.0
    return float(np.dot(shared_a, shared_b) / denominator)

def cosine_sim_nonnull(coo: coo_matrix, **kargs):
    """Pairwise similarity between the rows of a sparse ratings matrix.

    Each pair of rows is scored with ``calc_adjusted_cos_sim_nozero``. The
    diagonal is set to 1.0 for every row that has at least one stored entry;
    rows without stored entries keep an all-zero row and column.

    Parameters
    ----------
    coo : coo_matrix
        Ratings matrix; each row is compared against every other row.
    **kargs
        dense_output : bool, default True
            When ``False`` the result is returned as a ``coo_matrix``
            instead of a dense ``numpy.ndarray``.

    Returns
    -------
    numpy.ndarray or coo_matrix
        Square matrix with one row and one column per row of ``coo``.
    """
    dense_output = kargs.get('dense_output', True)
    M = coo.toarray()
    n_items = M.shape[0]
    Sim = np.zeros((n_items, n_items))

    # `coo.row` has one entry per stored value, so a row index repeats once per
    # rating; visit each distinct row a single time instead.
    rated_rows = np.unique(coo.row)
    for position, item in enumerate(rated_rows):
        Sim[item, item] = 1.0
        row_item = M[item, :]
        # The measure is symmetric: score each unordered pair once and mirror it.
        for item2 in rated_rows[position + 1:]:
            pair_sim = calc_adjusted_cos_sim_nozero(row_item, M[item2, :])
            Sim[item, item2] = pair_sim
            Sim[item2, item] = pair_sim

    # `== False` kept on purpose so that `dense_output=None` still yields dense output.
    if dense_output == False:
        Sim = coo_matrix(Sim)
    return Sim


In [44]:
ratings = pd.read_csv('./simple2.csv')
# Make sure the rating column really is float.
ratings['Rating'] = ratings['Rating'].astype(float)
# Add the 'avg' column: every rating normalized within its user's ratings.
# This used to call `normalize_original`, a name that is not defined anywhere
# in this notebook (NameError on Restart & Run All). `normalize_canonical` is
# used because the recorded output of this cell is mean-centred without range
# scaling, which is what that function produces.
ratings['avg'] = ratings.groupby('UserID')['Rating'].transform(normalize_canonical)
# Make sure the normalized column really is float.
ratings['avg'] = ratings['avg'].astype(float)
# Turn UserID and ItemID into categoricals so their integer codes can be used
# as column / row indices of the sparse matrix.
ratings['UserID'] = ratings['UserID'].astype('category')
ratings['ItemID'] = ratings['ItemID'].astype('category')
# Build the sparse (COOrdinate) matrix of normalized ratings: items x users.
coo = coo_matrix((ratings['avg'],
                    (ratings['ItemID'].cat.codes.copy(),
                    ratings['UserID'].cat.codes.copy())))
pd.DataFrame(coo.toarray(),index=ratings['ItemID'].cat.categories, columns=ratings['UserID'].cat.categories)


Unnamed: 0,Elias,Helio,James,Pedro,Severino,Tarantino
Alorna,-1.333333,0.4,0.6,-0.333333,0.0,2.333333
Black Tower,-0.333333,2.4,-0.4,-0.333333,0.2,-0.666667
Cella,-1.333333,0.4,0.6,-0.333333,2.2,2.333333
Gato Negro,1.666667,-1.6,-0.4,0.666667,-0.8,-1.666667
Reservado,-0.333333,0.0,0.0,-1.333333,-0.8,-0.666667
Toro,1.666667,-1.6,-0.4,1.666667,-0.8,-1.666667


#### Overlap
![Image](./img/overlap2.png)

In [2]:
# overlap_matrix[i, j] counts the users that rated both item i and item j.
# NOTE: a stored value of exactly 0.0 becomes False in the boolean cast, so
# such an entry is not counted as a rating here.
coo_bool = coo.astype(bool)

# 0/1 indicator matrix (items x users); multiplying it by its transpose sums,
# for every pair of items, the users present in both rows.
rated_indicator = coo_bool.astype(int)
overlap_matrix = rated_indicator.dot(rated_indicator.transpose())

pd.DataFrame(overlap_matrix.toarray(),index=ratings['ItemID'].cat.categories, columns=ratings['ItemID'].cat.categories)


NameError: name 'coo' is not defined

In [58]:
# Item-item similarity matrix, kept sparse so it can be masked in later cells.
cor = cosine_sim_nonnull(coo, dense_output=False)
item_labels = ratings['ItemID'].cat.categories
pd.DataFrame(cor.toarray(), index=item_labels, columns=item_labels)

Unnamed: 0,Alorna,Black Tower,Cella,Gato Negro,Reservado,Toro
Alorna,1.0,-0.038931,1.0,-0.871667,-0.161165,-0.809993
Black Tower,-0.038931,1.0,0.017446,-0.445255,0.579496,-0.436229
Cella,1.0,0.017446,1.0,-0.823243,-0.403351,-0.763901
Gato Negro,-0.871667,-0.445255,-0.823243,1.0,0.069018,0.958406
Reservado,-0.161165,0.579496,-0.403351,0.069018,1.0,-0.198761
Toro,-0.809993,-0.436229,-0.763901,0.958406,-0.198761,1.0


In [59]:
# Keep only similarities strictly above the threshold; everything else becomes 0.
min_sim = 0.0
above_threshold = cor > min_sim
cor = cor.multiply(above_threshold)
item_labels = ratings['ItemID'].cat.categories
pd.DataFrame(cor.toarray(), index=item_labels, columns=item_labels)



Unnamed: 0,Alorna,Black Tower,Cella,Gato Negro,Reservado,Toro
Alorna,1.0,0.0,1.0,0.0,0.0,0.0
Black Tower,0.0,1.0,0.017446,0.0,0.579496,0.0
Cella,1.0,0.017446,1.0,0.0,0.0,0.0
Gato Negro,0.0,0.0,0.0,1.0,0.069018,0.958406
Reservado,0.0,0.579496,0.0,0.069018,1.0,0.0
Toro,0.0,0.0,0.0,0.958406,0.0,1.0


In [60]:
# Drop similarities backed by fewer than `min_overlap` co-rating users.
# `>=` (not `>`): a pair whose overlap equals the minimum does meet it and must
# be kept.
# NOTE(review): scipy sparse matrices appear to reject `>= 0`, so keep
# min_overlap >= 1 - confirm against the installed scipy version.
min_overlap = 2
cor = cor.multiply(overlap_matrix >= min_overlap)
pd.DataFrame(cor.toarray(),index=ratings['ItemID'].cat.categories, columns=ratings['ItemID'].cat.categories)

Unnamed: 0,Alorna,Black Tower,Cella,Gato Negro,Reservado,Toro
Alorna,1.0,0.0,1.0,0.0,0.0,0.0
Black Tower,0.0,1.0,0.017446,0.0,0.579496,0.0
Cella,1.0,0.017446,1.0,0.0,0.0,0.0
Gato Negro,0.0,0.0,0.0,1.0,0.069018,0.958406
Reservado,0.0,0.579496,0.0,0.069018,1.0,0.0
Toro,0.0,0.0,0.0,0.958406,0.0,1.0


In [48]:
# Toy 0/1 indicator matrix: entry (i, j) of M1 @ M1.T counts the columns in
# which both row i and row j hold a 1.
M1 = np.array([
    [1, 1, 1],
    [1, 0, 1],
    [0, 1, 0],
    [0, 1, 1],
])
M1 @ M1.T

array([[3, 2, 1, 2],
       [2, 2, 0, 1],
       [1, 0, 1, 1],
       [2, 1, 1, 2]])