In [1]:
import numpy as np
from sklearn.datasets import load_svmlight_file
import pandas as pd

In [3]:
# Load the NumPy array stored at this path. The file name suggests zero-shot
# representations for the AMAZON_FASHION_5 data — TODO confirm.
rep = np.load("ZeroShot/.data/AMAZON_FASHION_5_zeroshot.npy")

In [4]:
# Show the dimensions of `rep`.
rep.shape

(3176, 768)

In [5]:
# Read the line-delimited JSON file (`lines=True`: one JSON object per line)
# into a DataFrame.
dataset = pd.read_json("preprocess_data/.data/AMAZON_FASHION_5.json", lines=True)

In [6]:
# Show the (rows, columns) dimensions of `dataset`.
dataset.shape

(3176, 12)

In [26]:
# Toy mapping: ten integer keys, each pointing at its own fresh [1, 2, 3] list.
reviews = dict((key, [1, 2, 3]) for key in range(10))
# Display the values as a list of lists.
list(reviews.values())

[[1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3],
 [1, 2, 3]]

In [15]:
# Show the dimensions of `rep`.
# NOTE(review): the output recorded for this cell, (5269, 768), differs from the
# (3176, 768) recorded earlier in the notebook, so `rep` was presumably
# reassigned by a cell that is not shown here — confirm by re-running in order.
rep.shape

(5269, 768)

In [27]:
list_of_lists = [[1, 2, 3], [4, 5, 6]]

# `flat_list` has to be defined in this cell, otherwise the print below raises
# NameError on a fresh kernel. Flatten exactly one level of nesting.
flat_list = [item for sublist in list_of_lists for item in sublist]
print(flat_list)

[1, 2, 3, 4, 5, 6]


In [6]:
# Load the NumPy array stored at this path. The file name suggests per-item
# BERT document data for the All_Beauty training split — TODO confirm.
teste = np.load("dataset/.data/All_Beauty_data_word2vec/train/itemBertDoc.npy")

In [7]:
# Show the dimensions of `teste`.
teste.shape

(86, 37685)

In [38]:
import logging
from typing import List

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity


def intra_list_dissimilarity(
    predicted: List[list], feature_df: pd.DataFrame, k: int
) -> float:
    """
    Computes the average intra-list dissimilarity of all recommendations.
    This metric can be used to measure diversity of the list of recommended items.

    For every recommendation list, the dissimilarity (1 - cosine similarity) is
    computed for each pair of its top-k items that have features, and averaged.
    The per-list averages are then averaged over all lists that had at least
    one such pair.

    Args:
        predicted : a list of lists
            Ordered predictions
            Example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
        feature_df: dataframe
            A dataframe with one hot encoded or latent features.
            The dataframe should be indexed by the id used in the recommendations.
            Missing feature values are treated as 0.
        k : integer
            The number of items to be considered at the ranking
    Returns:
        The average intra-list dissimilarity for recommendations.
        NaN when no list contains at least two items with features.
    Raises:
        AssertionError: if k is not positive, or if `predicted` or its first
            list is empty.
    """
    # asserts
    assert k > 0, f"Value k={k} is not acceptable."
    assert len(predicted) > 0 and len(predicted[0]) > 0, "There is not any prediction."

    # Only the first k items of each list take part in the metric, so the
    # similarity matrix is restricted to those items.
    top_k = [list(pred[:k]) for pred in predicted]
    recommended = {item for pred in top_k for item in pred}

    # Features of the recommended items that are present in feature_df
    # (missing values count as 0).
    recs_content = feature_df.loc[feature_df.index.isin(recommended)].fillna(0)

    # Map each item id to its row position in the feature matrix.
    items_map = {item: pos for pos, item in enumerate(recs_content.index)}

    without_features = recommended.difference(items_map)
    if without_features:
        logging.warning(f"The podcasts {without_features} do not have any categorical feature.")

    # Nothing to compare: avoid handing an empty matrix to cosine_similarity.
    if not items_map:
        return np.float64(np.nan)

    # Keep the features as floats: an integer cast would truncate latent
    # (non one-hot) features towards zero and distort the similarities.
    features = sp.csr_matrix(np.asarray(recs_content.values, dtype=float))
    # Pairwise similarity scores between all recommended items.
    similarity = cosine_similarity(X=features, dense_output=False)

    def get_list_dissimilarity(predictions: list) -> float:
        """Mean pairwise dissimilarity of one list; NaN if it has no valid pair."""
        rows = [items_map[item] for item in predictions if item in items_map]
        pair_scores = [
            1.0 - similarity[first, second]
            for pos, first in enumerate(rows)
            for second in rows[pos + 1:]
        ]
        return np.mean(pair_scores) if pair_scores else np.nan

    # Running metric
    results = [get_list_dissimilarity(pred) for pred in top_k]
    # Lists without any comparable pair must not turn the whole average into NaN.
    scored = [score for score in results if not np.isnan(score)]

    return np.mean(scored) if scored else np.float64(np.nan)

In [45]:
# Three users who all receive the same three recommended items.
itens = [["x", "y", "z"] for _ in range(3)]
# One-hot features: every item has its own feature column, so the items are
# mutually orthogonal.
features = pd.DataFrame(
    [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
    index=["x", "y", "z"],
    columns=["x1", "x2", "x3"],
)
intra_list_dissimilarity(itens, features, 3)

1.0

In [42]:
from sklearn.preprocessing import OneHotEncoder

# Each sample is a one-element row, because the encoder expects 2-D input
# (n_samples, n_features).

categories = [['apple'], ['orange'], ['banana']]

# The `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
# Keep the encoder's default sparse output and densify it explicitly below,
# which works on every scikit-learn version.
encoder = OneHotEncoder()

encoder.fit(categories)

# Transform categories

encoded_categories = encoder.transform(categories).toarray()

print(encoded_categories)

[[1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


