In [1]:
from os import path
import numpy as np
import pandas as pd

from config import EMBEDDING_PATH
from emb_extr_res.emb_extr_res import get_embeddings_df
from load_data.wdc.load_wdc_dataset import EnglishDatasetLoader

In [2]:
# Dataset selection. Both values are passed to EnglishDatasetLoader.load_train;
# only dataset_type is passed to EnglishDatasetLoader.load_test.
dataset_type = "cameras"
dataset_size = "medium"

In [3]:
# Load the training data (selected by type and size) and the test data
# (selected by type only).
# NOTE(review): the helpers in this notebook read the columns id_left, id_right,
# cluster_id_left and cluster_id_right from these frames — confirm the loader
# provides them.
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)
test_df = EnglishDatasetLoader.load_test(dataset_type)

In [4]:
def extract_ids_and_cluster_ids(train_df):
    """
    Collect the unique (offer_id, product_id) pairs of a pair dataset.

    Each row of train_df describes a left and a right offer through the columns
    id_left / cluster_id_left and id_right / cluster_id_right. Both sides are
    renamed to offer_id / product_id, stacked (right side first) and de-duplicated.
    The original row index labels are kept.
    """

    def _pairs_for(side):
        # Unique (id, cluster) pairs of one side under the common column names.
        id_col = f"id_{side}"
        cluster_col = f"cluster_id_{side}"
        return (
            train_df[[id_col, cluster_col]]
            .drop_duplicates()
            .rename(columns={id_col: "offer_id", cluster_col: "product_id"})
        )

    stacked = pd.concat([_pairs_for("right"), _pairs_for("left")])
    return stacked.drop_duplicates()


def extract_most_common_ids(train_df, how_many=5):
    """
    Select the offers of the products that occur in the most offers.

    The unique (offer_id, product_id) pairs of train_df are counted per
    product_id, and the pairs belonging to the `how_many` most frequent
    products are kept.

    Returns: pairs of offer_id and product_id.
    """
    offer_product_pairs = extract_ids_and_cluster_ids(train_df)
    product_ids, offer_counts = np.unique(
        offer_product_pairs["product_id"], return_counts=True
    )

    # sorted() is stable even with reverse=True, so products with equal counts
    # keep the ascending order in which np.unique returned them.
    ranked = sorted(
        zip(product_ids, offer_counts), key=lambda pair: pair[1], reverse=True
    )
    top_products = [product for product, _ in ranked[:how_many]]

    return offer_product_pairs[offer_product_pairs["product_id"].isin(top_products)]

def get_dataset_with_labels(dataset, embeddings):
    """
    Build a labelled dataset for the most frequent products.

    Keeps the rows of `embeddings` whose "id" is an offer of one of the most
    frequent products in `dataset`, attaches the product id of each offer and
    exposes it as the "label" column.

    Returns: tuple (labelled frame, selected offer_id/product_id pairs).
    """
    most_common_ids = extract_most_common_ids(dataset)

    is_selected = embeddings["id"].isin(most_common_ids["offer_id"])
    selected_embeddings = embeddings[is_selected]

    # NOTE(review): only "id" is dropped, so the "offer_id" column brought in by
    # the merge stays in the returned frame — confirm the downstream probing code
    # does not treat it as a feature.
    labelled = selected_embeddings.merge(
        most_common_ids, left_on="id", right_on="offer_id", how="left"
    )
    labelled = labelled.drop(columns=["id"]).rename(columns={"product_id": "label"})

    return labelled, most_common_ids

def get_test_dataset(test_df, test_embeddings, train_ids):
    """
    Build the labelled test dataset restricted to products seen in training.

    Parameters:
        test_df: pair dataset with id_left / cluster_id_left / id_right /
            cluster_id_right columns.
        test_embeddings: frame with an "id" column identifying the offer of
            each embedding row.
        train_ids: frame with a "product_id" column listing the products
            selected for training.

    Returns: the embedding rows of the matching test offers with the product id
    attached as the "label" column ("id" is dropped, "offer_id" is kept).
    """
    test_pairs = extract_ids_and_cluster_ids(test_df)
    ids_needed = test_pairs[test_pairs["product_id"].isin(train_ids["product_id"])]
    features = test_embeddings[test_embeddings["id"].isin(ids_needed["offer_id"])]

    # Merge against the filtered pairs, not the full pair list: if an offer id is
    # associated with more than one cluster, joining on the unfiltered pairs would
    # add rows whose label is not among the training products.
    return (
        pd.merge(features, ids_needed, left_on="id", right_on="offer_id", how="left")
        .drop(["id"], axis=1)
        .rename({"product_id": "label"}, axis="columns")
    )


In [5]:
# Read the train embeddings from train_embeddings.csv under EMBEDDING_PATH.
# NOTE(review): get_dataset_with_labels filters this frame on an "id" column —
# confirm get_embeddings_df provides it.
train_embeddings_path = path.join(EMBEDDING_PATH, r'train_embeddings.csv')
train_embeddings = get_embeddings_df(train_embeddings_path)

In [6]:
# Labelled train set for the most frequent products; `ids` holds the selected
# (offer_id, product_id) pairs and is passed to get_test_dataset further down.
train_cluster_id_dataset, ids = get_dataset_with_labels(train_df, train_embeddings)

In [7]:
# Read the test embeddings from test_embeddings.csv under EMBEDDING_PATH.
# NOTE(review): get_test_dataset filters this frame on an "id" column —
# confirm get_embeddings_df provides it.
test_embeddings_path = path.join(EMBEDDING_PATH, r'test_embeddings.csv')
test_embeddings = get_embeddings_df(test_embeddings_path)

In [8]:
# Labelled test set, limited to offers whose product is among those in `ids`
# (the products selected for the train set).
test_cluster_id_dataset = get_test_dataset(test_df, test_embeddings, ids)

In [9]:
from utils.probing_tasks_utils import *

In [10]:
# Probe the embeddings with a random forest. The seed is fixed (same value as the
# logistic regression cell) so that re-running the notebook gives the same model.
clf = RandomForestClassifier(random_state=42)
pred_rf, acc_rf, f_score_rf = test_probing_task(train_cluster_id_dataset, test_cluster_id_dataset, clf)

Accuracy: 1.0, f_score: 1.0


In [11]:
# Probe the embeddings with an L1-penalised multinomial logistic regression.
# NOTE(review): the stored output of this cell reports a much lower accuracy than
# the tree-based probes — possibly the saga solver stops before converging at its
# default iteration limit; confirm before interpreting the score.
clf = LogisticRegression(
    multi_class="multinomial",
    random_state=42,
    penalty="l1",
    solver="saga",
)
pred_lr, acc_lr, f_score_lr = test_probing_task(
    train_cluster_id_dataset, test_cluster_id_dataset, clf
)

Accuracy: 0.16666666666666666, f_score: 0.05714285714285714


In [12]:
# Probe the embeddings with gradient-boosted trees. The results get their own
# *_xgb names so they do not overwrite the logistic regression results
# (pred_lr, acc_lr, f_score_lr) computed in the previous cell.
clf = XGBClassifier()
pred_xgb, acc_xgb, f_score_xgb = test_probing_task(train_cluster_id_dataset, test_cluster_id_dataset, clf)

Accuracy: 1.0, f_score: 1.0
