In [None]:
from os import path
import pandas as pd
from config import SIMILARITY_PATH, EMBEDDING_PATH, PRETRAIN_OUTPUT_PATH
from emb_extr_res.emb_extr_res import get_embeddings_df, get_pairs_similarity_df, get_pretrain_agg_similarity
from load_data.wdc.load_wdc_dataset import EnglishDatasetLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

In [None]:
# Locations of the precomputed embedding, similarity and pre-training result files.
train_embeddings_path = path.join(EMBEDDING_PATH, 'train_embeddings.csv')
test_embeddings_path = path.join(EMBEDDING_PATH, 'test_embeddings.csv')

train_similarity_path = path.join(SIMILARITY_PATH, 'train_similarity.csv')
test_similarity_path = path.join(SIMILARITY_PATH, 'test_similarity.csv')

pretraining_output_path = path.join(
    PRETRAIN_OUTPUT_PATH,
    'similarity_evaluation_test_evaluation_results.csv',
)

# Dataset variant handed to EnglishDatasetLoader.load_train.
dataset_type, dataset_size = "cameras", "medium"

In [None]:
# Load the training embeddings and preview only the first rows; rendering the
# whole frame bloats the notebook output without adding information.
embedding_train_df = get_embeddings_df(train_embeddings_path)
embedding_train_df.head()

In [None]:
# Load the pair dataset through EnglishDatasetLoader.load_train for the
# configured dataset_type / dataset_size.
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)

In [None]:
def prepare_brands_list(train_df, brands_to_drop):
    """Collect the distinct brand tokens that occur in a pair dataset.

    Every value of the ``brand_left`` and ``brand_right`` columns is split on
    whitespace, single and double quotes are stripped from each token, and the
    de-duplicated tokens that are not listed in ``brands_to_drop`` are returned.

    Args:
        train_df: DataFrame with ``brand_left`` and ``brand_right`` columns.
        brands_to_drop: Iterable of tokens to exclude from the result.

    Returns:
        Sorted list of unique, non-empty brand tokens.
    """
    excluded = set(brands_to_drop)
    tokens = set()
    for column in ("brand_left", "brand_right"):
        for brand in train_df[column].unique().tolist():
            # A missing brand can surface as None *or* as NaN (a float);
            # only real strings can be tokenised.
            if not isinstance(brand, str):
                continue
            for raw_token in brand.split():
                token = raw_token.replace('"', '').replace("'", "")
                # A token made only of quotes collapses to ''. Keeping it would
                # make every substring test (`'' in title`) succeed.
                if token and token not in excluded:
                    tokens.add(token)
    # Sorted so that repeated runs produce the same list order.
    return sorted(tokens)


    

In [None]:
# Tokens excluded from the brand vocabulary (punctuation and generic words).
brands_to_drop = [
    ',', 'd', 'memory', 'photo', 'co', 'usa',
    'power', 'digital', 'camera', 'cam', 'hd', 'a',
    'inc', 'le', 'film', 'case', 'pro', 'cameras',
]
brands = prepare_brands_list(train_df=train_df, brands_to_drop=brands_to_drop)

In [None]:
# Flatten the pairs into one (id, title) row per id: take the left and right
# sides separately, stack them (right first) and keep the first title per id.
train_df_left = (
    train_df[["id_left", "title_left"]]
    .drop_duplicates()
    .rename(columns={"id_left": "id", "title_left": "title"})
)
train_df_right = (
    train_df[["id_right", "title_right"]]
    .drop_duplicates()
    .rename(columns={"id_right": "id", "title_right": "title"})
)
df_train_all = pd.concat([train_df_right, train_df_left])
df_train_titles = df_train_all.groupby("id").first().reset_index()


In [None]:
def prepare_probing_len(df, train_embeddings_path):
    """Build the two title-length probing datasets (characters and words).

    Each title is measured in characters and in whitespace-separated words.
    Both measurements are bucketed into four classes (0-3) with ``pd.cut`` and
    the class is joined on ``id`` with the frame returned by
    ``get_embeddings_df``.

    Args:
        df: DataFrame with ``id`` and ``title`` columns. It is not modified.
        train_embeddings_path: Path handed to ``get_embeddings_df``.

    Returns:
        Tuple ``(probing_df_chars, probing_df_words)``. Each frame holds
        ``id``, ``label`` (the length bucket) and the columns contributed by
        ``get_embeddings_df``.
    """
    nr_of_words_bins = [0, 10, 15, 20, 100]
    nr_of_chars_bins = [0, 50, 75, 100, 500]
    bin_labels = [0, 1, 2, 3]

    # Work on a private frame so the caller's DataFrame does not silently gain
    # helper columns.
    lengths = pd.DataFrame({
        "id": df["id"],
        "nr_of_chars": df["title"].apply(len),
        "nr_of_words": df["title"].apply(lambda title: len(title.split())),
    })
    # pd.cut intervals are right-closed by default, so a length of 0 or one
    # above the last edge receives a NaN label.
    lengths["nr_of_chars_bins"] = pd.cut(
        x=lengths["nr_of_chars"], bins=nr_of_chars_bins, labels=bin_labels
    )
    lengths["nr_of_words_bins"] = pd.cut(
        x=lengths["nr_of_words"], bins=nr_of_words_bins, labels=bin_labels
    )

    embedding_train_df_all = get_embeddings_df(train_embeddings_path)

    def _as_probing_df(bin_column):
        # Join the chosen bucket column with the embeddings and expose it as `label`.
        merged = pd.merge(lengths[["id", bin_column]], embedding_train_df_all, on="id")
        return merged.rename({bin_column: "label"}, axis=1)

    return _as_probing_df("nr_of_chars_bins"), _as_probing_df("nr_of_words_bins")


In [None]:
# Build the character-count and word-count probing frames from the per-id titles.
probing_df_chars, probing_df_words = prepare_probing_len(df_train_titles, train_embeddings_path)

In [None]:
# probing_df_words.to_csv(r'probing_tasks\dataset\probing_df_words.csv')
# probing_df_chars.to_csv(r'probing_tasks\dataset\probing_df_chars.csv')

In [None]:
def drop_brands(title, brands):
    """Return ``title`` with every occurrence of each brand token removed.

    Matching is plain, case-sensitive substring replacement; the surrounding
    whitespace is left as it is.
    """
    cleaned = title
    for token in brands:
        cleaned = cleaned.replace(token, '')
    return cleaned

In [None]:
def prepare_new_dataset(train_df, brands, seed=None):
    """Return a copy of ``train_df`` with brand tokens removed for a random subset of ids.

    Every distinct id (taken from ``id_left`` and ``id_right``) is selected
    independently with probability 0.5. For selected ids, the brand tokens are
    removed from the matching title column via ``drop_brands``.

    Args:
        train_df: DataFrame with ``id_left``, ``id_right``, ``title_left`` and
            ``title_right`` columns. It is not modified.
        brands: Brand tokens to remove from the selected titles.
        seed: Optional seed for the random selection. ``None`` (the default)
            keeps the legacy behaviour of drawing from NumPy's global random
            state. Pass an int to get the same selection on every run.

    Returns:
        Tuple ``(new_df, ids_removed_brands)``. ``new_df`` is the modified copy
        with a boolean ``changed`` column that is True when either side of the
        pair was selected. ``ids_removed_brands`` is the array of selected ids.
    """
    left_ids = train_df["id_left"].unique().tolist()
    right_ids = train_df["id_right"].unique().tolist()
    # Sorted so that a given seed always maps to the same ids.
    ids = np.array(sorted(set(left_ids) | set(right_ids)))

    rng = np.random if seed is None else np.random.RandomState(seed)
    remove_brand_mask = rng.choice([True, False], size=len(ids))
    ids_removed_brands = ids[remove_brand_mask]

    train_df1 = train_df.copy()
    in_left = train_df1["id_left"].isin(ids_removed_brands)
    in_right = train_df1["id_right"].isin(ids_removed_brands)

    for row_mask, title_column in ((in_left, "title_left"), (in_right, "title_right")):
        # Skip empty selections so nothing is assigned from an empty result.
        if row_mask.any():
            train_df1.loc[row_mask, title_column] = train_df1.loc[row_mask, title_column].map(
                lambda title: drop_brands(title, brands)
            )

    # Vectorised replacement for the former row-wise membership test.
    train_df1["changed"] = in_left | in_right

    return train_df1, ids_removed_brands

In [None]:
# Strip brand tokens from the titles of a randomly chosen subset of ids.
new_dataset, ids_removed_brands = prepare_new_dataset(train_df, brands)

# Keep only the pairs flagged as changed; the helper flag itself is dropped.
probing_task_df = new_dataset.loc[new_dataset["changed"].eq(True)].drop(columns="changed")
# probing_task_df.to_csv("datasets/df_removed_brands1.csv", index=False)

In [None]:
def brands_in_title_check(dataset, brands):
    """Flag, for every id, whether its title contains any brand token.

    The left and right sides of ``dataset`` are flattened into one
    ``(id, title)`` row per id (first title seen, right side first). The check
    is a case-sensitive substring test.

    Args:
        dataset: DataFrame with ``id_left``, ``title_left``, ``id_right`` and
            ``title_right`` columns.
        brands: Brand tokens to look for. Empty tokens are ignored.

    Returns:
        DataFrame with columns ``id``, ``title`` and boolean ``brand_in_title``.
    """
    sides = [
        dataset[[id_col, title_col]]
        .drop_duplicates()
        .rename(columns={id_col: "id", title_col: "title"})
        for id_col, title_col in (("id_right", "title_right"), ("id_left", "title_left"))
    ]
    df_train_titles = pd.concat(sides).groupby("id").first().reset_index()

    # '' is a substring of every string, so a single empty token would flag
    # every title as containing a brand.
    tokens = [brand for brand in brands if brand]
    df_train_titles["brand_in_title"] = df_train_titles["title"].apply(
        lambda title: any(token in title for token in tokens)
    )

    return df_train_titles

In [None]:
def words_in_title_check(dataset, key_words):
    """Flag, for every id, whether its lower-cased title contains any key word.

    The left and right sides of ``dataset`` are flattened into one
    ``(id, title)`` row per id (first title seen, right side first). The result
    column is named ``brand_in_title`` even though it reports key-word matches.

    Returns:
        DataFrame with columns ``id``, ``title`` and boolean ``brand_in_title``.
    """
    stacked_sides = []
    for id_col, title_col in (("id_right", "title_right"), ("id_left", "title_left")):
        side = dataset[[id_col, title_col]].drop_duplicates()
        stacked_sides.append(side.rename(columns={id_col: "id", title_col: "title"}))
    titles_per_id = pd.concat(stacked_sides).groupby("id").first().reset_index()

    def _mentions_key_word(title):
        # Substring match against the lower-cased title.
        lowered = title.lower()
        for word in key_words:
            if word in lowered:
                return True
        return False

    titles_per_id["brand_in_title"] = titles_per_id["title"].apply(_mentions_key_word)
    return titles_per_id

In [None]:
# words_in_title_check matches by substring on the lower-cased title, so
# 'len' also matches longer words such as 'lens'.
key_words = ['camera', 'digital', 'len']
df_words = words_in_title_check(train_df, key_words)

In [None]:
embedding_train_df_all = get_embeddings_df(train_embeddings_path)

# Attach the key-word flag to the embeddings, expose it as `label`, and drop
# the join key so only the label and the remaining embedding-frame columns are left.
probing_df_key_words = (
    df_words[["id", "brand_in_title"]]
    .merge(embedding_train_df_all, on="id")
    .rename(columns={"brand_in_title": "label"})
    .drop(columns="id")
)

In [None]:
# probing_df_key_words.to_csv(r'probing_tasks\dataset\probing_df_key_words.csv', index=False)

In [None]:
brands_in_title_df = brands_in_title_check(new_dataset, brands)
# NOTE(review): despite the name, these are the ids whose title still contains
# a brand token in new_dataset.
deleted_ids = brands_in_title_df.loc[brands_in_title_df["brand_in_title"].eq(True), "id"].values

# NOTE(review): this rebinds embedding_train_df to the embeddings stored in
# train_embeddings_removed_brands1.csv. Presumably that file was computed for
# an earlier random selection of ids — confirm it matches ids_removed_brands.
embedding_train_df = get_embeddings_df(path.join(EMBEDDING_PATH, 'train_embeddings_removed_brands1.csv'))
embedding_train_df_all = get_embeddings_df(train_embeddings_path)

# Selected ids take their rows from the brand-removed file; every other id
# keeps its row from the regular training embeddings.
new_emb1 = embedding_train_df.loc[embedding_train_df["id"].isin(ids_removed_brands)]
new_emb = embedding_train_df_all.loc[~embedding_train_df_all["id"].isin(ids_removed_brands)]
new = pd.concat([new_emb1, new_emb])
new["label"] = new["id"].isin(deleted_ids)



In [None]:
# new.to_csv(r'probing_tasks\dataset\train_brand_names.csv', index=False)

# Sentence length

In [None]:
# Load the word-count probing set and drop the saved index column
# ("Unnamed: 0") together with the id.
probing_df_words = (
    pd.read_csv(r'probing_tasks\dataset\probing_df_words.csv')
    .drop(columns=["Unnamed: 0", "id"])
)

In [None]:
# Separate features from the label and hold out 15% of the rows for evaluation.
# The split is seeded so the accuracies below can be reproduced.
probing_df_words_X = probing_df_words.drop(["label"], axis=1)
probing_df_words_y = probing_df_words["label"]
X_train, X_test, y_train, y_test = train_test_split(
    probing_df_words_X, probing_df_words_y, test_size=0.15, random_state=42
)

In [None]:
# L1-penalised multinomial logistic regression on the word-count probing task.
clf = LogisticRegression(
    penalty='l1',
    solver="saga",
    multi_class="multinomial",
    random_state=42,
)
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# Random forest on the word-count probing task. Seeded so that the reported
# accuracy does not change between runs.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Gradient-boosted trees (default settings) on the word-count probing task.
clf = XGBClassifier()
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:

# Accuracy (%) per classifier for this probing task.
# NOTE(review): the values are hard-coded, presumably copied from earlier runs
# of the classifier cells — confirm.
brand_classsif = ["LogisticRegression", "RandomForest" ,"XGB"]
brand = [67, 64.6,  65.3]

plt.bar(
    brand_classsif,
    brand,
    # Same blue for every bar; only the alpha channel varies.
    color=[(30/255, 57/255, 240/255, alpha) for alpha in (0.70, 0.60, 0.55)],
)
plt.ylim(0, 100)
plt.ylabel("accuracy")

plt.show()

# brand

In [None]:
# Load the brand probing set; the same path appears in the commented-out
# `new.to_csv(...)` call earlier in the notebook.
probing_df_brands = pd.read_csv(r'probing_tasks\dataset\train_brand_names.csv')

In [None]:
# Drop the id so only the embedding features and the label remain.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
probing_df_brands = probing_df_brands.drop(["id"], axis=1, errors="ignore")

# Hold out 15% of the rows; seeded so the accuracies below can be reproduced.
X, y = probing_df_brands.drop(["label"], axis=1), probing_df_brands["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# L1-penalised logistic regression on the brand probing task.
clf = LogisticRegression(solver="liblinear", penalty="l1")
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# Accuracy (%) per classifier for the brand probing task.
# NOTE(review): the values are hard-coded, presumably copied from runs of the
# classifier cells — confirm.
brand_classsif = ["LogisticRegression", "RandomForest" ,"XGB"]
brand = [76.07, 72.00, 75.83]

plt.bar(
    brand_classsif,
    brand,
    # One RGB shade of blue per bar, given as 0-255 components.
    color=[(r/255, g/255, b/255) for r, g, b in ((30, 57, 240), (84, 117, 232), (68, 103, 227))],
)
plt.ylim(0, 100)
plt.ylabel("accuracy")

plt.show()

In [None]:
# Random forest on the brand probing task. Seeded so that the reported
# accuracy does not change between runs.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Gradient-boosted trees (default settings) on the brand probing task.
clf = XGBClassifier()
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# Load the key-word probing set. The former chained assignment also rebound
# probing_df_words to this frame, overwriting the sentence-length data.
probing_df_key_words = pd.read_csv(r'probing_tasks\dataset\probing_df_key_words.csv')

# Hold out 15% of the rows; seeded so the accuracies below can be reproduced.
X, y = probing_df_key_words.drop(["label"], axis=1), probing_df_key_words["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# L1-penalised logistic regression on the key-word probing task.
clf = LogisticRegression(solver="liblinear", penalty="l1")
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# Gradient-boosted trees (default settings) on the key-word probing task.
clf = XGBClassifier()
predictions = clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

In [None]:
# Random forest on the key-word probing task. Seeded so that the reported
# accuracy does not change between runs.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Accuracy (%) per classifier for the key-word probing task.
# NOTE(review): the values are hard-coded, presumably copied from runs of the
# classifier cells — confirm.
brand_classsif = ["LogisticRegression", "RandomForest" ,"XGB"]
brand = [82.29, 77.03, 79.66]

plt.bar(
    brand_classsif,
    brand,
    # One RGB shade of blue per bar, given as 0-255 components.
    color=[(r/255, g/255, b/255) for r, g, b in ((30, 57, 240), (84, 117, 232), (68, 103, 227))],
)
plt.ylim(0, 100)
plt.ylabel("accuracy")

plt.show()

# SIMILARITIES

In [None]:
# Pair labels, with the id columns renamed to the names used by the similarity frame.
train_df = EnglishDatasetLoader.load_train(dataset_type, dataset_size)
res = train_df[["id_left", "id_right", "label"]].rename(
    columns={"id_left": "left_id", "id_right": "right_id"}
)

# The similarity ids are converted to numeric before joining on the id pair.
sim_train_df = get_pairs_similarity_df(train_similarity_path)
sim_train_df["right_id"] = pd.to_numeric(sim_train_df["right_id"])
sim_train_df["left_id"] = pd.to_numeric(sim_train_df["left_id"])
sim_df = res.merge(sim_train_df, on=["left_id", "right_id"])

In [None]:
# Overlaid histograms of the cosine score, one per pair label.
sns.histplot(sim_df.loc[sim_df['label'] == 0, "cosine_score"], color="blue", label="label = 0")
sns.histplot(sim_df.loc[sim_df['label'] == 1, "cosine_score"], color="orange", alpha=0.5, label="label = 1")
plt.legend()
# A title and axis label let the figure be read on its own (the title was an empty string).
plt.xlabel("cosine score")
plt.title("Cosine score distribution by pair label")
plt.show()