In [1]:
import pandas as pd
import os

# Every path used below ('data/...', 'configs/...', 'output/...') is relative
# to the parent of the directory the kernel starts in.
# A bare os.chdir('..') climbs one more level each time this cell is re-run,
# which silently breaks all of those paths. Remember the starting directory
# the first time the cell executes and always chdir relative to it, so the
# cell is idempotent.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
from transformers import AutoTokenizer
from src.predictors.contrastive import ContrastivePretrainModel, ContrastiveClassifierConfig
from src.utils import load_as_object
import torch

In [4]:
# Load the pretrain-test offers CSV (path is relative to the current working
# directory) and preview its first 10 rows.
test_offers_df = pd.read_csv('data/processed/contrastive/wdc_computers_medium/pretrain-test.csv')
test_offers_df.head(10)

Unnamed: 0.1,Unnamed: 0,text,cluster_id
0,804,"[COL] title [VAL] null , 154457 b21 hp wide ul...",10805188
1,658,[COL] title [VAL] asus sabertooth x99 [COL] de...,390554
2,31,[COL] title [VAL] 643778 b21 hp xeon e78867 2 ...,9313979
3,448,[COL] title [VAL] amd a6 7400k 3 5ghz socket f...,610055
4,1018,[COL] title [VAL] kingston 64gb datatraveler s...,156285
5,354,[COL] title [VAL] apple mac mini 2 8ghz intel ...,1464841
6,751,"[COL] title [VAL] null , 432094 b21 hp 146 gb ...",267367
7,817,"[COL] title [VAL] null , 647909 b21 hp 8gb 1x8...",13847889
8,389,[COL] title [VAL] hp probook 640 g3 i5 7200u 8...,713835
9,640,[COL] title [VAL] acer aspire es1 132 p194 bus...,8708940


In [None]:
# tokenizer(test_offers_df['text'].iloc[0]).tokens()

In [5]:
from typing import List, Optional


def encode_offer(value: str, tokenizer, bert, max_length: Optional[int] = None) -> List[float]:
    """Encode one serialized offer into a single embedding vector.

    The text is tokenized with truncation, passed through ``bert`` and the
    hidden state at position 0 of the first sequence in the batch is returned
    (presumably the [CLS] token -- confirm against the tokenizer in use).

    Args:
        value: Offer text, e.g. ``"[COL] title [VAL] ..."``.
        tokenizer: Callable accepting ``return_tensors``, ``max_length`` and
            ``truncation`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'``.
        bert: Callable encoder taking the input ids plus an ``attention_mask``
            keyword; its result must expose ``last_hidden_state``.
        max_length: Truncation length in tokens. When ``None`` the value is
            read from the notebook-level ``config.max_tokens``, which must
            already be defined when the function is *called*.

    Returns:
        The selected hidden-state vector converted with ``.tolist()``.
    """
    if max_length is None:
        # Backward-compatible fallback: the original implementation always read
        # the global ``config``, which is only created in a later cell.
        max_length = config.max_tokens
    tokens = tokenizer(value, return_tensors="pt", max_length=max_length, truncation=True)
    encoding = bert(tokens['input_ids'], attention_mask=tokens['attention_mask'])
    return encoding.last_hidden_state[0][0].tolist()

In [8]:
# with torch.no_grad():
#     test_offers_df['embedding'] = test_offers_df['text'].apply(encode_offer)

In [38]:
# with torch.no_grad():
#     result = encode_offer(test_offers_df['text'].iloc[0])
#     print(len(result))

torch.Size([1, 128, 768])
128


In [6]:
import numpy as np

# Record which NumPy version produced the outputs of this run.
print(f'Installed numpy version {np.__version__}')

Installed numpy version 1.21.6


In [7]:
import umap.umap_ as umap
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def project2d(target: pd.DataFrame, tokenizer, model: ContrastivePretrainModel) -> pd.DataFrame:
    """Embed every offer in ``target`` and project the embeddings to 2-D with UMAP.

    Args:
        target: Frame with a ``'text'`` column (encoded via ``encode_offer``)
            and a ``'cluster_id'`` column (copied to the result as ``'label'``).
            It is not modified.
        tokenizer: Tokenizer forwarded to ``encode_offer``.
        model: Model whose ``transformer`` attribute is used as the encoder.

    Returns:
        A new frame with a fresh 0..n-1 index and columns ``'X'``, ``'Y'``
        (UMAP coordinates, each min-max scaled independently) and ``'label'``,
        one row per row of ``target`` in the same order.
    """
    # Gradients are not needed for inference; skipping them saves memory.
    with torch.no_grad():
        embeddings_list = [encode_offer(text, tokenizer, model.transformer) for text in target['text']]

    embeddings = np.asarray(embeddings_list, dtype='float')
    features = MinMaxScaler().fit_transform(embeddings)
    # NOTE: a new UMAP is fitted on every call and no random_state is set, so
    # layouts from different calls are not directly comparable.
    mapper = umap.UMAP(n_components=2, metric="cosine").fit(features) # TODO: fit once use it for all the epochs

    # MinMaxScaler rescales each column independently, so a single call covers
    # both axes.
    projection_df = pd.DataFrame(MinMaxScaler().fit_transform(mapper.embedding_), columns=['X', 'Y'])
    # Assign labels by position. A plain Series assignment aligns on the index
    # and would yield NaN / wrong labels whenever ``target`` does not carry a
    # default 0..n-1 index.
    projection_df['label'] = target['cluster_id'].to_numpy()
    return projection_df

In [8]:
import mplcursors
import matplotlib.pyplot as plt
%matplotlib notebook

In [10]:
from distinctipy import distinctipy

# Tunable knobs for this cell.
N_BASE_COLORS = 50        # colours requested from distinctipy before the palette repeats
N_SAMPLED_CLUSTERS = 20   # number of clusters kept for the visualisation
SAMPLE_SEED = 42          # fixed so the same clusters are drawn on every run

train_offers_df = pd.read_csv('data/processed/contrastive/wdc_computers_medium/pretrain-train.csv')
df_emb = train_offers_df.copy()

# One row per distinct cluster, each paired with a colour. The base palette is
# cycled when there are more clusters than base colours.
# NOTE(review): distinctipy's colour generation is not seeded here, so the
# actual colours may still differ between runs -- confirm if that matters.
df_labels = df_emb[['cluster_id']].drop_duplicates()
base_palette = distinctipy.get_colors(N_BASE_COLORS)
color_palette = [base_palette[i % len(base_palette)] for i in range(len(df_labels))]
df_labels['color'] = color_palette

# Seeded sample so the cell is reproducible; min() keeps it from raising when
# the data holds fewer clusters than requested.
df_labels = df_labels.sample(n=min(N_SAMPLED_CLUSTERS, len(df_labels)), random_state=SAMPLE_SEED)

# Right join keeps only the offers that belong to one of the sampled clusters.
df_sampled_emb = df_emb.join(df_labels.set_index('cluster_id'), on='cluster_id', rsuffix='_c', how="right")
df_sampled_emb = df_sampled_emb.reset_index()
print(len(df_sampled_emb))

99


In [10]:
import glob

# Collect the pretraining checkpoints and order them numerically by the number
# after the last '-' in the directory name (a plain string sort would put
# checkpoint-1000 before checkpoint-200).
checkpoints = glob.glob("output/contrastive_frozen_wdc-computers-medium/pretrain/checkpoint-[0-9]*")
checkpoints = sorted(checkpoints, key=lambda x: int(os.path.basename(x).split('-')[-1]))

config: ContrastiveClassifierConfig = load_as_object(
    "configs/model_train/contrastive/frozen_no-aug_wdc-computers-medium.json",
    ContrastiveClassifierConfig.parse_obj)
pretrained_tokenizer = AutoTokenizer.from_pretrained(config.transformer_name,
                                                     additional_special_tokens=('[COL]', '[VAL]'))

figures_dir = os.path.join('output', 'figures', 'contrastive_frozen_wdc-computers-medium')
os.makedirs(figures_dir, exist_ok=True)

# Render one scatter plot of the sampled offers per checkpoint.
for checkpoint in checkpoints:
    checkpoint_name = os.path.basename(checkpoint)
    checkpoint_model = ContrastivePretrainModel(len_tokenizer=len(pretrained_tokenizer), model=config.transformer_name)

    # map_location lets checkpoints that were saved from a GPU load on a
    # CPU-only machine.
    model_state = torch.load(os.path.join(checkpoint, 'pytorch_model.bin'), map_location=torch.device('cpu'))
    checkpoint_model.load_state_dict(model_state)
    checkpoint_model.to(torch.device('cpu'))
    # Freshly constructed modules start in training mode; switch to eval so
    # dropout does not add noise to the embeddings (torch.no_grad() alone does
    # not disable it).
    checkpoint_model.eval()

    df_emb = project2d(df_sampled_emb, tokenizer=pretrained_tokenizer, model=checkpoint_model)
    # Attach the colour chosen for each cluster to its projected points.
    df_colored_emb = df_emb.set_index('label').join(df_labels.set_index('cluster_id'), rsuffix='_c')
    df_colored_emb = df_colored_emb.reset_index().rename(columns={'index': 'label'})

    fig, ax = plt.subplots()
    sc = ax.scatter(df_colored_emb['X'], df_colored_emb['Y'], c=df_colored_emb['color'])
    ax.set_title(checkpoint_name)
    # cursor = mplcursors.cursor(sc, hover=True)
    # cursor.connect("add", lambda sel: sel.annotation.set_text(df_colored_emb['label'].loc[sel.index]))
    fig.savefig(os.path.join(figures_dir, f'{checkpoint_name}.png'))
    # Release the figure; otherwise every iteration keeps one more figure alive.
    plt.close(fig)
    print(f'Saved {checkpoint_name}')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
import glob
from contextlib import ExitStack

from PIL import Image

figures_dir = os.path.join('output', 'figures', 'contrastive_frozen_wdc-computers-medium')
# Only pick up the per-checkpoint frames; any other PNG in the directory would
# make the numeric sort key below fail.
frame_paths = glob.glob(os.path.join(figures_dir, 'checkpoint-[0-9]*.png'))
# Order frames by the number after the last '-' in the file name.
frame_paths = sorted(frame_paths, key=lambda x: int(os.path.splitext(os.path.basename(x))[0].split('-')[-1]))

if not frame_paths:
    raise FileNotFoundError(f'No checkpoint frames found in {figures_dir}')

# ExitStack closes every opened image file once the GIF has been written,
# even if opening or saving fails part-way through.
with ExitStack() as stack:
    frames = [stack.enter_context(Image.open(path)) for path in frame_paths]
    frames[0].save(os.path.join(figures_dir, 'all.gif'), format='GIF', append_images=frames[1:], save_all=True, duration=500, loop=0)