In [1]:
CUDA_LAUNCH_BLOCKING=1

In [2]:
import os
if os.getcwd().endswith('notebooks'):
    os.chdir("..")
    
print(f'Current working directory: {os.getcwd()}')

Current working directory: /home/robert/Workspace/university/thesis-public


In [3]:
from src.preprocess.model_specific.contrastive import ContrastivePreprocessorKnownClusters, \
    ContrastivePreprocessorUnknownClusters
from src.preprocess.model_specific.word_cooc import WordCoocPreprocessor
from src.preprocess.standardize import RelationalDatasetStandardizer, WDCDatasetStandardizer

stand_path = 'configs/stands_tasks/amazon_google.json'
standardizer = RelationalDatasetStandardizer(stand_path)
standardizer.preprocess()

proc_path = 'configs/model_specific/contrastive/amazon_google.json'
preprocessor = ContrastivePreprocessorUnknownClusters(proc_path)

target_split = standardizer.config.target_location.split('/')
directory = target_split[-1] if target_split[-1] != '' else target_split[-2]
default_preproc_target = os.path.join('data', 'processed', 'contrastive', directory)

preprocessor.preprocess(original_location=standardizer.config.target_location,
                        target_location=default_preproc_target)

In [4]:
import pandas as pd
from typing import Tuple

def load_data(path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    return pd.read_csv(os.path.join(path, 'train.csv')),\
            pd.read_csv(os.path.join(path, 'valid.csv')),\
            pd.read_csv(os.path.join(path, 'test.csv'))

In [5]:
train_set, valid_set, test_set = load_data('data/processed/contrastive/amazon_google')

In [6]:
from src.predictors.contrastive import ContrastivePredictor
from src.preprocess.configs import ExperimentsArgumentParser

predictor = ContrastivePredictor(config_path='configs/model_train/contrastive/unfreeze_no-aug_batch-pt128_amazon-google.json')
predictor.config.unfreeze = False

arguments = ExperimentsArgumentParser()
arguments.parse_args("")
arguments.load_wandb_models = True
predictor.train(train_set, valid_set, arguments=arguments)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertModel: ['fit_denses.1.bias', 'fit_denses.2.bias', 'fit_denses.3.bias', 'cls.seq_relationship.bias', 'fit_denses.0.weight', 'fit_denses.0.bias', 'cls.seq_relationship.weight', 'fit_denses.3.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'fit_denses.1.weight', 'fit_denses.4.bias', 'fit_denses.4.weight', 'cls.predictions.transform.dense.bias', 'fit_denses.2.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architect

Successfully loaded trained model: ./artifacts/model-39o785p1:v0/pytorch_model.bin


In [7]:
f1 = predictor.test(test_set=test_set)
print(f'F1 score of loaded model: {f1}')

***** Running Prediction *****
  Num examples = 2293
  Batch size = 8


F1 score of loaded model: 0.842741935483871


In [8]:
f1 = predictor.test(test_set=valid_set)
print(f'F1 score on the validation set of the loaded model: {f1}')

***** Running Prediction *****
  Num examples = 2293
  Batch size = 8


F1 score on the validation set of the loaded model: 0.8192771084337349


In [10]:
from src.predictors.contrastive import ContrastiveClassificationDataset

test_dataset = ContrastiveClassificationDataset(test_set)
label_pred = predictor.trainer.predict(test_dataset)
label_pred = [v[0] for v in label_pred.predictions]
label_true = test_set['label'].tolist()

diff = [0 if label_pred[i] != label_true[i] else 1 for i in range(len(label_pred))]
print(diff)

***** Running Prediction *****
  Num examples = 2293
  Batch size = 8


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [14]:
false_negatives = [i for i in range(len(diff)) if diff[i] == 0 and label_pred[i] == 0]
false_positives = [i for i in range(len(diff)) if diff[i] == 0 and label_pred[i] == 1]

print(false_negatives)
print(false_positives)

print(len(false_negatives))
print(len(false_positives))
print(len(test_set))
print(len(train_set))

[752, 761, 886, 887, 1617, 1714, 1716, 1882, 1914, 1944, 1983, 2033, 2064, 2075, 2091, 2122, 2166, 2174, 2179, 2220, 2227, 2240, 2259, 2261, 2278]
[165, 850, 929, 1022, 1027, 1085, 1151, 1201, 1223, 1289, 1328, 1390, 1412, 1505, 1525, 1572, 1665, 1680, 1685, 1701, 1782, 1818, 1845, 1896, 1961, 1984, 2026, 2052, 2053, 2060, 2061, 2073, 2098, 2118, 2121, 2150, 2152, 2153, 2156, 2165, 2168, 2190, 2197, 2210, 2236, 2244, 2246, 2253, 2270, 2271, 2287, 2290, 2291]
25
53
2293
6874


In [17]:
import random
random_sample = random.sample(range(1, len(test_set)), 100)


In [18]:
for i in range(len(random_sample)):
    print(f'Index: {random_sample[i]}')
    print(f'Text left: {test_set.iloc[random_sample[i]]["left_text"]}')
    print(f'Text right: {test_set.iloc[random_sample[i]]["right_text"]}')
    print(f'True label: {test_set.iloc[random_sample[i]]["label"]}')
    print(f'Predicted label: {label_pred[random_sample[i]]}')
    print('===============================================')

Index: 457
Text left: [COL] title [VAL] microsoft mappoint 2006 with gps [COL] manufacturer [VAL] microsoft
Text right: [COL] title [VAL] microsoft 164-04052 [COL] manufacturer [VAL] microsoft
True label: 0
Predicted label: 0.0
Index: 103
Text left: [COL] title [VAL] adobe photoshop cs3 [ mac ] [COL] manufacturer [VAL] adobe
Text right: [COL] title [VAL] adobe indesign cs3 for mac academic [COL] manufacturer [VAL] nan
True label: 0
Predicted label: 0.0
Index: 1127
Text left: [COL] title [VAL] photo explosion deluxe 3.0 [COL] manufacturer [VAL] nova development
Text right: [COL] title [VAL] nova development corp photo explosion 3.0 [COL] manufacturer [VAL] nan
True label: 0
Predicted label: 0.0
Index: 1004
Text left: [COL] title [VAL] 3d home architect landscape design deluxe version 9 [COL] manufacturer [VAL] encore
Text right: [COL] title [VAL] punch software 26100 punch ! master landscape and home design ( small box ) [COL] manufacturer [VAL] punch software
True label: 0
Predicted la

In [61]:
raw_test = pd.read_csv('data/raw/amazon_google/test.csv')
raw_left = pd.read_csv('data/raw/amazon_google/tableA.csv')
raw_right = pd.read_csv('data/raw/amazon_google/tableB.csv')

In [68]:
raw_test.iloc[1022]

ltable_id     369
rtable_id    3007
label           0
Name: 1022, dtype: int64

In [63]:
print(raw_left.iloc[369])
print(raw_right.iloc[3007])

id                                                          369
title           tlc arthur 's kindergarten learning system 2008
manufacturer                                             encore
price                                                     19.99
Name: 369, dtype: object
id                                                    3007
title           reader rabbit reading learning system 2007
manufacturer                                           NaN
price                                                12.95
Name: 3007, dtype: object


In [66]:
processed_test = pd.read_csv('data/processed/contrastive/amazon_google/test.csv')

In [69]:
processed_test.iloc[1022]

left_text     [COL] title [VAL] videostudio 11 plus [COL] ma...
right_text    [COL] title [VAL] video studio 11 plus [COL] m...
label                                                         0
Name: 1022, dtype: object

In [22]:
interim_test = pd.read_csv('data/interim/amazon_google/test.csv')

In [23]:
interim_test.iloc[1085]

left_id                                         572
left_title            adobe dreamweaver cs3 upgrade
left_manufacturer                             adobe
right_id                                        954
right_title           adobe dreamweaver cs3 upgrade
right_manufacturer                              NaN
label                                             0
Name: 1085, dtype: object

In [75]:
raw_test[(raw_test['ltable_id'] == 572) & (raw_test['rtable_id'] == 954)]

Unnamed: 0,ltable_id,rtable_id,label
958,572,954,0


In [76]:
print(raw_left[raw_left['id'] == 572])
print(raw_right[raw_right['id'] == 954])

      id                          title manufacturer  price
572  572  adobe dreamweaver cs3 upgrade        adobe  199.0
      id                          title manufacturer   price
954  954  adobe dreamweaver cs3 upgrade          NaN  205.99


TODO: 
- extract problematic labels as csv to export to google sheets
- how much are the problematic labels affecting my f1 score?



In [19]:
mislabeled = [1022, 1085, 1151, 1201, 1289, 1328, 1390, 1505, 1665, 1782, 1818, 1896, 2026, 2053, 2060, 2061, 2121, 2152, 2156, 2165, 2246, 2270, 2271, 2291, 2088]
doubts = [1223, 1412, 1525, 1572, 1961, 2052, 2073, 2118, 2210, 1944, 1289]

mislabeled_test_set = test_set[test_set.index.isin(mislabeled)]
doubts_test_set = test_set[test_set.index.isin(doubts)]

print(len(mislabeled_test_set))
print(len(doubts_test_set))

25
11


In [32]:
mislabeled_interim = pd.merge(interim_test, mislabeled_test_set, how='inner', left_index=True, right_index=True)
mislabeled_interim.drop(['left_text', 'right_text', 'label_y'], axis=1, inplace=True)
mislabeled_interim.head()

Unnamed: 0,left_id,left_title,left_manufacturer,right_id,right_title,right_manufacturer,label_x
1022,856,videostudio 11 plus,corel,5,video studio 11 plus,corel corporation,0
1085,572,adobe dreamweaver cs3 upgrade,adobe,954,adobe dreamweaver cs3 upgrade,,0
1151,1029,westward,encore software,2081,encore software 13781 westward,,0
1201,831,rainbow fish and the whale ( win/mac ),global-software-publishing,2970,rainbow fish and the whale,,0
1289,1334,adobe creative suite cs3 web standard,adobe,566,adobe cs3 web standard,,0


In [36]:
import os

if not os.path.exists('output/data/amazong_google'):
    os.makedirs('output/data/amazon_google')

mislabeled_interim.to_csv('output/data/amazon_google/mislabeled.csv')

In [40]:
doubts_interim = pd.merge(interim_test, doubts_test_set, how='inner', left_index=True, right_index=True)
doubts_interim.drop(['left_text', 'right_text', 'label_y'], axis=1, inplace=True)
doubts_interim.to_csv('output/data/amazon_google/doubts.csv')

In [48]:
from sklearn.metrics import f1_score

f1 = f1_score(label_true, label_pred)
print(f'F1 score with original labels: {f1}')

label_corrected = label_true.copy()
for i in mislabeled: 
    label_corrected[i] = (label_true[i] + 1) % 2
    
f1_corrected = f1_score(label_corrected, label_pred)
print(f'F1 score with corrected labels: {f1_corrected}')

F1 score with original labels: 0.842741935483871
F1 score with corrected labels: 0.8944337811900192
