# 03b - Evaluate Prediction Scores of Fine-tuned T5

In [1]:
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset

from src import data, models, metrics

# Folder with the input CSVs, folder holding the fine-tuned checkpoint,
# and the name of the checkpoint to evaluate.
DATA_DIR = 'data/'
OUTPUT_DIR = 'output/t5/'
MODEL_NAME = 't5_monitors_printers_3epoch'


# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

Device: cuda


## Create T5 model and tokenizer

In [2]:
# Build the T5 wrapper from the fine-tuned checkpoint located at
# OUTPUT_DIR + MODEL_NAME.
t5 = models.T5(pretrained_checkpoint=f'{OUTPUT_DIR}{MODEL_NAME}')

In [3]:
# Report the model's trainable-parameter count, formatted with thousands separators.
print(f'Number of trainable parameters: {t5.num_trainable_params():,}')

Number of trainable parameters: 60,506,624


## Example of Translation

In [4]:
# Sanity check with a generic English-to-German translation prompt. In the recorded
# run the fine-tuned model answers with a "brand | model" string, not a translation.
t5.predict_sample('translate English to German: The house is wonderful.')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'dascom | hp 32'

In [5]:
# Same sanity check with an English-to-French prompt; the recorded output is again
# a "brand | model" string rather than French text.
t5.predict_sample('translate English to French: The house is wonderful.')

'hp inc | housewife p27'

### Cleaning Product Names for Monitors

In [6]:
# Noisy monitor listing (doubled inch mark, spec keywords) sent with the
# 'Clean Monitors:' task prefix.
t5.predict_sample('Clean Monitors: asus rog swift pg35vq 35"" led ultrawide quadhd 200hz g-sync ultimate curva.')

'asus | rog swift pg35vq'

In [7]:
# Non-English monitor listing (looks Romanian) in which the model code appears twice.
t5.predict_sample('Clean Monitors: monitor 49" philips bdl4970el/00 led culoare negru - bdl4970el/00')

'philips | bdl4970el'

In [8]:
# Monitor listing that starts with a Cyrillic-script word before the brand and model code.
t5.predict_sample('Clean Monitors: монитор philips 21.5" 223s7ehmb/00 black')

'philips | 223s7ehmb/00'

### Cleaning Product Names for Printers

In [9]:
# Printer listing sent with the 'Clean Printers:' task prefix. Note that the recorded
# output keeps the product line but omits the 'm553n' model code.
t5.predict_sample('Clean Printers: hp color laserjet enterprise m553n')

'hp inc | color laserjet'

In [10]:
# Printer listing mixing CJK text with a Latin-script brand and model code.
t5.predict_sample('Clean Printers: 惠普hp laser mfp 138pn黑白激光传真多功能一体机有线网络')

'hp inc | laser 138pn'

## Load the Data

In [11]:
# Read the test split of each product category (train/validation splits are
# not loaded in this notebook) and tag every row with its category name.
monitors = load_dataset(
    'csv',
    data_files={'test': DATA_DIR + 'monitors_translation_202107_test.csv'},
)
monitors = data.add_column(monitors, name='type', value='monitors')

printers = load_dataset(
    'csv',
    data_files={'test': DATA_DIR + 'printers_translation_202107_test.csv'},
)
printers = data.add_column(printers, name='type', value='printers')

# Untokenized monitors + printers, combined into a single dataset dict.
datasets = data.concat_datasets(monitors, printers)


# Tokenize each category with its own task prefix, then combine the two
# tokenized sets in the same monitors-then-printers order.
tokenized_monitors = t5.tokenize_dataset(monitors, prefix='Clean Monitors:')
tokenized_printers = t5.tokenize_dataset(printers, prefix='Clean Printers:')
tokenized_datasets = data.concat_datasets(tokenized_monitors, tokenized_printers)

# Display the combined raw dataset as the cell output.
datasets

Using custom data configuration default-6f7ff22b63bbd3bb


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/ec2-user/.cache/huggingface/datasets/csv/default-6f7ff22b63bbd3bb/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

Using custom data configuration default-1b8688a92c6a59f7


Dataset csv downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/csv/default-6f7ff22b63bbd3bb/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/ec2-user/.cache/huggingface/datasets/csv/default-1b8688a92c6a59f7/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/csv/default-1b8688a92c6a59f7/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23. Subsequent calls will reuse this data.


  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['inp', 'trg', 'metadata', 'type'],
        num_rows: 30000
    })
})

## Evaluate the Results

In [12]:
# Run the model over the tokenized test split.
# NOTE(review): bs is presumably the batch size and max_length the cap on
# generated tokens; confirm against src.models.T5.predict.
out = t5.predict(
    tokenized_datasets['test'],
    output_dir='.',
    bs=64,
    output_probs=True,
    max_length=32,
    log_level='error',
)

In [18]:
import ast
import json


def _parse_metadata(raw):
    """Decode one serialized metadata cell.

    The text is first read as a Python literal, so values that contain an
    apostrophe or a double quote (e.g. "Côte d'Ivoire") survive intact.
    Text that is not a valid Python literal falls back to the previous
    behaviour: single quotes are swapped for double quotes and the result
    is parsed as JSON.

    Args:
        raw: metadata string taken from the predictions dataframe.

    Returns:
        The decoded object; the code below expects it to be a dict.

    Raises:
        json.JSONDecodeError: if neither parser accepts the text.
    """
    try:
        # literal_eval only accepts literals, so it is safe on data files.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Blind quote swapping is kept only as a fallback for JSON-style
        # text (true/false/null), which is not valid Python.
        return json.loads(raw.replace('\'', '"'))


# create test dataframe with preds
test_df = data.create_predictions_df(t5.tokenizer, datasets['test'], out)

# decode metadata and extract region and country ('N/A' when the key is absent)
test_df['metadata'] = test_df['metadata'].apply(_parse_metadata)
test_df['region'] = test_df['metadata'].apply(lambda d: d.get('Region', 'N/A'))
test_df['country'] = test_df['metadata'].apply(lambda d: d.get('Country', 'N/A'))
# the decoded dicts are no longer needed once the two columns are extracted
test_df = test_df.drop(columns=['metadata'])

test_df

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
0,lg 20mp48a-p 20″ ips led monitor,lg electronics | 20mp48a-p,monitors,lg electronics | 20mp48a-p,0.930942,"[0.99999595, 0.93094563]",1,1.00,1.00,APeJC,Bangladesh
1,"monitors dell p2419h, 23.8"", 5 ms",dell | p2419h,monitors,dell | p2419h,0.999882,"[0.99999976, 0.99988234]",1,1.00,1.00,Central & Eastern Europe,Latvia
2,lenovo / レノボ thinkvision t23d-10 61c3mar6jp,lenovo | t23d-10,monitors,lenovo | t23d-10,0.874605,"[0.9999993, 0.87460583]",1,1.00,1.00,Japan,Japan
3,hp n246v 23 8 ips display monitor 4616838,hp inc | n246v,monitors,hp inc | n246v,0.998867,"[0.9999962, 0.9988711]",1,1.00,1.00,Middle East & Africa,Nigeria
4,iiyama xb2481hs-b1,iiyama | xb2481hs,monitors,iiyama | xb2481hs,0.979453,"[0.9999949, 0.979458]",1,1.00,1.00,Central & Eastern Europe,Hungary
...,...,...,...,...,...,...,...,...,...,...,...
29995,mf216ns レーザー複合機 satera（サテラ）,canon | mf216n,printers,canon | imageclass mf216n,0.609783,"[0.9999982, 0.60978407]",0,0.56,0.75,Japan,
29996,oki data mc873dn - multifunction a3 color prin...,oki | mc873dn,printers,oki | mc873dn,0.999613,"[0.99999547, 0.999618]",1,1.00,1.00,Japan,
29997,samsung clp-680dwnordic laserprinter - farve -...,samsung | clp-680dw,printers,samsung | clp-680dw,0.999833,"[0.99999785, 0.9998349]",1,1.00,1.00,Western Europe,
29998,canon pixma ts8270 (white) printer,canon | pixma ts8270,printers,canon | pixma ts8270,0.999664,"[1.0, 0.9996637]",1,1.00,1.00,APeJC,


### Macro Averaged Metrics

In [20]:
# Column-wise average of the three score columns over every test row.
test_df.loc[:, ['text_accuracy', 'levenshtein_score', 'jaccard_index']].mean(axis=0)

text_accuracy        0.947333
levenshtein_score    0.988850
jaccard_index        0.976785
dtype: float64

In [21]:
def agg(group):
    """Summarise one group of prediction rows.

    Args:
        group: dataframe holding the 'text_accuracy', 'levenshtein_score'
            and 'jaccard_index' columns.

    Returns:
        A Series with the mean of each of those columns plus a 'support'
        entry holding the number of rows in the group.
    """
    metric_columns = ['text_accuracy', 'levenshtein_score', 'jaccard_index']
    summary = group.loc[:, metric_columns].mean()
    # Append the group size so small groups can be filtered out later.
    summary.loc['support'] = len(group)
    return summary


# Per-product-type breakdown: one row of averaged scores plus support for each 'type' value.
test_df.groupby('type').apply(agg)

Unnamed: 0_level_0,text_accuracy,levenshtein_score,jaccard_index,support
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
monitors,0.937467,0.986737,0.971619,15000.0
printers,0.9572,0.990964,0.981952,15000.0


### Prediction Report across Regions and Countries

In [22]:
# Score summary per region. Regions with 10 or fewer test rows are hidden,
# and rows are ordered from lowest to highest text_accuracy.
region_report = test_df.groupby(by='region').apply(agg)
(
    region_report
    .loc[region_report['support'].gt(10)]
    .sort_values(by='text_accuracy')
)

Unnamed: 0_level_0,text_accuracy,levenshtein_score,jaccard_index,support
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PRC,0.90543,0.980922,0.956504,2707.0
USA,0.920194,0.982225,0.964489,1441.0
Central & Eastern Europe,0.930098,0.986534,0.968311,5007.0
Japan,0.933502,0.983758,0.972003,1579.0
Canada,0.937126,0.985814,0.974294,668.0
Middle East & Africa,0.943515,0.986731,0.975459,478.0
APeJC,0.952794,0.988854,0.979445,6461.0
Latin America,0.956633,0.992241,0.982448,1568.0
CEMA,0.968625,0.993372,0.986311,2741.0
Western Europe,0.969252,0.993742,0.98668,7350.0


In [23]:
# Score summary per country. Countries with 10 or fewer test rows are hidden,
# and rows are ordered from lowest to highest text_accuracy.
country_report = test_df.groupby(by='country').apply(agg)
(
    country_report
    .loc[country_report['support'].gt(10)]
    .sort_values(by='text_accuracy')
)

Unnamed: 0_level_0,text_accuracy,levenshtein_score,jaccard_index,support
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Israel,0.875,0.95668,0.933333,16.0
Korea,0.881517,0.961919,0.950632,211.0
Japan,0.88854,0.977161,0.948639,637.0
Nigeria,0.897436,0.990412,0.95641,39.0
Kazakhstan,0.897436,0.979258,0.962393,39.0
United Arab Emirates,0.9,0.974758,0.959074,60.0
Ukraine,0.901734,0.979097,0.954046,519.0
Egypt,0.906977,0.977105,0.957558,43.0
Romania,0.911111,0.981456,0.958034,585.0
Colombia,0.916667,0.993531,0.961111,36.0


### Prediction Examples

In [25]:
# Ten rows from the top of the table when ordered by descending Levenshtein score.
(
    test_df
    .sort_values(by='levenshtein_score', ascending=False)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
0,lg 20mp48a-p 20″ ips led monitor,lg electronics | 20mp48a-p,monitors,lg electronics | 20mp48a-p,0.930942,"[0.99999595, 0.93094563]",1,1.0,1.0,APeJC,Bangladesh
19802,브랜드명 삼성전자 상품명 sl-j3520w 잉크포함 잉크젯프린터기 dt,samsung | sl-j3520w,printers,samsung | sl-j3520w,0.999856,"[0.9999924, 0.99986374]",1,1.0,1.0,APeJC,
19800,мултифункционално мастиленоструйно цветно устр...,canon | pixma mg5750,printers,canon | pixma mg5750,0.999958,"[1.0, 0.99995816]",1,1.0,1.0,CEMA,
19799,"canon lbp351x network printer (printing only, ...",canon | lbp351x,printers,canon | lbp351x,0.99568,"[0.9999999, 0.9956799]",1,1.0,1.0,APeJC,
19798,lexmark - xc4140,lexmark | xc4140,printers,lexmark | xc4140,0.998558,"[0.99999964, 0.9985587]",1,1.0,1.0,Western Europe,
19797,"lexmark cx522ade spalvotas, spalvotas lazerini...",lexmark | cx522ade,printers,lexmark | cx522ade,0.999498,"[0.99999976, 0.9994978]",1,1.0,1.0,CEMA,
19796,epson epson ecotank l3050 inkjet printer,epson | l3050,printers,epson | l3050,0.99935,"[0.99998105, 0.9993688]",1,1.0,1.0,CEMA,
19795,"kyocera ecosys m2040dn, multifunktionsdrucker ...",kyocera document solutions | ecosys m2040dn,printers,kyocera document solutions | ecosys m2040dn,0.99991,"[0.99999535, 0.99991477]",1,1.0,1.0,Western Europe,
19794,brother dcp-t710w wireless adf printer inkjet ...,brother | dcp-t710w,printers,brother | dcp-t710w,0.999781,"[1.0, 0.9997809]",1,1.0,1.0,APeJC,
19793,epson ecotank l810 rašalinis spausdintuvas,epson | l810,printers,epson | l810,0.999522,"[0.99999213, 0.99952996]",1,1.0,1.0,CEMA,


In [26]:
# Imperfect predictions (Levenshtein score below 1) made with probability
# under 0.9, worst Levenshtein score first.
(
    test_df
    .loc[test_df['levenshtein_score'].lt(1) & test_df['prob'].lt(0.9)]
    .sort_values(by='levenshtein_score', ascending=True)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
4485,"24"" ips 1080p ergonomic frameless monitor hdmi...","viewsonic | turbo-x monitor 27"" m2735 frameless",monitors,hp inc | hp 24fw,0.217212,"[0.88254136, 0.24612123]",0,0.170213,0.1,USA,USA
1660,benq zowie 24.5in fhd tn 240hz dyac+ freesync ...,benq | 2879 uhd freesync gaming,monitors,benq | xl2546k,0.837055,"[0.99999833, 0.8370567]",0,0.225806,0.285714,APeJC,Australia
13391,aoc 27in qhd ips 75hz freesync frameless monit...,aoc | q27t1,monitors,aoc | 2879 uhd freesync gaming,0.436789,"[0.9999969, 0.43678996]",0,0.233333,0.285714,APeJC,Australia
11046,aoc 31.5in qhd 144hz freesync curved gaming mo...,aoc | cq32g2e,monitors,aoc | 2879 uhd freesync gaming,0.566263,"[0.99999774, 0.56626403]",0,0.233333,0.285714,APeJC,Australia
7130,aoc c24g2 165hz fhd curved freesync va gaming ...,aoc | 2879 uhd freesync gaming,monitors,aoc | c24g2,0.848324,"[0.9999975, 0.8483266]",0,0.233333,0.285714,APeJC,Singapore
13787,"dell c-series 54.6"" 4k 3840x2160 conference ro...","dell | othersledtuneless28""w ips 3840x2160",monitors,dell | starring 24-inch full-hd monitor,0.572416,"[0.9999975, 0.5724169]",0,0.238095,0.222222,APeJC,New Zealand
23786,ricoh sp 150su 3 in 1 a4 high speed mono laser...,ricoh | sp 150su,printers,canon | imageclass mf221d,0.319093,"[0.5012545, 0.63658863]",0,0.24,0.142857,APeJC,
5712,"nec 48"" ceiling 4-display menu board",nec | pro display xdr,monitors,nec | samsung bdm4065uc,0.000251,"[0.9999994, 0.0002507089]",0,0.26087,0.285714,USA,USA
25838,epson l3150 wireless mobile wifi ink tank prin...,epson | l3150,printers,brother | dcp-t510w,0.76388,"[0.91341907, 0.83628637]",0,0.263158,0.2,APeJC,
25176,理光2014 d ad黑白激光a4a3打印机一体机复印机扫描替代1813l企业采购 2014...,ricoh | mp 2014d,printers,kyocera document solutions | taskalfa 2014d,0.676684,"[0.6814807, 0.99296105]",0,0.27907,0.25,PRC,


In [27]:
# Imperfect predictions (Levenshtein score below 1) made with probability of
# at least 0.9, worst Levenshtein score first.
(
    test_df
    .loc[test_df['levenshtein_score'].lt(1) & test_df['prob'].ge(0.9)]
    .sort_values(by='levenshtein_score', ascending=True)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
879,spot delivery aoc monitor aoc 21.5” led monito...,"aoc | daewoo 21.5"" led monitor",monitors,aoc | 22b1hs,0.983521,"[0.9999924, 0.98352826]",0,0.233333,0.285714,APeJC,Philippines
315,"asus vp248h gaming monitor 24 inch, full hd, 1...",asus | starring 24-inch full-hd monitor,monitors,asus | vp248h,0.958553,"[1.0, 0.9585531]",0,0.25641,0.285714,APeJC,Singapore
10032,"【original product】asus vp278qg 27"" freesync ga...",asus | 2879 uhd freesync gaming,monitors,asus | vp278qg,0.996069,"[0.9999993, 0.99606997]",0,0.258065,0.285714,APeJC,Philippines
6752,benq 32in qhd curved 144hz hdr freesync 2 gami...,benq | 2879 uhd freesync gaming,monitors,benq | ex3203r,0.978614,"[0.9999957, 0.978618]",0,0.258065,0.285714,APeJC,Australia
13514,"philips lcd 65"" b-line 16/7, d-led, ips, 3840x...","philips | othersledtuneless28""w ips 3840x2160",monitors,philips | 65bfl2114,0.996113,"[0.9999927, 0.99612075]",0,0.288889,0.333333,Central & Eastern Europe,Czech Republic
13897,"asus vp28uqgl 28"" 4k uhd freesync gaming monitor",asus | 2879 uhd freesync gaming,monitors,asus | vp28uqg,0.991476,"[1.0, 0.99147606]",0,0.290323,0.285714,Western Europe,United Kingdom
29827,京瓷 p1025,kyocera document solutions | ecosys p1025,printers,pantum | p1025,0.994077,"[0.9945008, 0.99957454]",0,0.292683,0.285714,PRC,
2020,"hp v22 21.5"" led monitor (9sv79aa, tn, 1920 x ...","hp inc | daewoo 21.5"" led monitor",monitors,hp inc | v22,0.998388,"[0.9999975, 0.99839]",0,0.30303,0.375,APeJC,Malaysia
16837,京瓷 p2135d,kyocera document solutions | ecosys p2135d,printers,pantum | p2135d,0.972446,"[0.99731106, 0.9750686]",0,0.309524,0.285714,PRC,
891,viewsonic 22” 1080p 75hz frameless ips monitor...,"viewsonic | turbo-x monitor 27"" m2735 frameless",monitors,viewsonic | va2232-h,0.986582,"[0.9999999, 0.98658246]",0,0.319149,0.25,APeJC,Singapore


In [28]:
# Same high-probability imperfect predictions, restricted to rows whose
# 'type' is 'printers', worst Levenshtein score first.
(
    test_df
    .loc[
        test_df['levenshtein_score'].lt(1)
        & test_df['prob'].ge(0.9)
        & test_df['type'].eq('printers')
    ]
    .sort_values(by='levenshtein_score', ascending=True)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
29827,京瓷 p1025,kyocera document solutions | ecosys p1025,printers,pantum | p1025,0.994077,"[0.9945008, 0.99957454]",0,0.292683,0.285714,PRC,
16837,京瓷 p2135d,kyocera document solutions | ecosys p2135d,printers,pantum | p2135d,0.972446,"[0.99731106, 0.9750686]",0,0.309524,0.285714,PRC,
24159,京瓷 p4040dn,kyocera document solutions | ecosys p4040dn,printers,pantum | p4040dn,0.9592,"[0.9815513, 0.9772282]",0,0.325581,0.285714,PRC,
26158,京瓷 p2235dn,kyocera document solutions | ecosys p2235dn,printers,pantum | p2235dn,0.992875,"[0.9942345, 0.99863285]",0,0.325581,0.285714,PRC,
19713,京瓷 p3050dn,kyocera document solutions | ecosys p3050dn,printers,pantum | p3050dn,0.986827,"[0.9883691, 0.99844027]",0,0.325581,0.285714,PRC,
20595,佳能 d1150,canon | imageclass d1150,printers,sindoh | d1150,0.999171,"[0.99992454, 0.9992462]",0,0.375,0.4,PRC,
23254,富士施乐s2520nda和s2320nd系列施乐复印机黑白a3打印机一体机扫描 s2320n...,xerox | 32,printers,xerox | docucentre s2320nd,0.974015,"[0.99599254, 0.9779342]",0,0.384615,0.4,PRC,
21080,hp 8500a(a910n),hp inc | 910,printers,hp inc | officejet 8500a,0.94475,"[0.9999455, 0.9448011]",0,0.416667,0.5,PRC,
26016,d 2630,hp inc | deskjet 2630,printers,sindoh | d 2630,0.970894,"[0.98963845, 0.98105925]",0,0.428571,0.285714,CEMA,
28512,ricoh sp c841dn,oki | c841dn,printers,ricoh | sp c841dn,0.997798,"[0.9996913, 0.9981059]",0,0.529412,0.4,Western Europe,


In [29]:
# Imperfect predictions (Levenshtein score below 1) ordered by ascending
# probability; the first ten are shown.
(
    test_df
    .loc[test_df['levenshtein_score'].lt(1)]
    .sort_values(by='prob', ascending=True)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
5712,"nec 48"" ceiling 4-display menu board",nec | pro display xdr,monitors,nec | samsung bdm4065uc,0.000251,"[0.9999994, 0.0002507089]",0,0.26087,0.285714,USA,USA
22104,ricoh equipos multifuncion laser color ricoh s...,ricoh | sp c261sfnw,printers,ricoh | sp c2730dn,0.00684,"[0.9988485, 0.006848003]",0,0.736842,0.6,Western Europe,
6656,"nec led monitor - 4k - 27""","nec | daewoo 20"" led monitor",monitors,nec | acer group | b27,0.010037,"[1.0, 0.06980359, 0.14391625]",0,0.321429,0.222222,USA,USA
29343,multifuncional jato de tinta canon pixma maxx ...,canon | pixma g3110,printers,canon | pixma tini2440,0.017541,"[0.99999964, 0.017541131]",0,0.681818,0.6,Latin America,
22119,impresora inyección tinta color canon pixma ...,canon | pixma mg3650s,printers,canon | pixma ts3140,0.040045,"[0.99999917, 0.040045228]",0,0.761905,0.6,Western Europe,
15812,impresora multifuncional tinta color canon pix...,canon | pixma ts6350,printers,canon | pixma ts3140,0.048621,"[0.9999993, 0.048621032]",0,0.85,0.6,Western Europe,
21910,ecosys a4対応モノクロレーザープリンター 45ppm,kyocera document solutions | ecosys p3045dn,printers,kyocera document solutions | ecosys p4530dn,0.055352,"[0.99997866, 0.05535366]",0,0.906977,0.714286,Japan,
191,"nec display 55"" thin-depth commercial display",nec | pro display xdr,monitors,nec | adtechno display,0.077111,"[1.0, 0.07711126]",0,0.5,0.5,USA,USA
29412,epson workforce 30 color inkjet printer (c11ca...,epson | workforce 30,printers,epson | workforce wf-c11ca19201,0.078111,"[0.9999051, 0.07811792]",0,0.612903,0.6,USA,
29348,xerox promo 2 of c505/s printer,xerox | versalink c505/s,printers,xerox | promo 2nd c505/s,0.07993,"[0.99268115, 0.08051964]",0,0.666667,0.5,Canada,


In [30]:
# Rows whose Jaccard index is below 1, lowest index first; the first ten are shown.
(
    test_df
    .loc[test_df['jaccard_index'].lt(1)]
    .sort_values(by='jaccard_index', ascending=True)
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
4485,"24"" ips 1080p ergonomic frameless monitor hdmi...","viewsonic | turbo-x monitor 27"" m2735 frameless",monitors,hp inc | hp 24fw,0.217212,"[0.88254136, 0.24612123]",0,0.170213,0.1,USA,USA
19076,沖データ　ビジネスledモノクロプリンタ　白　b801n,oki | b801n,printers,konica minolta | bizhub 801n,0.796221,"[0.8871982, 0.89745617]",0,0.357143,0.142857,Japan,
23786,ricoh sp 150su 3 in 1 a4 high speed mono laser...,ricoh | sp 150su,printers,canon | imageclass mf221d,0.319093,"[0.5012545, 0.63658863]",0,0.24,0.142857,APeJC,
24511,mc250fwb colour laser multifunction - a4 colou...,ricoh | m c250fwb,printers,oki | mc250fwb,0.897693,"[0.9057211, 0.99113625]",0,0.647059,0.166667,Western Europe,
16493,[캐논] 1a 캐논 fax-l170 흑백 레이저 인쇄 팩스 팩시밀리,canon | fax l170,printers,brother | fax-l170,0.91986,"[0.9358183, 0.98294735]",0,0.555556,0.166667,APeJC,
22199,沧田中税ts620kⅱ24针平推针式打印机连续打印增值税发票营改增出库单快递单票据二维码 官...,sealand | ts620k,printers,canon | pixma ts620kii,0.52023,"[0.53808916, 0.9668106]",0,0.409091,0.166667,PRC,
15677,[캐논코리아비즈니스솔루션] fax-l170 레이저팩스 프린터 팩스,canon | fax l170,printers,brother | fax-l170,0.978146,"[0.9888482, 0.98917747]",0,0.555556,0.166667,APeJC,
25458,[캐논] 캐논 fax-l150 흑백 레이저 복합기 정품/인쇄/a4/prin,canon | fax l150,printers,brother | fax-l150,0.785139,"[0.92220354, 0.85137266]",0,0.555556,0.166667,APeJC,
17558,ml421 black 9pin wide 570cps epson ibm par/usb,oki | microline 421,printers,epson | ml421,0.844945,"[0.99776477, 0.84683746]",0,0.315789,0.166667,Canada,
10012,rx-22e,ag neovo | rx-22,monitors,benq | rx-22e,0.486953,"[0.4972534, 0.9792853]",0,0.5,0.166667,Western Europe,Denmark


In [31]:
# Rows with a Jaccard index of exactly 1 but a Levenshtein score below 1,
# in their existing table order; the first ten are shown.
(
    test_df
    .loc[test_df['levenshtein_score'].lt(1) & test_df['jaccard_index'].eq(1)]
    .head(10)
)

Unnamed: 0,inp,trg,type,pred,prob,prob_ext,text_accuracy,levenshtein_score,jaccard_index,region,country
486,[msi] optix g32cq4 커브드 게이밍 165 아이세이버 [무결점],msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.764089,"[0.9999895, 0.76409715]",0,0.818182,1.0,APeJC,Korea
617,msi 옵틱스 g32cq4 커브드 게이밍 144 아이세이버 모니터 (무결점),msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.895467,"[0.9999901, 0.89547545]",0,0.818182,1.0,APeJC,Korea
918,"31,5"" msi optix g32cq4",msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.644973,"[0.999995, 0.64497626]",0,0.818182,1.0,Central & Eastern Europe,Slovakia
2463,"msi optix g24vc 23.6"" curved gaming monitor (f...",msi | optix g24vc,monitors,msi | msi optix g24vc,0.598999,"[0.9999945, 0.59900266]",0,0.809524,1.0,APeJC,Malaysia
3252,msi optix g32cq4 31.5inch va curved wqhd 165hz...,msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.716287,"[0.99999404, 0.7162917]",0,0.818182,1.0,APeJC,Australia
3788,msi optix g27c2 full hd curved screen led lcd ...,msi | msi optix g27c2,monitors,msi | optix g27c2,0.648259,"[0.9999913, 0.6482643]",0,0.809524,1.0,Canada,Canada
4376,"msi 32"" curved gaming wqhd monitor g32cq4",msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.701266,"[0.9999906, 0.7012731]",0,0.818182,1.0,APeJC,Australia
5586,monitorius msi optix g241,msi | msi optix g241,monitors,msi | optix g241,0.919612,"[0.99997735, 0.91963285]",0,0.8,1.0,Central & Eastern Europe,Lithuania
8885,"msi optix g241 24"" ips gaming 144hz monitor",msi | msi optix g241,monitors,msi | optix g241,0.8568,"[0.99999475, 0.85680485]",0,0.8,1.0,APeJC,Sri Lanka
8918,(주사율165hz/wqhd/32형) msi 옵틱스 g32cq4 커브드 게이밍 165...,msi | msi optix g32cq4,monitors,msi | optix g32cq4,0.826543,"[0.9999671, 0.8265699]",0,0.818182,1.0,APeJC,Korea
