# Notebook eksperimen skripsi

## Import library and dataset

In [133]:
import pandas as pd
import Levenshtein

In [134]:
# Load the full dataset; lines=True makes pandas read the file as one JSON object per line (JSON Lines)
data = pd.read_json('../data/final_data/data_all.jsonl', lines=True)

## Pre-Processing

In [135]:
# Count the rows per match_type label and work out each label's share of the dataset
match_count = data['match_type'].value_counts()
total_count = len(data)

# A label that is absent from the counts is reported as 0% (no division is attempted for it)
if 'Match' in match_count:
    percentage_match = (match_count['Match'] / total_count) * 100
else:
    percentage_match = 0

if 'Non-Match' in match_count:
    percentage_non_match = (match_count['Non-Match'] / total_count) * 100
else:
    percentage_non_match = 0

# Report the absolute count and the percentage per label, then the total row count
print(f"Jumlah Baris Match: {match_count.get('Match', 0)} ({percentage_match:.2f}%)")
print(f"Jumlah Baris Non-Match: {match_count.get('Non-Match', 0)} ({percentage_non_match:.2f}%)")
print(f"Jumlah Baris: {total_count}")

Jumlah Baris Match: 12125 (52.19%)
Jumlah Baris Non-Match: 11109 (47.81%)
Jumlah Baris: 23234


In [136]:
# Rename the label column to 'match' and encode its values as integers
# (Match -> 1, Non-Match -> 0).
#
# The guard keeps this cell idempotent. Without it, a second run would find
# no 'match_type' column to rename and would push the already-encoded 0/1
# values through the mapping again, which turns every label into NaN
# (Series.map yields NaN for values missing from the dict).
if 'match_type' in data.columns:
    data = data.rename(columns={'match_type': 'match'})
    data['match'] = data['match'].map({'Match': 1, 'Non-Match': 0})
data

Unnamed: 0,c1,c1_context,c2,c2_context,match
0,anz,The Australia and New Zealand Banking Group Li...,ANZ,We provide banking and financial products and ...,1
1,anz,The Australia and New Zealand Banking Group Li...,Australian New Zealand Banking Group,ANZ\r\n- The Australian New Zealand Banking Gr...,1
2,anz,The Australia and New Zealand Banking Group Li...,A.N.Z.,We provide banking and financial products and ...,1
3,anz,The Australia and New Zealand Banking Group Li...,ANZ Bank,ANZ Bank New Zealand Limited operates as a ban...,1
4,anz,The Australia and New Zealand Banking Group Li...,Anz,We provide banking and financial products and ...,1
...,...,...,...,...,...
23229,aerotek,Aerotek\nprovides staffing and services soluti...,AeroVironment,AeroVironment (NASDAQ: AVAV) is a technology s...,0
23230,aerotek,Aerotek\nprovides staffing and services soluti...,Aeropostale,Aéropostale is a specialty retailer of high-qu...,0
23231,michigan state university,Michigan State University is the nation's prem...,Michigan Technological University,Michigan Technological University is a flagshi...,0
23232,michigan state university,Michigan State University is the nation's prem...,Central Michigan University,Central Michigan University is a leading publi...,0


## Baseline Models (Decision Tree)

### Levenshtein Distance

In [137]:
def calculate_levenshtein(row):
    """
    Case-insensitive Levenshtein distance between the two names of a row.

    row: dataframe row holding the two names under 'c1' and 'c2'

    Returns the result of Levenshtein.distance for the lower-cased pair.
    """
    lowered_names = [row[column].lower() for column in ('c1', 'c2')]
    return Levenshtein.distance(*lowered_names)

In [138]:
# Compute the edit distance row by row (axis=1 hands each row to the function).
# NOTE: the column name is spelled 'levensthein_distance'; the feature-selection
# cell further down selects it by this exact spelling.
data['levensthein_distance'] = data.apply(calculate_levenshtein, axis=1)
data

Unnamed: 0,c1,c1_context,c2,c2_context,match,levensthein_distance
0,anz,The Australia and New Zealand Banking Group Li...,ANZ,We provide banking and financial products and ...,1,0
1,anz,The Australia and New Zealand Banking Group Li...,Australian New Zealand Banking Group,ANZ\r\n- The Australian New Zealand Banking Gr...,1,33
2,anz,The Australia and New Zealand Banking Group Li...,A.N.Z.,We provide banking and financial products and ...,1,3
3,anz,The Australia and New Zealand Banking Group Li...,ANZ Bank,ANZ Bank New Zealand Limited operates as a ban...,1,5
4,anz,The Australia and New Zealand Banking Group Li...,Anz,We provide banking and financial products and ...,1,0
...,...,...,...,...,...,...
23229,aerotek,Aerotek\nprovides staffing and services soluti...,AeroVironment,AeroVironment (NASDAQ: AVAV) is a technology s...,0,8
23230,aerotek,Aerotek\nprovides staffing and services soluti...,Aeropostale,Aéropostale is a specialty retailer of high-qu...,0,6
23231,michigan state university,Michigan State University is the nation's prem...,Michigan Technological University,Michigan Technological University is a flagshi...,0,13
23232,michigan state university,Michigan State University is the nation's prem...,Central Michigan University,Central Michigan University is a leading publi...,0,14


### N-Gram Similarity (with Jaccard)
Buat n-gram, lalu hitung similarity-nya menggunakan Jaccard.

In [139]:
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

# Build the character n-grams of a string (bigrams by default)
def ngram_generator(text, n=2):
    """
    Split a string into its overlapping character n-grams.

    text: string to split into n-grams
    n: length of each n-gram (default 2, i.e. bigrams); must be at least 1

    Returns the n-grams as a list in order of appearance. The list is empty
    when `text` is shorter than `n`.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    return [text[i:i+n] for i in range(len(text) - n + 1)]

# Jaccard similarity between the n-gram sets of the two names in a row
def calculate_ngram_similarity(row, n=2):
    """
    Case-insensitive Jaccard similarity of the character n-grams of two names.

    row: dataframe row (or any mapping) holding the names under 'c1' and 'c2'
    n: length of the n-grams (default 2, i.e. bigrams)

    Returns |A & B| / |A | B| as a float between 0.0 and 1.0, where A and B
    are the sets of distinct n-grams of the lower-cased names. Returns 0.0
    when neither name is long enough to produce an n-gram.
    """
    ngrams_c1 = set(ngram_generator(row['c1'].lower(), n))
    ngrams_c2 = set(ngram_generator(row['c2'].lower(), n))

    all_ngrams = ngrams_c1 | ngrams_c2
    if not all_ngrams:
        # Nothing to compare: define the similarity as 0.0 instead of dividing by zero
        return 0.0

    # Working on plain sets gives the same intersection-over-union ratio as
    # binarizing both n-gram lists and scoring the two indicator vectors,
    # without fitting a new binarizer for every row.
    return len(ngrams_c1 & ngrams_c2) / len(all_ngrams)

# Score every row with the n-gram similarity function (axis=1 hands each row to it)
data['ngram_similarity'] = data.apply(calculate_ngram_similarity, axis=1)

### Token Sort Ratio

Token sort ratio tidak peduli urutan kata dalam suatu kalimat, sehingga cocok untuk company matching.

Contoh:
Bank Mandiri - Mandiri Bank

Bank Central Asia - Asia Central Bank

Bayer AG - AG Bayer

Nestlé S.A. - S.A. Nestlé

In [140]:
from fuzzywuzzy import fuzz

# Compare the fuzzywuzzy scorers on one name written in two different word orders
full_name_1 = "Alice B. Johnson"
full_name_2 = "Johnson Alice B."

# Token sort ratio: word order does not matter
token_sort_score = fuzz.token_sort_ratio(full_name_2, full_name_1)
print(f"Token sort ratio similarity score: {token_sort_score}")

# Partial ratio: word order matters
partial_score = fuzz.partial_ratio(full_name_1, full_name_2)
print(f"Partial ratio similarity score: {partial_score}")

# Simple ratio: the strings are compared as written, so the swapped word
# order lowers this score as well (see the recorded output)
simple_score = fuzz.ratio(full_name_1, full_name_2)
print(f"Simple ratio similarity score: {simple_score}")


Token sort ratio similarity score: 100
Partial ratio similarity score: 67
Simple ratio similarity score: 50


In [141]:
# Token sort ratio between the two names of a row
def calculate_token_sort_ratio(row):
    """
    Case-insensitive token sort ratio between the two names of a row.

    row: dataframe row holding the two names under 'c1' and 'c2'

    Returns the result of fuzz.token_sort_ratio for the lower-cased pair.
    """
    lowered_names = [row[column].lower() for column in ('c1', 'c2')]
    return fuzz.token_sort_ratio(*lowered_names)

# Score every row with the token sort ratio function (axis=1 hands each row to it)
data['token_sort_ratio'] = data.apply(calculate_token_sort_ratio, axis=1)

In [142]:
# Keep only the two names, the three string-similarity features and the label;
# the context columns are left out of this frame
selected_data = data[["c1","c2","levensthein_distance","ngram_similarity","token_sort_ratio","match"]]
selected_data

Unnamed: 0,c1,c2,levensthein_distance,ngram_similarity,token_sort_ratio,match
0,anz,ANZ,0,1.000000,100,1
1,anz,Australian New Zealand Banking Group,33,0.030303,15,1
2,anz,A.N.Z.,3,0.000000,75,1
3,anz,ANZ Bank,5,0.333333,55,1
4,anz,Anz,0,1.000000,100,1
...,...,...,...,...,...,...
23229,aerotek,AeroVironment,8,0.214286,50,0
23230,aerotek,Aeropostale,6,0.230769,67,0
23231,michigan state university,Michigan Technological University,13,0.542857,76,0
23232,michigan state university,Central Michigan University,14,0.562500,73,0


## Ensemble Learning

### XGBoost

Combine all features

## Deep Learning

1. Pre-processing context (tanda baca, dll)
2. Tokenization (perlukah ?)
3. Vectorization
4. Training BERT model
5. Testing/Evaluation 

### Tanpa Konteks

### Dengan Konteks
Coba dengan dan tanpa preprocessing context

Notes

Real Use Case: ketika ada 25000an entitas, dan dikasih 1 entitas, diminta untuk matching dia sebenernya entitas yang mana.

Untuk case fauzan: Setelah coba selesaikan binary classif, masuk ke real use case.

- Kendala:
Kalau pakai algo machine learning (XGBoost), gimana cara lookup data.

Ketika punya data baru, bandingin dengan cosine similarity (mana yg dekat)


Evaluations:
- Binary classification: hitung accuracy, dll.
- Evaluasi look up