# Notebook eksperimen skripsi

## Import library and dataset

In [3]:
import pandas as pd
import Levenshtein
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
from fuzzywuzzy import fuzz
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.spatial.distance import cosine


In [2]:
# Load the full dataset from a JSON Lines file (one JSON record per line) into a DataFrame.
data = pd.read_json('../data/final_data/data_all.jsonl', lines=True)

## Pre-Processing

In [None]:
# Tally how many rows carry each match_type label and report each label's share.
total_count = len(data)
match_count = data['match_type'].value_counts()

# Share of each label in percent; a label that never occurs counts as 0.
if 'Match' in match_count:
    percentage_match = (match_count['Match'] / total_count) * 100
else:
    percentage_match = 0

if 'Non-Match' in match_count:
    percentage_non_match = (match_count['Non-Match'] / total_count) * 100
else:
    percentage_non_match = 0

n_match = match_count.get('Match', 0)
n_non_match = match_count.get('Non-Match', 0)
print(f"Jumlah Baris Match: {n_match} ({percentage_match:.2f}%)")
print(f"Jumlah Baris Non-Match: {n_non_match} ({percentage_non_match:.2f}%)")
print(f"Jumlah Baris: {total_count}")

In [None]:
# Rename the label column to 'match' and encode it as 1 (Match) / 0 (Non-Match).
# Guarded on the presence of 'match_type' so that re-running this cell is a no-op:
# once the column holds 0/1, mapping it through the dict again would replace
# every label with NaN, because values missing from the dict map to NaN.
if 'match_type' in data.columns:
    data.rename(columns={'match_type': 'match'}, inplace=True)
    data['match'] = data['match'].map({'Match': 1, 'Non-Match': 0})
data

## Baseline Models (Decision Tree)

### Levenshtein Distance

In [6]:
def calculate_levenshtein(row):
    """Return the Levenshtein edit distance between the lower-cased 'c1' and 'c2' of *row*."""
    left, right = (row[column].lower() for column in ('c1', 'c2'))
    return Levenshtein.distance(left, right)

# One distance per row, stored as a new feature column.
data['levensthein_distance'] = data.apply(calculate_levenshtein, axis=1)


**Data Split and Model Training**

In [None]:
# Single-feature baseline: Levenshtein distance -> match label.
target = data['match']
features = data[['levensthein_distance']]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree with default hyper-parameters (fit() returns the estimator).
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

### N-Gram Similarity (with Jaccard)
Buat n-gram, lalu cari distancenya menggunakan Jaccard

In [8]:
# Build character n-grams (bigrams by default).
def ngram_generator(text, n=2):
    """
    Split *text* into overlapping character n-grams.

    text: string to split into n-grams
    n: length of each n-gram (must be >= 1)

    Returns the n-grams in order of appearance; the list is empty when
    *text* is shorter than *n*.
    Raises ValueError when *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    return [text[i:i+n] for i in range(len(text) - n + 1)]


# Character n-gram Jaccard similarity of one (c1, c2) pair.
def calculate_ngram_similarity(row, n=2):
    """
    Return the Jaccard similarity of the n-gram sets of row['c1'] and row['c2'].

    row: DataFrame row (or any mapping) holding the strings 'c1' and 'c2'
    n: n-gram length (default 2, i.e. bigrams)

    Both strings are lower-cased first. The result is
    |A & B| / |A | B| over the two sets of distinct n-grams, a float in [0, 1].
    """
    grams_c1 = set(ngram_generator(row['c1'].lower(), n))
    grams_c2 = set(ngram_generator(row['c2'].lower(), n))

    # Computing the ratio on sets gives the same value as binarizing both
    # n-gram lists and scoring the two indicator rows, without building a
    # binarizer for every row.
    union = grams_c1 | grams_c2
    if not union:
        # Both strings are shorter than n, so there is nothing to compare.
        return 0.0
    return len(grams_c1 & grams_c2) / len(union)

# Compute the n-gram Jaccard similarity for every row and store it as a feature column.
data['ngram_similarity'] = data.apply(calculate_ngram_similarity, axis=1)

**Data Split and Model Training**

In [None]:
# Single-feature baseline: n-gram Jaccard similarity -> match label.
target = data['match']
features = data[['ngram_similarity']]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree with default hyper-parameters (fit() returns the estimator).
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

### Token Sort Ratio

Token sort ratio tidak peduli urutan kata dalam suatu kalimat, sehingga cocok untuk company matching.

Contoh:
Bank Mandiri - Mandiri Bank

Bank Central Asia - Asia Central Bank

Bayer AG - AG Bayer

Nestlé S.A. - S.A. Nestlé

In [None]:
# Compare three fuzzywuzzy scorers on the same name written in two word orders.
full_name_1 = "Alice B. Johnson"
full_name_2 = "Johnson Alice B."

# token_sort_ratio sorts the tokens before comparing, so word order is irrelevant.
token_sort_score = fuzz.token_sort_ratio(full_name_2, full_name_1)
print(f"Token sort ratio similarity score: {token_sort_score}")

# partial_ratio scores the best-matching substring, so word order still matters.
partial_score = fuzz.partial_ratio(full_name_1, full_name_2)
print(f"Partial ratio similarity score: {partial_score}")

# ratio compares the full strings as written, so reordered words lower the score.
simple_score = fuzz.ratio(full_name_1, full_name_2)
print(f"Simple ratio similarity score: {simple_score}")


In [11]:
# Partial-ratio similarity feature for each (c1, c2) pair.
# NOTE(review): the surrounding section describes token sort ratio, but this
# function calls fuzz.partial_ratio — confirm whether fuzz.token_sort_ratio
# was intended.
def calculate_token_partial_ratio(row):
    """Return fuzz.partial_ratio of the lower-cased 'c1' and 'c2' values of *row*."""
    name_a, name_b = (row[column].lower() for column in ('c1', 'c2'))
    return fuzz.partial_ratio(name_a, name_b)

# One score per row, stored as a new feature column.
data['token_partial_ratio'] = data.apply(calculate_token_partial_ratio, axis=1)

**Data Split and Model Training**

In [None]:
# Single-feature baseline: fuzzywuzzy partial ratio -> match label.
target = data['match']
features = data[['token_partial_ratio']]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree with default hyper-parameters (fit() returns the estimator).
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

# Dataframe final

In [None]:
# Keep the raw name pair, the three similarity features and the label.
final_columns = [
    "c1",
    "c2",
    "levensthein_distance",
    "ngram_similarity",
    "token_partial_ratio",
    "match",
]
selected_data = data[final_columns]
selected_data

## Ensemble Learning

Menambahkan fitur baru

In [None]:
# Add word-count and character-count features for both name columns.
for name_col in ('c1', 'c2'):
    # Words are whitespace-separated tokens.
    data[f'number_of_words_{name_col}'] = data[name_col].apply(lambda name: len(name.split()))
    data[f'number_of_characters_{name_col}'] = data[name_col].apply(lambda name: len(name))

data

### XGBoost

Combine all feature

In [None]:
# XGBoost on the three string-similarity features combined.
feature_columns = ['levensthein_distance', 'ngram_similarity', 'token_partial_ratio']
target = data['match']
features = data[feature_columns]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Train an XGBoost classifier with log loss as its evaluation metric.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# XGBoost on the similarity features plus the word/character counts of both names.
feature_columns = [
    'levensthein_distance',
    'ngram_similarity',
    'token_partial_ratio',
    'number_of_words_c1',
    'number_of_words_c2',
    'number_of_characters_c1',
    'number_of_characters_c2',
]
target = data['match']
features = data[feature_columns]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Train an XGBoost classifier with log loss as its evaluation metric.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## Deep Learning

1. Pre-processing context (tanda baca, dll)
2. Tokenization (perlukah ?)
3. Vectorization
4. Training BERT model
5. Testing/Evaluation 

### Tanpa Konteks

#### Buat Fitur

BERT Embeddings (c1,c2)

In [None]:
# NOTE(review): this cell is disabled, yet later cells read data['c1_embedding'] and
# data['c2_embedding']; within this notebook the assignments below are the only
# place that creates those columns. Re-enable this cell (or confirm the columns
# already exist in the input file) before running the notebook top to bottom.
# from transformers import BertTokenizer, BertModel
# import torch

# # Load the BERT tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # Get the embedding of a text using BERT
# def get_embedding(text):
#     inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     # Take the embedding of the [CLS] token (index 0)
#     return outputs.last_hidden_state[:, 0, :].numpy()

# # Compute the embeddings for columns c1 and c2
# data['c1_embedding'] = data['c1'].apply(lambda x: get_embedding(x))
# data['c2_embedding'] = data['c2'].apply(lambda x: get_embedding(x))

BERT Embeddings (c1_context, c2_context)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# Note: this rebinds the notebook-level name `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


def get_embedding(text):
    """Return the last-layer hidden state at position 0 ([CLS]) for *text* as a NumPy array."""
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Keep the batch axis; select token index 0 of every sequence.
    return hidden_states[:, 0, :].numpy()


# Embed the context columns row by row: 'c1_context' -> 'c1_context_embedding', etc.
for context_col in ('c1_context', 'c2_context'):
    data[f'{context_col}_embedding'] = data[context_col].apply(get_embedding)

In [16]:
# Make sure every embedding cell holds a NumPy array.
# NOTE(review): within this notebook 'c1_embedding' / 'c2_embedding' are only created
# by the commented-out "BERT Embeddings (c1,c2)" cell; unless the input file already
# contains them, they will be missing here on a fresh top-to-bottom run.
for embedding_col in (
    'c1_context_embedding',
    'c2_context_embedding',
    'c1_embedding',
    'c2_embedding',
):
    data[embedding_col] = data[embedding_col].apply(lambda emb: np.array(emb))

In [17]:
# Checkpoint the DataFrame (features and embeddings) as JSON Lines, one record per line.
data.to_json('../data/final_data/temp.jsonl', orient='records', lines=True)

In [4]:
# Reload the checkpoint written to temp.jsonl, replacing the in-memory DataFrame.
# NOTE(review): after the JSON round trip the embedding cells are presumably nested
# lists rather than NumPy arrays — verify before relying on array methods.
data = pd.read_json('../data/final_data/temp.jsonl', lines=True)

Cosine Similarity

In [9]:
# Cosine similarity between two embeddings.
def cosine_similarity(c1_embedding, c2_embedding):
    """Return 1 minus the SciPy cosine distance of the two embeddings, each flattened to 1-D."""
    vec_a, vec_b = (
        np.array(embedding).flatten()
        for embedding in (c1_embedding, c2_embedding)
    )
    distance = cosine(vec_a, vec_b)
    return 1 - distance

# Row-wise cosine similarity between the c1 and c2 embeddings.
data['cosine_similarity'] = data.apply(
    lambda pair: cosine_similarity(pair['c1_embedding'], pair['c2_embedding']),
    axis=1,
)

# Subset used for the no-context experiment: names, embeddings, similarity and label.
no_context_columns = ['c1', 'c2', 'c1_embedding', 'c2_embedding', 'cosine_similarity', 'match']
deep_learning_no_context = data[no_context_columns]


In [None]:
# Display the no-context DataFrame as the cell output.
deep_learning_no_context

In [11]:
# Persist the no-context DataFrame as JSON Lines, one record per line.
deep_learning_no_context.to_json('../data/final_data/deep_learning_no_context.jsonl', orient='records', lines=True)

Selain cosine similarity bisa tambahkan:
- L1 (Manhattan Distance)
- L2 (Euclidean Distance)
- L-infinity (Chebyshev Distance)


### Dengan Konteks
Coba dengan dan tanpa preprocessing context

Notes

Real Use Case: ketika ada 25000an entitas, dan dikasih 1 entitas, diminta untuk matching dia sebenernya entitas yang mana.

Untuk case fauzan: Setelah coba selesaikan binary classif, masuk ke real use case.

-Kendala:
Kalau pakai algo machine learning (XGBoost), gimana cara lookup data.

Ketika punya data baru, bandingin dengan cosine similarity (mana yg dekat)


Evaluations:
- Binary classification hitung accuracy, dll
- Evaluasi look up

Notes

Model bert yang udah di pretrained, 

2500 entity (embedded) di store di numpy array untuk lookup. (metrics pake precision @1, @10 & recall)