In [None]:
# ============================================
# 1) Mount Google Drive
# ============================================
# Exposes the user's Drive under /content/drive; the CSV paths used by the
# cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ============================================
# 2) Install additional libraries
# ============================================
# NOTE(review): package versions are not pinned here.
!pip install --quiet rapidfuzz fuzzywuzzy python-Levenshtein nltk


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m2.0/3.1 MB[0m [31m28.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/161.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ============================================
# 3) Import libraries
# ============================================
import pandas as pd
import difflib
import nltk
from rapidfuzz.distance import Hamming as rf_hamming
import Levenshtein

# NOTE(review): no cell visible in this notebook tokenizes text, so the
# "punkt" data looks unused — confirm before removing the download.
nltk.download("punkt", quiet=True)

True

In [None]:
# ============================================
# 4) Fungsi Hamming Distance
# ============================================
def hamming_fuzzywuzzy(s1: str, s2: str) -> int:
    """Return the Hamming distance between ``s1`` and ``s2``.

    Thin wrapper: both arguments are handed unchanged to
    ``Levenshtein.hamming`` (the ``Levenshtein`` module imported by this
    notebook) and its result is returned as-is.
    """
    distance = Levenshtein.hamming(s1, s2)
    return distance

def hamming_rapidfuzz(s1: str, s2: str) -> int:
    """Return the Hamming distance reported by RapidFuzz for ``s1``/``s2``.

    Delegates to ``rapidfuzz.distance.Hamming.distance`` (imported in this
    notebook as ``rf_hamming``) and returns its result unchanged.
    """
    result = rf_hamming.distance(s1, s2)
    return result

def hamming_nltk(s1: str, s2: str) -> int:
    """Return the Hamming distance between two equal-length strings.

    The previous body called ``nltk.edit_distance``, which computes the
    Levenshtein distance. Levenshtein may use insertions and deletions even
    when both strings have the same length, so it can report fewer edits than
    there are differing positions (e.g. "abcd" vs "bcda" gives 2, while the
    Hamming distance is 4). The differing positions are therefore counted
    directly. The function name is kept so existing callers keep working.

    Args:
        s1: first string.
        s2: second string; must have the same length as ``s1``.

    Returns:
        Number of positions at which the two strings differ.

    Raises:
        ValueError: if the strings differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    # True Hamming distance: one unit per mismatching position.
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def hamming_difflib(s1: str, s2: str) -> int:
    """Derive a Hamming-style distance from difflib's similarity ratio.

    The ratio returned by ``difflib.SequenceMatcher`` is turned into a
    dissimilarity (``1 - ratio``), scaled by the string length and rounded.

    NOTE(review): SequenceMatcher matches blocks of characters rather than
    comparing position by position, so this value can differ from the true
    Hamming distance (the notebook's recorded output shows 5 here where the
    other implementations report 6 for some pairs).

    Raises:
        ValueError: if the strings differ in length.
    """
    length = len(s1)
    if length != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    matcher = difflib.SequenceMatcher(a=s1, b=s2)
    dissimilarity = 1 - matcher.ratio()
    return round(dissimilarity * length)

In [None]:
# ============================================
# 5) Load the dataset from Google Drive
# ============================================
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/PoPCites.csv"
df = pd.read_csv(csv_path)

# Every non-null "Authors" cell is split on commas; the pieces are trimmed
# and collected into one flat list of individual names.
author_list = []
for authors in df["Authors"].dropna():
    author_list += map(str.strip, authors.split(","))

# dict.fromkeys drops repeated names while keeping first-seen order.
authors_unique = [*dict.fromkeys(author_list)]

In [None]:
# ============================================
# 6) Pick two names of equal length
# ============================================
# Take the first pair (in list order) whose lengths match. `next` falls back
# to (None, None) when no such pair exists, so that case is reported with a
# clear error instead of failing on len(None) in the print below. The check
# uses `is None` rather than truthiness, so an empty-string name cannot be
# mistaken for "nothing found".
a1, a2 = next(
    (
        (name1, name2)
        for i, name1 in enumerate(authors_unique)
        for name2 in authors_unique[i + 1:]
        if len(name1) == len(name2)
    ),
    (None, None),
)
if a1 is None:
    raise ValueError("Tidak ada dua nama dengan panjang yang sama")

print(f"Contoh:\n  Nama 1: {a1}\n  Nama 2: {a2}\n  Panjang: {len(a1)}")


Contoh:
  Nama 1: I Lewaa
  Nama 2: M Amnai
  Panjang: 7


In [None]:
# ============================================
# 7) Compute the Hamming distance
# ============================================
# One line per implementation; the labels are pre-padded so the values align.
for label, metric in (
    ("Hamming (FuzzyWuzzy / Levenshtein):", hamming_fuzzywuzzy),
    ("Hamming (RapidFuzz):               ", hamming_rapidfuzz),
    ("Hamming (NLTK):                    ", hamming_nltk),
    ("Hamming (difflib):                 ", hamming_difflib),
):
    print(label, metric(a1, a2))


Hamming (FuzzyWuzzy / Levenshtein): 5
Hamming (RapidFuzz):                5
Hamming (NLTK):                     5
Hamming (difflib):                  5


In [None]:
# ============================================
# 1) Collect the names from the first 100 rows
# ============================================
df_100 = df.head(100)  # first 100 rows
author_list = []
for authors in df_100["Authors"].dropna():
    author_list += map(str.strip, authors.split(","))

authors_unique = [*dict.fromkeys(author_list)]  # drop duplicates, keep order

# ============================================
# 2) Keep only the names that share one length
#    Here: 12 characters
# ============================================
target_len = 12
authors_same_length = list(
    filter(lambda name: len(name) == target_len, authors_unique)
)

print(f"Jumlah nama dengan panjang {target_len}: {len(authors_same_length)}")


Jumlah nama dengan panjang 12: 5


In [None]:
# ============================================
# 3) Build the Hamming distance matrix (RapidFuzz)
# ============================================
import numpy as np
import pandas as pd

n = len(authors_same_length)

# Fill a plain integer grid first; the diagonal keeps its initial 0.
grid = np.zeros((n, n), dtype=int)
for i in range(n):
    for j in range(i + 1, n):
        # Each unordered pair is computed once and written to both
        # triangles (hamming_nltk could be swapped in here).
        d = hamming_rapidfuzz(authors_same_length[i], authors_same_length[j])
        grid[i, j] = grid[j, i] = d

# Label rows and columns with the names themselves.
dist_matrix = pd.DataFrame(
    grid,
    index=authors_same_length,
    columns=authors_same_length,
)

# Show the first rows only
dist_matrix.head(10)


Unnamed: 0,T Alkhalifah,JA Jakubiec…,AN Palazotto,G Siachamis…,K Stockinger
T Alkhalifah,0,11,11,11,11
JA Jakubiec…,11,0,10,10,12
AN Palazotto,11,10,0,11,12
G Siachamis…,11,10,11,0,9
K Stockinger,11,12,12,9,0


In [None]:
# 1) Install dependencies
# ===============================================================
!pip install --quiet rapidfuzz fuzzywuzzy python-Levenshtein nltk tqdm

# ===============================================================
# 2) Import modules & define the Hamming distance functions
# ===============================================================
import pandas as pd
import difflib, itertools, nltk, os
from tqdm.auto import tqdm
from rapidfuzz.distance import Hamming as rf_hamming
import Levenshtein

# NOTE(review): nothing visible in this notebook tokenizes text, so the
# "punkt" data looks unused — confirm before removing the download.
nltk.download("punkt", quiet=True)

def hamming_fuzzywuzzy(s1: str, s2: str) -> int:
    """Return the Hamming distance computed by ``Levenshtein.hamming``.

    Pure pass-through: the arguments are forwarded unchanged and the
    library's result is returned as-is.
    """
    distance = Levenshtein.hamming(s1, s2)
    return distance

def hamming_rapidfuzz(s1: str, s2: str) -> int:
    """Return the Hamming distance computed by RapidFuzz.

    Pure pass-through to ``rf_hamming.distance`` (RapidFuzz's ``Hamming``
    module as imported in this notebook).
    """
    result = rf_hamming.distance(s1, s2)
    return result

def hamming_nltk(s1: str, s2: str) -> int:
    """Return the Hamming distance between two equal-length strings.

    The previous body called ``nltk.edit_distance`` and its comment claimed
    that this excludes insertions/deletions. It does not: ``edit_distance``
    is the Levenshtein distance and can report fewer edits than there are
    differing positions even for equal-length input (e.g. "abcd" vs "bcda"
    gives 2, while the Hamming distance is 4). The differing positions are
    therefore counted directly. The name is kept so callers and the
    ``ham_nltk`` output column are unchanged.

    Args:
        s1: first string.
        s2: second string; must have the same length as ``s1``.

    Returns:
        Number of positions at which the two strings differ.

    Raises:
        ValueError: if the strings differ in length (Hamming distance is
            undefined in that case).
    """
    if len(s1) != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def hamming_difflib(s1: str, s2: str) -> int:
    """Derive a Hamming-style distance from difflib's similarity ratio.

    The ``SequenceMatcher`` ratio is converted to a dissimilarity
    (``1 - ratio``), scaled by the string length and rounded.

    The length check restores the validation done by the earlier definition
    of this function in the notebook; without it, unequal-length input was
    silently scaled by ``len(s1)`` and produced a meaningless number.

    NOTE(review): SequenceMatcher matches blocks of characters rather than
    comparing position by position, so the result can differ from the true
    Hamming distance (the recorded output shows 5 where the other columns
    show 6 for some pairs).

    Raises:
        ValueError: if the strings differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    # Similarity ratio first, then convert it to a distance.
    sim = difflib.SequenceMatcher(a=s1, b=s2).ratio()
    return round((1 - sim) * len(s1))

# ===============================================================
# 3) Read the CSV from Drive
#    (change the path if the file lives elsewhere)
# ===============================================================
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/PoPCites.csv"
df = pd.read_csv(csv_path)

# ===============================================================
# 4) Extract every unique author name
# ===============================================================
# Split each non-null "Authors" cell on commas and trim the pieces.
author_list = []
for authors in df["Authors"].dropna():
    author_list += map(str.strip, authors.split(","))

# Drop duplicates while keeping first-seen order.
authors_unique = [*dict.fromkeys(author_list)]
print(f"Total nama penulis unik: {len(authors_unique)}")

# ===============================================================
# 5) Pairwise Hamming distance (equal-length strings only)
#    → the result is written to a CSV on Google Drive
# ===============================================================
out_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_distances.csv"

# Number of unordered pairs, used only to size the progress bar.
n_authors = len(authors_unique)
n_pairs = n_authors * (n_authors - 1) // 2

results = []
for s1, s2 in tqdm(itertools.combinations(authors_unique, 2), total=n_pairs):
    # Hamming distance is only defined for strings of equal length.
    if len(s1) == len(s2):
        try:
            row = {
                "author_1"       : s1,
                "author_2"       : s2,
                "length"         : len(s1),
                "ham_fuzzywuzzy" : hamming_fuzzywuzzy(s1, s2),
                "ham_rapidfuzz"  : hamming_rapidfuzz(s1, s2),
                "ham_nltk"       : hamming_nltk(s1, s2),
                "ham_difflib"    : hamming_difflib(s1, s2),
            }
        except Exception as e:
            # A failing pair is reported and left out of the table.
            print(f"Skip pair ({s1}, {s2}) → {e}")
        else:
            results.append(row)

distances = pd.DataFrame(results)
distances.to_csv(out_path, index=False)
print(f"\n✅  Selesai!  Hasil jarak Hamming tersimpan di:\n{out_path}")

# (Optional) peek at the first 5 rows as read back from disk
pd.read_csv(out_path).head()


Total nama penulis unik: 275


  0%|          | 0/37675 [00:00<?, ?it/s]


✅  Selesai!  Hasil jarak Hamming tersimpan di:
/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_distances.csv


Unnamed: 0,author_1,author_2,length,ham_fuzzywuzzy,ham_rapidfuzz,ham_nltk,ham_difflib
0,I Lewaa,M Amnai,7,5,5,5,5
1,I Lewaa,Y Zhang,7,6,6,6,5
2,I Lewaa,P Wang…,7,6,6,6,5
3,I Lewaa,DA Wood,7,7,7,7,6
4,I Lewaa,X Hong…,7,6,6,6,6


In [None]:
# ===============================================================
# 0) Mount Google Drive
# ===============================================================
from google.colab import drive
drive.mount("/content/drive")     # grant access to your Drive

# ===============================================================
# 1) Install dependencies
# ===============================================================
!pip install --quiet rapidfuzz fuzzywuzzy python-Levenshtein nltk tqdm

# ===============================================================
# 2) Imports & utility definitions
# ===============================================================
import pandas as pd, numpy as np, itertools, difflib, nltk
from tqdm.auto import tqdm
from rapidfuzz.distance import Hamming as rf_hamming
import Levenshtein

# NOTE(review): nothing visible in this notebook tokenizes text, so the
# "punkt" data looks unused — confirm before removing the download.
nltk.download("punkt", quiet=True)

# ---------- Hamming distance helper ---------- #
def ham_dist_lev(s1, s2):
    """Return ``Levenshtein.hamming(s1, s2)`` unchanged."""
    distance = Levenshtein.hamming(s1, s2)
    return distance
def ham_dist_rf(s1, s2):
    """Return RapidFuzz's Hamming distance (``rf_hamming.distance``) unchanged."""
    distance = rf_hamming.distance(s1, s2)
    return distance
def ham_dist_nltk(s1, s2):
    """Return the Hamming distance between two equal-length strings.

    The previous one-liner called ``nltk.edit_distance``, i.e. the
    Levenshtein distance. Because Levenshtein may insert and delete, it can
    be smaller than the number of differing positions even for equal-length
    strings ("abcd" vs "bcda": 2 instead of 4), which skewed the
    ``sim_nltk`` column. The differing positions are counted directly.

    Raises:
        ValueError: if the strings differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

# Convert distance → similarity ratio (0-1)
def sim_from_dist(d, length):
    """Turn a distance ``d`` over strings of ``length`` into a similarity.

    Returns ``1.0 - d/length``. A ``length`` of 0 returns 0.0 instead of
    raising ZeroDivisionError, matching the guarded ``sim_from_dist``
    definitions in the later cells of this notebook.
    """
    return 1.0 - d/length if length > 0 else 0.0

# ===============================================================
# 3) Load the CSV
# ===============================================================
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/PoPCites.csv"
df = pd.read_csv(csv_path)

# ===============================================================
# 4) Extract every unique author name
# ===============================================================
# Split each non-null "Authors" cell on commas and trim the pieces.
author_list = []
for authors in df["Authors"].dropna():
    author_list += map(str.strip, authors.split(","))

# Drop duplicates while keeping first-seen order.
authors_unique = [*dict.fromkeys(author_list)]
print(f"Total nama penulis unik: {len(authors_unique)}")

# ===============================================================
# 5) Pairwise similarity (Hamming)
# ===============================================================
# Output column → distance helper whose result is turned into a similarity.
distance_columns = (
    ("sim_fuzzywuzzy", ham_dist_lev),
    ("sim_rapidfuzz",  ham_dist_rf),
    ("sim_nltk",       ham_dist_nltk),
)

results = []
total_pairs = len(authors_unique)*(len(authors_unique)-1)//2
pair_iter = itertools.combinations(authors_unique, 2)
for s1, s2 in tqdm(pair_iter, total=total_pairs, desc="pairwise"):
    L = len(s1)
    if L != len(s2):      # only equal-length pairs are compared
        continue
    try:
        row = {"author_1": s1, "author_2": s2, "length": L}
        for column, dist_fn in distance_columns:
            row[column] = sim_from_dist(dist_fn(s1, s2), L)
        # difflib already yields a similarity, so no conversion is needed.
        row["sim_difflib"] = difflib.SequenceMatcher(a=s1, b=s2).ratio()
    except Exception as e:
        # A failing pair is reported and left out of the table.
        print("Skip:", s1, s2, "→", e)
    else:
        results.append(row)

df_sim = pd.DataFrame(results)
print("⏱️  Hitung selesai – pasangan tersimpan:", len(df_sim))

# ===============================================================
# 6) Save the result to Drive
# ===============================================================
out_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarity.csv"
df_sim.to_csv(out_path, index=False)
print("\n✅ File similarity tersimpan di:", out_path)

# ===============================================================
# 7) (Optional) Show 10 sample rows
# ===============================================================
df_sim.head(10)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total nama penulis unik: 275


pairwise:   0%|          | 0/37675 [00:00<?, ?it/s]

⏱️  Hitung selesai – pasangan tersimpan: 4852

✅ File similarity tersimpan di: /content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarity.csv


Unnamed: 0,author_1,author_2,length,sim_fuzzywuzzy,sim_rapidfuzz,sim_nltk,sim_difflib
0,I Lewaa,M Amnai,7,0.285714,0.285714,0.285714,0.285714
1,I Lewaa,Y Zhang,7,0.142857,0.142857,0.142857,0.285714
2,I Lewaa,P Wang…,7,0.142857,0.142857,0.142857,0.285714
3,I Lewaa,DA Wood,7,0.0,0.0,0.0,0.142857
4,I Lewaa,X Hong…,7,0.142857,0.142857,0.142857,0.142857
5,I Lewaa,A Alali,7,0.142857,0.142857,0.142857,0.285714
6,I Lewaa,V Kazei,7,0.142857,0.142857,0.142857,0.285714
7,I Lewaa,S Bader,7,0.142857,0.142857,0.142857,0.285714
8,I Lewaa,S Fomel,7,0.142857,0.142857,0.142857,0.285714
9,I Lewaa,M Raggi,7,0.142857,0.142857,0.142857,0.285714


In [None]:
import pandas as pd

# Destination Excel workbook
excel_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarity.xlsx"

# Source CSV (adjust as needed)
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarity.csv"

# Load the CSV and write the same table out as an Excel file.
# Note: this rebinds `df` to the similarity table read from csv_path.
df = pd.read_csv(csv_path)
df.to_excel(excel_path, index=False)

print("✅ File berhasil dikonversi ke Excel di:", excel_path)


✅ File berhasil dikonversi ke Excel di: /content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarity.xlsx


In [None]:
# ===============================================================
# 1. Install and import libraries
# ===============================================================
!pip install -q rapidfuzz fuzzywuzzy python-Levenshtein nltk tqdm

import pandas as pd, difflib, nltk
from tqdm.auto import tqdm
from rapidfuzz.distance import Hamming as rf_hamming
import Levenshtein

# NOTE(review): nothing visible in this notebook tokenizes text, so the
# "punkt" data looks unused — confirm before removing the download.
nltk.download("punkt", quiet=True)

# ===============================================================
# 2. Read the CSV file
# ===============================================================
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/PoPCites.csv"  # change if the path differs
df = pd.read_csv(csv_path)

# Keep the non-null "Authors" cells as strings, renumbered 0..N-1 so the
# comparison step can address rows by position.
authors_non_null = df["Authors"].dropna()
authors = authors_non_null.astype(str).reset_index(drop=True)
print("Total baris dengan nama penulis:", len(authors))

# ===============================================================
# 3. Fungsi pembantu
# ===============================================================
def sim_from_dist(d, L):
    """Convert distance ``d`` over strings of length ``L`` into a similarity.

    Returns ``1.0 - d/L`` when ``L`` is positive and 0.0 otherwise, so a
    zero length never triggers a division by zero.
    """
    if L > 0:
        return 1.0 - d/L
    return 0.0

def sim_difflib(s1, s2):
    """Return difflib's similarity ratio for ``s1`` and ``s2``.

    Builds a ``SequenceMatcher`` with no junk filter (``None``) and returns
    its ``ratio()``.
    """
    matcher = difflib.SequenceMatcher(None, s1, s2)
    return matcher.ratio()

# ===============================================================
# 4. Compare rows against each other
# ===============================================================
results = []
N = len(authors)

for i in tqdm(range(N)):
    for j in range(i + 1, N):
        a1 = authors[i]
        a2 = authors[j]

        if len(a1) != len(a2):
            continue  # Hamming distance needs equal-length strings

        L = len(a1)
        try:
            d_lev = Levenshtein.hamming(a1, a2)
            d_rf  = rf_hamming.distance(a1, a2)
            # Count differing positions directly. nltk.edit_distance (used
            # here before) is Levenshtein and can under-count on
            # equal-length strings, which made sim_nltk disagree with the
            # other Hamming columns in the recorded output.
            d_nlk = sum(c1 != c2 for c1, c2 in zip(a1, a2))
            sim_df = sim_difflib(a1, a2)

            results.append({
                "row_1": i,
                "row_2": j,
                "author_1": a1,
                "author_2": a2,
                "length": L,
                "sim_fuzzywuzzy": sim_from_dist(d_lev, L),
                "sim_rapidfuzz":  sim_from_dist(d_rf, L),
                "sim_nltk":       sim_from_dist(d_nlk, L),
                "sim_difflib":    sim_df
            })
        except Exception as e:
            # Report the failing pair instead of dropping it silently, so
            # missing rows in the output can be explained.
            print("Skip:", i, j, "→", e)

df_row_sim = pd.DataFrame(results)

# ===============================================================
# 5. Save to Excel (if wanted)
# ===============================================================
# NOTE(review): this path is outside the mounted Drive folder used by the
# other cells — confirm that is intended.
excel_path = "/content/rowwise_author_similarity.xlsx"
df_row_sim.to_excel(excel_path, index=False)
print("✅ Hasil similarity antar baris disimpan di:", excel_path)

# Show a sample
df_row_sim.head()


Total baris dengan nama penulis: 100


  0%|          | 0/100 [00:00<?, ?it/s]

✅ Hasil similarity antar baris disimpan di: /content/rowwise_author_similarity.xlsx


Unnamed: 0,row_1,row_2,author_1,author_2,length,sim_fuzzywuzzy,sim_rapidfuzz,sim_nltk,sim_difflib
0,0,25,"I Lewaa, MS Hafez, MA Ismail","H Liu, ZW Hu, Z Yang, M Wang",28,0.142857,0.142857,0.214286,0.285714
1,1,17,"M El Abassi, M Amnai, A Choukri, Y Fakhri…","C Wüstenhagen, K John, S Langner, M Brede…",42,0.071429,0.071429,0.238095,0.214286
2,2,30,"J Yang, K Xian, P Wang, Y Zhang","X Peng, H Liu, K Siggers, Z Liu",31,0.290323,0.290323,0.387097,0.354839
3,2,48,"J Yang, K Xian, P Wang, Y Zhang","WB Zhang, Y Ge, Y Leung, Y Zhou",31,0.129032,0.129032,0.483871,0.580645
4,2,53,"J Yang, K Xian, P Wang, Y Zhang","A Satpathy, MN Sahoo, L Behera…",31,0.096774,0.096774,0.193548,0.322581


In [None]:
# =====================================================
# 1. Install libraries (only needed once)
# =====================================================
!pip install -q rapidfuzz fuzzywuzzy python-Levenshtein nltk

# =====================================================
# 2. Import libraries
# =====================================================
import pandas as pd
import Levenshtein
from rapidfuzz.distance import Hamming as rf_hamming
# NOTE(review): `fuzz` is not referenced in the visible cells — confirm.
from fuzzywuzzy import fuzz
import nltk
nltk.download('punkt', quiet=True)
from nltk.metrics.distance import edit_distance
import difflib

# =====================================================
# 3. Fungsi bantu untuk similarity
# =====================================================
def sim_from_dist(dist, length):
    """Convert ``dist`` over strings of ``length`` into a similarity.

    Returns ``1.0 - dist / length`` for a positive ``length`` and 0.0
    otherwise, so a zero length never divides by zero.
    """
    if not length > 0:
        return 0.0
    return 1.0 - dist / length

def safe_hamming_similarity(a, b):
    """Return the four similarity scores for ``a`` and ``b``, or ``None``.

    ``None`` is returned when the strings differ in length, because Hamming
    distance is undefined in that case. Otherwise a dict with the keys
    ``sim_fuzzywuzzy``, ``sim_rapidfuzz``, ``sim_nltk`` and ``sim_difflib``
    is returned.

    ``sim_nltk`` used to be derived from ``edit_distance`` (Levenshtein),
    which can under-count differences on equal-length strings ("abcd" vs
    "bcda": 2 instead of 4). It is now derived from the number of differing
    positions, i.e. the true Hamming distance.
    """
    length = len(a)
    if length != len(b):
        return None
    # Positional mismatch count = Hamming distance.
    mismatches = sum(x != y for x, y in zip(a, b))
    return {
        "sim_fuzzywuzzy": sim_from_dist(Levenshtein.hamming(a, b), length),
        "sim_rapidfuzz":  sim_from_dist(rf_hamming.distance(a, b), length),
        "sim_nltk":       sim_from_dist(mismatches, length),
        "sim_difflib":    difflib.SequenceMatcher(None, a, b).ratio()
    }

# =====================================================
# 4. Load the data and run the comparison
# =====================================================
# Example: author_1 stays fixed; authors_2 holds the rows it is compared to.
# In the recorded output every row has length_match == False, so only the
# difflib column is populated for this sample.
author_1 = "I Lewaa, MS Hafez, MA Ismail"
authors_2 = [
    "M El Abassi, M Amnai, A Choukri, Y Fakhri",
    "J Yang, K Xian, P Wang, Y Zhang",
    "J Yang, S Quan, P Wang",
    "Y Zhu, J Yang",
    "LL Sharabi",
    "DA Wood",
    "B Sun, T Alkhalifah",
    "T Lei, Z Lei",
    "J Yang, C Zhao, K Xian, A Zhu, Z Cao",
    "DA Wood"
    # Add more rows here if needed...
]

# =====================================================
# 5. Build the similarity table
# =====================================================
results = []
for position, a2 in enumerate(authors_2, start=1):
    sim = safe_hamming_similarity(author_1, a2)
    if not sim:
        # Lengths differ: the Hamming-based scores are undefined, but
        # difflib's ratio can still be computed.
        sim = {
            "sim_fuzzywuzzy": None,
            "sim_rapidfuzz": None,
            "sim_nltk": None,
            "sim_difflib": difflib.SequenceMatcher(None, author_1, a2).ratio()
        }

    row = {
        "idx_1": 0,
        "idx_2": position,
        "Authors_1": author_1,
        "Authors_2": a2,
        "length_match": len(author_1) == len(a2),
    }
    row.update(sim)
    results.append(row)

df_results = pd.DataFrame(results)
df_results.head(10)


Unnamed: 0,idx_1,idx_2,Authors_1,Authors_2,length_match,sim_fuzzywuzzy,sim_rapidfuzz,sim_nltk,sim_difflib
0,0,1,"I Lewaa, MS Hafez, MA Ismail","M El Abassi, M Amnai, A Choukri, Y Fakhri",False,,,,0.376812
1,0,2,"I Lewaa, MS Hafez, MA Ismail","J Yang, K Xian, P Wang, Y Zhang",False,,,,0.338983
2,0,3,"I Lewaa, MS Hafez, MA Ismail","J Yang, S Quan, P Wang",False,,,,0.44
3,0,4,"I Lewaa, MS Hafez, MA Ismail","Y Zhu, J Yang",False,,,,0.243902
4,0,5,"I Lewaa, MS Hafez, MA Ismail",LL Sharabi,False,,,,0.210526
5,0,6,"I Lewaa, MS Hafez, MA Ismail",DA Wood,False,,,,0.114286
6,0,7,"I Lewaa, MS Hafez, MA Ismail","B Sun, T Alkhalifah",False,,,,0.297872
7,0,8,"I Lewaa, MS Hafez, MA Ismail","T Lei, Z Lei",False,,,,0.4
8,0,9,"I Lewaa, MS Hafez, MA Ismail","J Yang, C Zhao, K Xian, A Zhu, Z Cao",False,,,,0.34375
9,0,10,"I Lewaa, MS Hafez, MA Ismail",DA Wood,False,,,,0.114286


In [None]:
# ===============================================================
# 0) Mount Google Drive
# ===============================================================
from google.colab import drive
drive.mount("/content/drive")     # grant access to your Drive

# ===============================================================
# 1) Install dependencies
# ===============================================================
!pip install --quiet rapidfuzz fuzzywuzzy python-Levenshtein nltk tqdm

# ===============================================================
# 2) Imports & utility definitions
# ===============================================================
import pandas as pd, numpy as np, itertools, difflib, nltk
from tqdm.auto import tqdm
from rapidfuzz.distance import Hamming as rf_hamming
import Levenshtein

# NOTE(review): nothing visible in this notebook tokenizes text, so the
# "punkt" data looks unused — confirm before removing the download.
nltk.download("punkt", quiet=True)

# ---------- Hamming distance helper ---------- #
def ham_dist_lev(s1, s2):
    """Return ``Levenshtein.hamming(s1, s2)`` unchanged."""
    distance = Levenshtein.hamming(s1, s2)
    return distance
def ham_dist_rf(s1, s2):
    """Return RapidFuzz's Hamming distance (``rf_hamming.distance``) unchanged."""
    distance = rf_hamming.distance(s1, s2)
    return distance
def ham_dist_nltk(s1, s2):
    """Return the Hamming distance between two equal-length strings.

    The previous one-liner called ``nltk.edit_distance``, i.e. the
    Levenshtein distance. Because Levenshtein may insert and delete, it can
    be smaller than the number of differing positions even for equal-length
    strings ("abcd" vs "bcda": 2 instead of 4), which skewed the
    ``sim_nltk`` column. The differing positions are counted directly.

    Raises:
        ValueError: if the strings differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Panjang string harus sama untuk Hamming")
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

# Convert distance → similarity ratio (0-1)
def sim_from_dist(d, length):
    """Turn a distance ``d`` over strings of ``length`` into a similarity.

    Returns ``1.0 - d/length``. A ``length`` of 0 returns 0.0 instead of
    raising ZeroDivisionError, matching the guarded ``sim_from_dist``
    definitions in the other cells of this notebook.
    """
    return 1.0 - d/length if length > 0 else 0.0

# ===============================================================
# 3) Load the CSV
# ===============================================================
csv_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/PoPCites.csv"
df = pd.read_csv(csv_path)

# ===============================================================
# 4) Extract every unique author name
# ===============================================================
# Split each non-null "Authors" cell on commas and trim the pieces.
author_list = []
for authors in df["Authors"].dropna():
    author_list += map(str.strip, authors.split(","))

# Drop duplicates while keeping first-seen order.
authors_unique = [*dict.fromkeys(author_list)]
print(f"Total nama penulis unik: {len(authors_unique)}")

# ===============================================================
# 5) Pairwise similarity (Hamming)
# ===============================================================
# Output column → distance helper whose result is turned into a similarity.
distance_columns = (
    ("sim_fuzzywuzzy", ham_dist_lev),
    ("sim_rapidfuzz",  ham_dist_rf),
    ("sim_nltk",       ham_dist_nltk),
)

results = []
total_pairs = len(authors_unique)*(len(authors_unique)-1)//2
pair_iter = itertools.combinations(authors_unique, 2)
for s1, s2 in tqdm(pair_iter, total=total_pairs, desc="pairwise"):
    L = len(s1)
    if L != len(s2):      # only equal-length pairs are compared
        continue
    try:
        row = {"author_1": s1, "author_2": s2, "length": L}
        for column, dist_fn in distance_columns:
            row[column] = sim_from_dist(dist_fn(s1, s2), L)
        # difflib already yields a similarity, so no conversion is needed.
        row["sim_difflib"] = difflib.SequenceMatcher(a=s1, b=s2).ratio()
    except Exception as e:
        # A failing pair is reported and left out of the table.
        print("Skip:", s1, s2, "→", e)
    else:
        results.append(row)

df_sim = pd.DataFrame(results)
print("⏱️  Hitung selesai – pasangan tersimpan:", len(df_sim))

# ===============================================================
# 6) Save the result to Drive
# ===============================================================
out_path = "/content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarityNEW.csv"
df_sim.to_csv(out_path, index=False)
print("\n✅ File similarity tersimpan di:", out_path)

# ===============================================================
# 7) (Optional) Show 10 sample rows
# ===============================================================
df_sim.head(10)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total nama penulis unik: 275


pairwise:   0%|          | 0/37675 [00:00<?, ?it/s]

⏱️  Hitung selesai – pasangan tersimpan: 4852

✅ File similarity tersimpan di: /content/drive/MyDrive/Colab Notebooks/Integrasi Data/author_hamming_similarityNEW.csv


Unnamed: 0,author_1,author_2,length,sim_fuzzywuzzy,sim_rapidfuzz,sim_nltk,sim_difflib
0,I Lewaa,M Amnai,7,0.285714,0.285714,0.285714,0.285714
1,I Lewaa,Y Zhang,7,0.142857,0.142857,0.142857,0.285714
2,I Lewaa,P Wang…,7,0.142857,0.142857,0.142857,0.285714
3,I Lewaa,DA Wood,7,0.0,0.0,0.0,0.142857
4,I Lewaa,X Hong…,7,0.142857,0.142857,0.142857,0.142857
5,I Lewaa,A Alali,7,0.142857,0.142857,0.142857,0.285714
6,I Lewaa,V Kazei,7,0.142857,0.142857,0.142857,0.285714
7,I Lewaa,S Bader,7,0.142857,0.142857,0.142857,0.285714
8,I Lewaa,S Fomel,7,0.142857,0.142857,0.142857,0.285714
9,I Lewaa,M Raggi,7,0.142857,0.142857,0.142857,0.285714


In [None]:
# Install extra string-matching packages.
# NOTE(review): no later cell is visible here, so it is unclear which of
# these packages are actually used — confirm and pin versions.
!pip install rltk jellyfish "fuzzywuzzy[speedup]" rapidfuzz nltk pandas

