In [1]:
pip install splink duckdb

Collecting splink
  Downloading splink-4.0.8-py3-none-any.whl.metadata (12 kB)
Collecting igraph>=0.11.2 (from splink)
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting texttable>=1.6.2 (from igraph>=0.11.2->splink)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading splink-4.0.8-py3-none-any.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph, splink
Successfully installed igraph-0.11.9 splink-4.0.8 texttable-1.7.0


In [42]:
import pandas as pd
from splink import Linker, SettingsCreator, block_on, DuckDBAPI
from splink.comparison_library import ExactMatch, JaroWinklerAtThresholds
from splink.blocking_rule_library import CustomRule

# 1. Muat dataset
df = pd.read_csv("clean_popcite.csv")
df

Unnamed: 0,Authors,Title,Source
0,"I Lewaa, MS Hafez, MA Ismail",Data integration using statistical matching te...,Statistical Journal of the IAOS
1,"M El Abassi, M Amnai, A Choukri, Y Fakhri…",Matching data detection for the integration sy...,International Journal of …
2,"J Yang, K Xian, P Wang, Y Zhang",A performance evaluation of correspondence gro...,IEEE transactions on pattern …
3,"J Yang, S Quan, P Wang…",Evaluating local geometric feature representat...,IEEE Transactions on …
4,"Y Zhu, J Yang",Automatic data matching for geospatial models:...,Annals of GIS
...,...,...,...
92,"S Auzoux, B Ngaba, M Christina, B Heuclin, M R...",Experimental variables in sugarcane intercropp...,Data in Brief
93,"A Coppens, V Maquil",Skeletal Data Matching and Merging from Multip...,"… 2024, Valencia, Spain, September 15–18 …"
94,"A Kato, H Wakabayashi, M Bradford…",Accurate ground positioning obtained from 3d d...,IGARSS 2019-2019 …
95,"UV Albrecht, D Lawin, S Kuhn…",Time bias awareness in ECG-Based multiple sour...,Advances in Informatics …


In [43]:
# prompt: i want to make identifier based on index on df

# Assuming 'df' is your DataFrame from the previous cell
df['unique_id'] = (df.index + 1)
df

Unnamed: 0,Authors,Title,Source,unique_id
0,"I Lewaa, MS Hafez, MA Ismail",Data integration using statistical matching te...,Statistical Journal of the IAOS,1
1,"M El Abassi, M Amnai, A Choukri, Y Fakhri…",Matching data detection for the integration sy...,International Journal of …,2
2,"J Yang, K Xian, P Wang, Y Zhang",A performance evaluation of correspondence gro...,IEEE transactions on pattern …,3
3,"J Yang, S Quan, P Wang…",Evaluating local geometric feature representat...,IEEE Transactions on …,4
4,"Y Zhu, J Yang",Automatic data matching for geospatial models:...,Annals of GIS,5
...,...,...,...,...
92,"S Auzoux, B Ngaba, M Christina, B Heuclin, M R...",Experimental variables in sugarcane intercropp...,Data in Brief,93
93,"A Coppens, V Maquil",Skeletal Data Matching and Merging from Multip...,"… 2024, Valencia, Spain, September 15–18 …",94
94,"A Kato, H Wakabayashi, M Bradford…",Accurate ground positioning obtained from 3d d...,IGARSS 2019-2019 …,95
95,"UV Albrecht, D Lawin, S Kuhn…",Time bias awareness in ECG-Based multiple sour...,Advances in Informatics …,96


In [44]:
# 2. Bangun daftar ComparisonCreator secara dinamis
comparisons = []
for col in ["Authors", "Title", "Source"]:
    # Satu comparison dengan tiga level: exact, ≥0.85, else
    comparisons.append(
        JaroWinklerAtThresholds(
            col_name=col,
            score_threshold_or_thresholds=[0.85]
        )
    )

# Untuk unique_id, gunakan ExactMatch saja
comparisons.append( ExactMatch(col_name="unique_id") )

In [45]:
# 3. Inisialisasi SettingsCreator dengan semua comparisons sekaligus
full_block = CustomRule("1 = 1")
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=comparisons,
    blocking_rules_to_generate_predictions=[full_block]
)

In [46]:
# 4. Inisialisasi Linker dengan backend DuckDB
linker = Linker(
    df,
    settings,
    db_api=DuckDBAPI()
)

In [47]:
# 5. Pelatihan EM pada semua pasangan
linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=42)
em_session = linker.training.estimate_parameters_using_expectation_maximisation(
    full_block,
    estimate_without_term_frequencies=False,
    fix_u_probabilities=False
)

INFO:splink.internals.estimate_u:----- Estimating u probabilities using random sampling -----
INFO:splink.internals.m_u_records_to_parameters:u probability not trained for Title - Exact match on Title (comparison vector value: 2). This usually means the comparison level was never observed in the training data.
INFO:splink.internals.m_u_records_to_parameters:u probability not trained for unique_id - Exact match on unique_id (comparison vector value: 1). This usually means the comparison level was never observed in the training data.
INFO:splink.internals.estimate_u:
Estimated u probabilities using random sampling
INFO:splink.internals.settings:
Your model is not yet fully trained. Missing estimates for:
    - Authors (no m values are trained).
    - Title (some u values are not trained, no m values are trained).
    - Source (no m values are trained).
    - unique_id (some u values are not trained, no m values are trained).
INFO:splink.internals.em_training_session:
----- Starting EM tr

In [48]:
# 6. Jalankan probabilistic linkage (EM algorithm)
inference_result = linker.inference.predict()

# 7. Konversi hasil ke pandas DataFrame
df_results = inference_result.as_pandas_dataframe()

INFO:splink.internals.linker_components.inference:Blocking time: 0.00 seconds
INFO:splink.internals.linker_components.inference:Predict time: 0.10 seconds
You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'Title':
    m values not fully trained
Comparison: 'Title':
    u values not fully trained
Comparison: 'unique_id':
    m values not fully trained
Comparison: 'unique_id':
    u values not fully trained
The 'probability_two_random_records_match' setting has been set to the default value (0.0001). 
If this is not the desired behaviour, either: 
 - assign a value for `probability_two_random_records_match` in your settings dictionary, or 
 - estimate with the `linker.estimate_probability_two_random_records_match` function.


In [49]:
df_results

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,Authors_l,Authors_r,gamma_Authors,Title_l,Title_r,gamma_Title,Source_l,Source_r,gamma_Source,gamma_unique_id
0,-70.794023,4.885123e-22,1,2,"I Lewaa, MS Hafez, MA Ismail","M El Abassi, M Amnai, A Choukri, Y Fakhri…",0,Data integration using statistical matching te...,Matching data detection for the integration sy...,0,Statistical Journal of the IAOS,International Journal of …,0,0
1,-70.794023,4.885123e-22,1,3,"I Lewaa, MS Hafez, MA Ismail","J Yang, K Xian, P Wang, Y Zhang",0,Data integration using statistical matching te...,A performance evaluation of correspondence gro...,0,Statistical Journal of the IAOS,IEEE transactions on pattern …,0,0
2,-70.794023,4.885123e-22,2,3,"M El Abassi, M Amnai, A Choukri, Y Fakhri…","J Yang, K Xian, P Wang, Y Zhang",0,Matching data detection for the integration sy...,A performance evaluation of correspondence gro...,0,International Journal of …,IEEE transactions on pattern …,0,0
3,-70.794023,4.885123e-22,1,4,"I Lewaa, MS Hafez, MA Ismail","J Yang, S Quan, P Wang…",0,Data integration using statistical matching te...,Evaluating local geometric feature representat...,0,Statistical Journal of the IAOS,IEEE Transactions on …,0,0
4,-70.794023,4.885123e-22,2,4,"M El Abassi, M Amnai, A Choukri, Y Fakhri…","J Yang, S Quan, P Wang…",0,Matching data detection for the integration sy...,Evaluating local geometric feature representat...,0,International Journal of …,IEEE Transactions on …,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,-70.794023,4.885123e-22,92,97,"X Luo, L Zhang, L Ren, Y Lali",DA Wood,0,A dynamic and static data based matching metho...,Solar plus wind country-wide electrical power ...,0,Robotics and Computer-Integrated …,International Journal of Energy and Environmen...,0,0
4652,-70.794023,4.885123e-22,93,97,"S Auzoux, B Ngaba, M Christina, B Heuclin, M R...",DA Wood,0,Experimental variables in sugarcane intercropp...,Solar plus wind country-wide electrical power ...,0,Data in Brief,International Journal of Energy and Environmen...,0,0
4653,-70.794023,4.885123e-22,94,97,"A Coppens, V Maquil",DA Wood,0,Skeletal Data Matching and Merging from Multip...,Solar plus wind country-wide electrical power ...,0,"… 2024, Valencia, Spain, September 15–18 …",International Journal of Energy and Environmen...,0,0
4654,-70.794023,4.885123e-22,95,97,"A Kato, H Wakabayashi, M Bradford…",DA Wood,0,Accurate ground positioning obtained from 3d d...,Solar plus wind country-wide electrical power ...,0,IGARSS 2019-2019 …,International Journal of Energy and Environmen...,0,0


In [50]:
# prompt: sort the "df_results" dataframe by the "match_probability" column values descending

df_results_sorted = df_results.sort_values(by='match_probability', ascending=False)
df_results_sorted

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,Authors_l,Authors_r,gamma_Authors,Title_l,Title_r,gamma_Title,Source_l,Source_r,gamma_Source,gamma_unique_id
700,12.645623,9.998440e-01,35,38,"M Graf, L Laskowski, F Papsdorf…","M Graf, L Laskowski, F Papsdorf, F Sold…",1,Frost: Benchmarking and exploring data matchin...,Frost: a platform for benchmarking and explori...,1,arXiv preprint …,arXiv preprint arXiv …,1,0
5,2.893353,8.813754e-01,3,4,"J Yang, K Xian, P Wang, Y Zhang","J Yang, S Quan, P Wang…",1,A performance evaluation of correspondence gro...,Evaluating local geometric feature representat...,0,IEEE transactions on pattern …,IEEE Transactions on …,1,0
552,-12.610135,1.599196e-04,25,34,"R D'Alberto, M Raggi","R D'Alberto, M Raggi",2,Integrating rather than collecting: statistica...,From collection to integration: Non-parametric...,0,Statistical Papers,Statistical Journal of the IAOS,1,0
3098,-21.180426,4.207810e-07,18,80,"C Wüstenhagen, K John, S Langner, M Brede…","C Wüstenhagen, C Domnick…",1,CFD validation using in-vitro MRI velocity dat...,MRI investigations of internal blade cooling f...,0,Computers in Biology …,… Expo: Power for …,0,0
1787,-21.180426,4.207810e-07,18,61,"C Wüstenhagen, K John, S Langner, M Brede…","C Wüstenhagen, C Domnick…",1,CFD validation using in-vitro MRI velocity dat...,… Resonance Velocimetry Measurements of Intern...,0,Computers in Biology …,Journal of …,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,-291.368965,1.946264e-88,1,34,"I Lewaa, MS Hafez, MA Ismail","R D'Alberto, M Raggi",0,Data integration using statistical matching te...,From collection to integration: Non-parametric...,0,Statistical Journal of the IAOS,Statistical Journal of the IAOS,2,0
438,-291.368965,1.946264e-88,4,31,"J Yang, S Quan, P Wang…","X Peng, H Liu, K Siggers, Z Liu",0,Evaluating local geometric feature representat...,Automated box data matching for multi-modal ma...,0,IEEE Transactions on …,IEEE Transactions on …,2,0
454,-291.368965,1.946264e-88,20,31,"W Xue, D Vatsalan, W Hu…","X Peng, H Liu, K Siggers, Z Liu",0,Sequence data matching and beyond: New privacy...,Automated box data matching for multi-modal ma...,0,IEEE Transactions on …,IEEE Transactions on …,2,0
174,-291.368965,1.946264e-88,4,20,"J Yang, S Quan, P Wang…","W Xue, D Vatsalan, W Hu…",0,Evaluating local geometric feature representat...,Sequence data matching and beyond: New privacy...,0,IEEE Transactions on …,IEEE Transactions on …,2,0


In [51]:
# prompt: sort "df_results" by "unique_id_l" to int asc then by "unique_id_r" to int asc

df_results_sorted_ids = df_results.sort_values(
    by=['unique_id_l', 'unique_id_r']
)
df_results_sorted_ids

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,Authors_l,Authors_r,gamma_Authors,Title_l,Title_r,gamma_Title,Source_l,Source_r,gamma_Source,gamma_unique_id
0,-70.794023,4.885123e-22,1,2,"I Lewaa, MS Hafez, MA Ismail","M El Abassi, M Amnai, A Choukri, Y Fakhri…",0,Data integration using statistical matching te...,Matching data detection for the integration sy...,0,Statistical Journal of the IAOS,International Journal of …,0,0
1,-70.794023,4.885123e-22,1,3,"I Lewaa, MS Hafez, MA Ismail","J Yang, K Xian, P Wang, Y Zhang",0,Data integration using statistical matching te...,A performance evaluation of correspondence gro...,0,Statistical Journal of the IAOS,IEEE transactions on pattern …,0,0
3,-70.794023,4.885123e-22,1,4,"I Lewaa, MS Hafez, MA Ismail","J Yang, S Quan, P Wang…",0,Data integration using statistical matching te...,Evaluating local geometric feature representat...,0,Statistical Journal of the IAOS,IEEE Transactions on …,0,0
6,-70.794023,4.885123e-22,1,5,"I Lewaa, MS Hafez, MA Ismail","Y Zhu, J Yang",0,Data integration using statistical matching te...,Automatic data matching for geospatial models:...,0,Statistical Journal of the IAOS,Annals of GIS,0,0
10,-70.794023,4.885123e-22,1,6,"I Lewaa, MS Hafez, MA Ismail",LL Sharabi,0,Data integration using statistical matching te...,Finding love on a first data: Matching algorit...,0,Statistical Journal of the IAOS,Harvard Data Science Review,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4558,-70.794023,4.885123e-22,94,96,"A Coppens, V Maquil","UV Albrecht, D Lawin, S Kuhn…",0,Skeletal Data Matching and Merging from Multip...,Time bias awareness in ECG-Based multiple sour...,0,"… 2024, Valencia, Spain, September 15–18 …",Advances in Informatics …,0,0
4653,-70.794023,4.885123e-22,94,97,"A Coppens, V Maquil",DA Wood,0,Skeletal Data Matching and Merging from Multip...,Solar plus wind country-wide electrical power ...,0,"… 2024, Valencia, Spain, September 15–18 …",International Journal of Energy and Environmen...,0,0
4559,-70.794023,4.885123e-22,95,96,"A Kato, H Wakabayashi, M Bradford…","UV Albrecht, D Lawin, S Kuhn…",0,Accurate ground positioning obtained from 3d d...,Time bias awareness in ECG-Based multiple sour...,0,IGARSS 2019-2019 …,Advances in Informatics …,0,0
4654,-70.794023,4.885123e-22,95,97,"A Kato, H Wakabayashi, M Bradford…",DA Wood,0,Accurate ground positioning obtained from 3d d...,Solar plus wind country-wide electrical power ...,0,IGARSS 2019-2019 …,International Journal of Energy and Environmen...,0,0


In [52]:
# Save the DataFrame to an Excel file
df_results_sorted_ids.to_excel('authors_title_source_sim_res.xlsx', index=False)