In [3]:
import pandas as pd
import recordlinkage as rl
from recordlinkage.preprocessing import clean
import numpy as np
from itertools import combinations

# Read your CSV file
df = pd.read_csv('clean_popcite.csv')

print(f"Dataset shape: {df.shape}")
print("Sample data:")
print(df[['Authors', 'Title', 'Source']].head())

# Data preprocessing - clean the fields
print("\nCleaning data...")

# First remove rows with NaN values in critical columns
df = df.dropna(subset=['Authors', 'Title', 'Source'])

# Then clean the remaining data
df['Authors_clean'] = clean(df['Authors'], 
                           lowercase=True, 
                           remove_brackets=True, 
                           strip_accents='unicode')

df['Title_clean'] = clean(df['Title'], 
                         lowercase=True, 
                         remove_brackets=True, 
                         strip_accents='unicode')

df['Source_clean'] = clean(df['Source'], 
                          lowercase=True, 
                          remove_brackets=True, 
                          strip_accents='unicode')

print("Data cleaning completed.")

clean_csv = df.to_csv("clean_popcite.csv")

print(f"clean dataframe saved to clean_popcite.csv!")

# INDEXATION STEP - Generate ALL possible pairs
print("\nGenerating all possible pairs...")
indexer = rl.Index()

# Use full indexing instead of blocking to get ALL pairs
indexer.full()

# Generate ALL candidate pairs for deduplication
candidate_links = indexer.index(df, df)

print(f"Generated {len(candidate_links)} candidate pairs (all possible combinations)")

# COMPARISON STEP
print("\nComparison step...")
compare_cl = rl.Compare()

# Add comparison methods for each field
compare_cl.string('Authors_clean', 'Authors_clean', 
                  method='cosine', threshold=0.5, label='authors')

compare_cl.string('Title_clean', 'Title_clean', 
                  method='cosine', threshold=0.5, label='title')

compare_cl.exact('Source_clean', 'Source_clean', label='source_exact')


# Optional: Add year comparison if needed
# if 'Year' in df.columns:
#     compare_cl.exact('Year', 'Year', label='year')

# Compute comparison features for all candidate pairs
print("Computing comparisons... This may take a while for large datasets.")
features = compare_cl.compute(candidate_links, df, df)
features = features[features.index.get_level_values(0) != features.index.get_level_values(1)]
print(f"Comparisons after removing self-matches: {len(features)}")

# Save all comparison results to CSV
features.reset_index().to_csv('all_pairwise_comparisons.csv', index=False)
print(f"All pairwise comparison results saved to 'all_pairwise_comparisons.csv'")

print(f"\nComparison completed!")
print(f"Total records: {len(df)}")
print(f"Total pairwise comparisons: {len(features)}")
print(f"Total pairwise matches (rows in features): {features.shape[0]}")
matches = features[features.sum(axis=1) > 2]
print(len(matches))
print(matches.head())
# Collect all matched pairs into a single DataFrame and save as one CSV
matched_pairs = []
for idx, row in matches.iterrows():
    idx1, idx2 = row.name  # MultiIndex: (index1, index2)
    matched_pairs.append({
        'Record1_Index': idx1,
        'Record2_Index': idx2,
        'Record1_Authors': df.loc[idx1, 'Authors'],
        'Record1_Title': df.loc[idx1, 'Title'],
        'Record1_Source': df.loc[idx1, 'Source'],
        'Record2_Authors': df.loc[idx2, 'Authors'],
        'Record2_Title': df.loc[idx2, 'Title'],
        'Record2_Source': df.loc[idx2, 'Source']
    })
matched_pairs_df = pd.DataFrame(matched_pairs)
matched_pairs_df.to_csv('all_matched_pairs.csv', index=False)
print(f"All matched pairs saved to 'all_matched_pairs.csv'")

matches.reset_index().to_csv('matches.csv', index=False)
print(f"Potential matches saved to 'matches.csv'")

Dataset shape: (97, 30)
Sample data:
                                      Authors  \
0                I Lewaa, MS Hafez, MA Ismail   
1  M El Abassi, M Amnai, A Choukri, Y Fakhri…   
2             J Yang, K Xian, P Wang, Y Zhang   
3                     J Yang, S Quan, P Wang…   
4                               Y Zhu, J Yang   

                                               Title  \
0  Data integration using statistical matching te...   
1  Matching data detection for the integration sy...   
2  A performance evaluation of correspondence gro...   
3  Evaluating local geometric feature representat...   
4  Automatic data matching for geospatial models:...   

                            Source  
0  Statistical Journal of the IAOS  
1       International Journal of …  
2   IEEE transactions on pattern …  
3           IEEE Transactions on …  
4                    Annals of GIS  

Cleaning data...
Data cleaning completed.
clean dataframe saved to clean_popcite.csv!

Generating all possib



Generated 9409 candidate pairs (all possible combinations)

Comparison step...
Computing comparisons... This may take a while for large datasets.
Comparisons after removing self-matches: 9312
All pairwise comparison results saved to 'all_pairwise_comparisons.csv'

Comparison completed!
Total records: 97
Total pairwise comparisons: 9312
Total pairwise matches (rows in features): 9312
2
       authors  title  source_exact
48 83      1.0    1.0             1
83 48      1.0    1.0             1
All matched pairs saved to 'all_matched_pairs.csv'
Potential matches saved to 'matches.csv'


In [4]:
# ECM CLASSIFICATION STEP
print("\n" + "="*50)
print("STARTING ECM CLASSIFICATION")
print("="*50)

# Prepare the feature matrix for ECM
print("\nPreparing feature matrix for ECM...")

# Convert comparison results to binary features (0 or 1)
# The ECM algorithm works with binary comparison vectors
X_data = features.astype(int)

print(f"Feature matrix shape: {X_data.shape}")
print(f"Feature columns: {list(X_data.columns)}")
print(f"Sample of feature matrix:")
print(X_data.head())

# Check for any missing values
if X_data.isnull().sum().sum() > 0:
    print("Warning: Found missing values in feature matrix. Filling with 0.")
    X_data = X_data.fillna(0)

# Initialize the ECM Classifier
print("\nInitializing ECM Classifier...")
ecm_classifier = rl.ECMClassifier()

# Fit the ECM model (unsupervised learning)
print("Training ECM model... This may take some time.")
ecm_classifier.fit(X_data)

print("\nECM Training completed!")

# Print the learned parameters
print("\n" + "-"*40)
print("LEARNED ECM PARAMETERS")
print("-"*40)
print(f"Prior probability P(Match): {ecm_classifier.p:.4f}")
print(f"m probabilities P(x_i=1|Match): {ecm_classifier.m_probs}")
print(f"u probabilities P(x_i=1|Non-Match): {ecm_classifier.u_probs}")
print(f"Feature weights: {ecm_classifier.weights}")

# Make predictions using the trained ECM model
print("\n" + "-"*40)
print("MAKING PREDICTIONS")
print("-"*40)

# Predict matches
print("Predicting matches...")
ecm_links = ecm_classifier.predict(X_data)
print(f"ECM predicted {len(ecm_links)} matches out of {len(X_data)} candidate pairs")

# Get match probabilities for all pairs
print("Computing match probabilities...")
match_probabilities = ecm_classifier.prob(X_data)

# Create a comprehensive results DataFrame
print("\nCreating detailed results...")
ecm_results = []

for idx, prob in zip(X_data.index, match_probabilities):
    idx1, idx2 = idx
    is_predicted_match = idx in ecm_links
    
    ecm_results.append({
        'Record1_Index': idx1,
        'Record2_Index': idx2,
        'Record1_Authors': df.loc[idx1, 'Authors'],
        'Record1_Title': df.loc[idx1, 'Title'],
        'Record1_Source': df.loc[idx1, 'Source'],
        'Record2_Authors': df.loc[idx2, 'Authors'],
        'Record2_Title': df.loc[idx2, 'Title'],
        'Record2_Source': df.loc[idx2, 'Source'],
        'Match_Probability': prob,
        'ECM_Prediction': is_predicted_match,
        'Authors_Similarity': X_data.loc[idx, 'authors'] if 'authors' in X_data.columns else 0,
        'Title_Similarity': X_data.loc[idx, 'title'] if 'title' in X_data.columns else 0,
        'Source_Exact_Match': X_data.loc[idx, 'source_exact'] if 'source_exact' in X_data.columns else 0
    })

ecm_results_df = pd.DataFrame(ecm_results)

# Sort by match probability (highest first)
ecm_results_df = ecm_results_df.sort_values('Match_Probability', ascending=False)

# Save all results
ecm_results_df.to_csv('ecm_all_results.csv', index=False)
print(f"All ECM results saved to 'ecm_all_results.csv'")

# Save only the predicted matches
ecm_matches_df = ecm_results_df[ecm_results_df['ECM_Prediction'] == True]
ecm_matches_df.to_csv('ecm_predicted_matches.csv', index=False)
print(f"ECM predicted matches saved to 'ecm_predicted_matches.csv'")

# High-confidence matches (probability > 0.8)
high_conf_matches = ecm_results_df[ecm_results_df['Match_Probability'] > 0.8]
high_conf_matches.to_csv('ecm_high_confidence_matches.csv', index=False)
print(f"High confidence matches (>0.8) saved to 'ecm_high_confidence_matches.csv'")

print("\n" + "="*50)
print("ECM CLASSIFICATION SUMMARY")
print("="*50)
print(f"Total candidate pairs evaluated: {len(X_data)}")
print(f"ECM predicted matches: {len(ecm_matches_df)}")
print(f"High confidence matches (>0.8): {len(high_conf_matches)}")
print(f"Medium confidence matches (0.5-0.8): {len(ecm_results_df[(ecm_results_df['Match_Probability'] > 0.5) & (ecm_results_df['Match_Probability'] <= 0.8)])}")
print(f"Low confidence matches (<0.5): {len(ecm_results_df[ecm_results_df['Match_Probability'] <= 0.5])}")

# Show some statistics about match probabilities
print(f"\nMatch Probability Statistics:")
print(f"Mean: {ecm_results_df['Match_Probability'].mean():.4f}")
print(f"Median: {ecm_results_df['Match_Probability'].median():.4f}")
print(f"Min: {ecm_results_df['Match_Probability'].min():.4f}")
print(f"Max: {ecm_results_df['Match_Probability'].max():.4f}")

# Display top 10 most likely matches
print(f"\nTop 10 Most Likely Matches:")
print("-" * 80)
for idx, row in ecm_matches_df.head(10).iterrows():
    print(f"Probability: {row['Match_Probability']:.4f}")
    print(f"  Record 1: {row['Record1_Authors']} - {row['Record1_Title']}")
    print(f"  Record 2: {row['Record2_Authors']} - {row['Record2_Title']}")
    print(f"  Similarities - Authors: {row['Authors_Similarity']}, Title: {row['Title_Similarity']}, Source: {row['Source_Exact_Match']}")
    print("-" * 80)

# Optional: Create different threshold-based classifications
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
print(f"\nMatches at different probability thresholds:")
for threshold in thresholds:
    count = len(ecm_results_df[ecm_results_df['Match_Probability'] > threshold])
    print(f"  Threshold > {threshold}: {count} matches")

print(f"\nECM Classification completed! Check the generated CSV files for detailed results.")


STARTING ECM CLASSIFICATION

Preparing feature matrix for ECM...
Feature matrix shape: (9312, 3)
Feature columns: ['authors', 'title', 'source_exact']
Sample of feature matrix:
     authors  title  source_exact
0 1        0      1             0
  2        0      1             0
  3        0      1             0
  4        0      1             0
  5        0      1             0

Initializing ECM Classifier...
Training ECM model... This may take some time.

ECM Training completed!

----------------------------------------
LEARNED ECM PARAMETERS
----------------------------------------
Prior probability P(Match): 0.0268
m probabilities P(x_i=1|Match): {'authors': {np.int64(0): np.float64(0.43046841777582595), np.int64(1): np.float64(0.5695315822241744)}, 'title': {np.int64(0): np.float64(0.10476017857410284), np.int64(1): np.float64(0.8952398214258974)}, 'source_exact': {np.int64(0): np.float64(0.9808389900721138), np.int64(1): np.float64(0.019161009927886597)}}
u probabilities P(x_i=1|