# Identifying duplicate submissions

## Setup

In [1]:
import os
import sys
import itertools
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

In [2]:
# Import the dataloader
# The directory "../04_dataset_access/" (resolved to an absolute path) is
# appended to sys.path once, so the project imports below can be resolved.
# NOTE(review): presumably this is where the `dataloader` module lives; where
# `evaluation` is found is not visible from this cell - confirm.
module_path = os.path.abspath(os.path.join("../04_dataset_access/"))
if module_path not in sys.path:
    sys.path.append(module_path)

# Load the autoreload extension in mode 2 before the project imports, so that
# edits to those modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import evaluation
from dataloader import Dataloader # type: ignore

In [3]:
# Locations of the feedback table and of the attachment files
FEEDBACK_CSV = (
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
ATTACHMENTS_DIR = "../24212003_requirements_for_artificial_intelligence/attachments/"

# Document-level chunking (tokenization switched off)
df_doc = pd.read_csv(FEEDBACK_CSV)
df_doc = Dataloader("document", n_jobs=12, tokenize=False).from_folder(
    ATTACHMENTS_DIR, df_doc
)

# Page-level chunking; the CSV is read afresh so each loader receives its own frame
df_page = pd.read_csv(FEEDBACK_CSV)
df_page = Dataloader("page", n_jobs=12).from_folder(ATTACHMENTS_DIR, df_page)

# Sanity check on the number of page-level rows.
# NOTE(review): the meaning of the three terms (2282, 299, 13) is not stated
# anywhere in this notebook - confirm and document.
assert len(df_page) == 2282 + 299 - 13
df_page.head(1)

Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected,tokenized
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[equinet, welcome, the, opportunity, to, provi..."


In [4]:
# Match the internal ids of topic_modeling.ipynb:
# rows detected as English with an empty token list are moved to the end,
# all remaining rows stay in front in their original relative order.
token_count = df_page["tokenized"].str.len()
detected_language = df_page["language_detected"]
leading_rows = df_page[(token_count > 0) | (detected_language != "en")]
trailing_rows = df_page[(token_count == 0) & (detected_language == "en")]
texts = pd.concat([leading_rows, trailing_rows], ignore_index=True)

# Add some helpful columns:
#   page        - running counter of the rows sharing one `id`, starting at 0
#   is_feedback - True for the first row of each `id`
#                 (presumably the feedback text itself, ahead of any
#                 attachment pages - confirm against the Dataloader)
texts = texts.assign(
    page=lambda frame: frame.groupby("id").cumcount(),
    is_feedback=lambda frame: frame["page"].eq(0),
)

## Identify duplicate candidates

In [5]:
# Run the duplicate detection twice: first on the page-level frame (`texts`),
# then on the document-level frame (`df_doc`)
duplicates_page, duplicates_doc = map(evaluation.duplicates, (texts, df_doc))

In [6]:
from evaluation import (
    _dataset_duplicates_page_false_positives as false_positives_page,
    _dataset_duplicates_page_true_positives as true_positives_page,
)

# Remove the candidates already listed as true positives; these are matched
# on the (id_i, id_j) pair
is_true_positive = (
    duplicates_page[["id_i", "id_j"]].apply(tuple, axis=1).isin(true_positives_page)
)
df = duplicates_page[~is_true_positive]

# Remove the candidates listed as false positives or irrelevant; these are
# matched on the (i, j) pair
# NOTE(review): `i`/`j` are presumably the row positions in `texts` - confirm
is_false_positive = df[["i", "j"]].apply(tuple, axis=1).isin(false_positives_page)
df = df[~is_false_positive]

# Columns to display; `cols_` is the core subset without indices and flags
cols_ = ["id_i", "id_j", "similarity", "organization_i", "organization_j"]
cols = ["i", "j", *cols_, "is_feedback_i", "is_feedback_j"]

# Show the remaining (not yet classified) candidates, most similar first
df[cols].sort_values("similarity", ascending=False).head()

Unnamed: 0,i,j,id_i,id_j,similarity,organization_i,organization_j,is_feedback_i,is_feedback_j
74,436,2164,2665595,2662492,0.530941,INFOBALT,Google,True,False
161,1408,2116,2665420,2662901,0.530808,European Savings and Retail Banking Group,European Association of Co-operative Banks,False,False
1,10,415,2665650,2665603,0.529833,AI Austria,European AI Forum,False,False
85,496,1192,2665578,2665462,0.528036,Women in AI Austria,Access Now Europe,False,False
156,1349,1719,2665431,2665231,0.527429,Związek Pracodawców Business & Science Poland,OpenAI,True,True


## Results

In [7]:
# Show the duplicates or adaptations we've identified on a page-level:
# keep only the candidates whose (id_i, id_j) pair is listed as a true positive
is_true_positive = (
    duplicates_page[["id_i", "id_j"]].apply(tuple, axis=1).isin(true_positives_page)
)
df = duplicates_page[is_true_positive]

# Sort by similarity (descending) and keep the first row of every
# (id_i, id_j) pair, i.e. one row per pair of submissions
(
    df.sort_values("similarity", ascending=False)
    .drop_duplicates(["id_i", "id_j"])
    .loc[:, ["similarity", "organization_i", "organization_j"]]
)

Unnamed: 0,similarity,organization_i,organization_j
2,1.0,Digitalcourage e.V.,European Digital Rights (EDRi)
169,1.0,German Education Union (GEW),ETUCE
185,1.0,ETUCE,ČMOS PŠ
174,1.0,German Education Union (GEW),ČMOS PŠ
111,0.98534,Beltug,CIO Platform Nederland
97,0.895487,European DIGITAL SME Alliance,Small Business Standards (SBS)
166,0.862336,Biogen,European Federation of Pharmaceutical Industri...
197,0.802432,Vereinigung der Arbeitgeberverbände der Deutsc...,BDA - Bundesvereinigung der Deutschen Arbeitge...
47,0.769739,European Federation of Psychologists’ Associat...,European Test Publishers Group (ETPG)
171,0.767765,German Education Union (GEW),Teachers' Union of Ireland


In [8]:
# Document-level (i, j) pairs regarded as false positives or irrelevant
false_positives = [(60, 274), (95, 203)]

# Drop those pairs from the document-level candidates
doc_pairs = duplicates_doc[["i", "j"]].apply(tuple, axis=1)
duplicates_doc = duplicates_doc.loc[~doc_pairs.isin(false_positives)]

# Display some relevant columns, most similar pair first
duplicates_doc[cols_].sort_values("similarity", ascending=False)

Unnamed: 0,id_i,id_j,similarity,organization_i,organization_j
0,2665649,2665234,0.981435,Digitalcourage e.V.,European Digital Rights (EDRi)
7,2665205,2662780,0.959017,German Education Union (GEW),ČMOS PŠ
9,2663356,2663263,0.951901,Vereinigung der Arbeitgeberverbände der Deutsc...,BDA - Bundesvereinigung der Deutschen Arbeitge...
2,2665574,2665497,0.906641,European DIGITAL SME Alliance,Small Business Standards (SBS)
4,2665563,2665479,0.904971,Beltug,CIO Platform Nederland
8,2663486,2662780,0.614866,ETUCE,ČMOS PŠ
6,2665205,2663486,0.592789,German Education Union (GEW),ETUCE
1,2665586,2660588,0.502885,Philips,European Coordination Committee of the Radiolo...
