In [1]:
import csv
import os
import re
import pandas as pd

In [2]:
def read_csv(results_path="//store-01.hpi.uni-potsdam.de/fg/doellner/florence.boettger/counterfactuals/output/new_results/edits/batch"):
    """Collect the ``edits.csv`` files found one level below ``results_path``.

    Every immediate subdirectory of ``results_path`` that contains a file
    named ``edits.csv`` is treated as one run; the run is identified by the
    name of that subdirectory.

    Args:
        results_path: Directory whose subdirectories hold the ``edits.csv``
            files.

    Returns:
        A tuple ``(edits_by_query, classes)``:

        * ``edits_by_query`` maps each ``query_index`` value (a string, as
          read from the csv) to
          ``{run_name: {"query_edit": ..., "distractor_edit": ...}}``.
        * ``classes`` holds, per discovered file and in discovery order, the
          list of row dicts produced by ``csv.DictReader``.

    Raises:
        OSError: If ``results_path`` cannot be listed or a file cannot be read.
        KeyError: If an ``edits.csv`` header lacks ``query_index``,
            ``query_edit`` or ``distractor_edit``.
    """
    edits_by_query = {}
    classes = []

    for run_name in os.listdir(results_path):
        run_dir = os.path.join(results_path, run_name)
        # Plain files next to the run directories cannot be listed; skip them
        # instead of crashing, and skip directories without an edits.csv.
        if not os.path.isdir(run_dir) or "edits.csv" not in os.listdir(run_dir):
            continue

        filename = os.path.join(run_dir, "edits.csv")
        # newline="" lets the csv module do its own newline handling.
        with open(filename, "r", newline="") as f:
            rows = list(csv.DictReader(f))
        classes.append(rows)

        # The run name is the directory entry itself, so it does not depend on
        # which path separator the platform uses.
        for row in rows:
            edits_by_query.setdefault(row["query_index"], {})[run_name] = {
                "query_edit": row["query_edit"],
                "distractor_edit": row["distractor_edit"],
            }

    return edits_by_query, classes

In [3]:
# Load the edits using read_csv's default results path.
# NOTE(review): binding the result to `dict` shadows the builtin `dict` type in
# the notebook namespace from here on; later cells read this name.
dict, classes = read_csv()

In [4]:
# For every query, check whether all runs produced the same query edit and/or
# the same distractor edit, and label the query accordingly:
#   "identical" - both edits agree across runs
#   "partial"   - exactly one of the two agrees
#   "none"      - neither agrees
# Note that the partial counter also includes the identical cases.
matches = {}
query_matches = distractor_matches = identical_matches = partial_matches = 0

for query_index, per_run in dict.items():
    run_edits = list(per_run.values())
    reference = run_edits[0]
    query_identical = all(e["query_edit"] == reference["query_edit"] for e in run_edits)
    distractor_identical = all(e["distractor_edit"] == reference["distractor_edit"] for e in run_edits)

    if query_identical:
        query_matches += 1
    if distractor_identical:
        distractor_matches += 1

    if not (query_identical or distractor_identical):
        matches[query_index] = "none"
        continue

    # At least one of the two edits agrees across all runs.
    partial_matches += 1
    if query_identical and distractor_identical:
        identical_matches += 1
        matches[query_index] = "identical"
    else:
        matches[query_index] = "partial"

# Fractions relative to the total number of queries.
print(f"query match: {query_matches / len(dict)}")
print(f"distractor match: {distractor_matches / len(dict)}")
print(f"identical match: {identical_matches / len(dict)}")
print(f"partial match: {partial_matches / len(dict)}")

query match: 0.7129902516436182
distractor match: 0.5871684425300385
identical match: 0.4676944003627295
partial match: 0.8324642938109272


In [5]:
# Write one row per query: its index, its class and its agreement label.
path = "//store-01.hpi.uni-potsdam.de/fg/doellner/florence.boettger/counterfactuals/output/new_results/edits/matches.csv"

# Build the query_index -> query_class lookup once instead of rescanning the
# first file's rows for every query. All files are consulted and the first
# occurrence wins, so rows of classes[0] keep priority while queries that only
# appear in a later file no longer fail the lookup.
query_class_by_index = {}
for rows in classes:
    for row in rows:
        query_class_by_index.setdefault(row["query_index"], row["query_class"])

with open(path, "w", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["query_index", "query_class", "match"])
    writer.writeheader()
    for query_index, m in matches.items():
        writer.writerow({
            "query_index": query_index,
            "query_class": query_class_by_index[query_index],
            "match": m,
        })

In [6]:
# Read the csv at `path` into a DataFrame for the pandas-based analysis below.
data = pd.read_csv(path)
data

Unnamed: 0,query_index,query_class,match
0,0,0,identical
1,1,0,identical
2,4,0,partial
3,6,0,partial
4,7,0,identical
...,...,...,...
4406,5789,199,identical
4407,5790,199,partial
4408,5791,199,identical
4409,5792,199,identical


In [7]:
# Number of queries for every (query_class, match label) combination,
# flattened into a regular table with a "count" column.
counts = (
    data
    .groupby(["query_class", "match"])
    .size()
    .reset_index(name="count")
)
counts

Unnamed: 0,query_class,match,count
0,0,identical,9
1,0,none,1
2,0,partial,12
3,1,identical,13
4,1,none,3
...,...,...,...
549,198,identical,12
550,198,none,5
551,198,partial,10
552,199,identical,16


In [8]:
# Classes ranked by the number of queries labelled "none", largest first.
is_none = counts["match"] == "none"
counts.loc[is_none].sort_values(by="count", ascending=False)[["query_class", "count"]]

Unnamed: 0,query_class,count
253,91,13
172,62,12
529,191,12
169,61,11
181,65,10
...,...,...
482,174,1
476,172,1
473,171,1
361,129,1


In [9]:
# Classes ranked by the number of queries labelled "partial", largest first.
is_partial = counts["match"] == "partial"
counts.loc[is_partial].sort_values(by="count", ascending=False)[["query_class", "count"]]

Unnamed: 0,query_class,count
344,123,18
536,193,16
141,50,16
332,119,15
498,180,15
...,...,...
212,77,3
410,145,2
176,63,2
170,61,2


In [10]:
# Per class, the total number of queries that are not labelled "identical"
# (i.e. "partial" plus "none"), sorted with the largest total first.
valid_matches = (
    counts
    .loc[counts["match"] != "identical"][["query_class", "count"]]
    .groupby("query_class")
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
valid_matches

Unnamed: 0,query_class,count
0,191,25
1,91,24
2,62,24
3,123,21
4,193,21
...,...,...
183,147,5
184,155,4
185,5,4
186,31,4


In [16]:
# Keep only the classes with at least 20 non-identical queries.
has_enough = valid_matches["count"] >= 20
relevant_matches = valid_matches.loc[has_enough]
relevant_matches

Unnamed: 0,query_class,count
0,191,25
1,91,24
2,62,24
3,123,21
4,193,21
5,34,20
6,72,20


In [17]:
# Inspect the classes with ids 186 through 191 (both ends included).
valid_matches.loc[valid_matches["query_class"].between(186, 191)]

Unnamed: 0,query_class,count
0,191,25
16,187,18
32,190,16
115,188,11
163,186,8
180,189,6


In [19]:
# Save the filtered classes as csv; index=False leaves the DataFrame's row
# index out of the file.
relevant_path = "//store-01.hpi.uni-potsdam.de/fg/doellner/florence.boettger/counterfactuals/output/new_results/edits/relevant_matches.csv"
relevant_matches.to_csv(relevant_path, index=False)