In [10]:
import os
import pandas as pd
import re

# Folder holding the per-round CSV files (round_<N>.csv); the combined
# results file is written into this same folder at the end of the notebook.
base_dir = "rounds"

In [11]:
def load_and_tag_rounds(folder):
    """Build one table of per-round inclusion flags from ``round_<N>.csv`` files.

    Every file in ``folder`` whose complete name is ``round_<N>.csv`` is read.
    Its ``included`` column is turned into a 0/1 column named ``round_<N>``
    (1 only where ``included == 1``), and the per-round tables are
    outer-merged on ``doi``.

    Parameters
    ----------
    folder : str
        Directory to scan for round files.

    Returns
    -------
    pandas.DataFrame
        A ``doi`` column followed by one ``round_<N>`` column per file, in
        ascending round-number order, with one row per DOI. A DOI that does
        not appear in a given round has NaN in that round's column. If no
        file matches, an empty frame with only a ``doi`` column is returned.

    Raises
    ------
    KeyError
        If a matching file has no ``doi`` or ``included`` column.
    """
    round_files = []
    for fname in os.listdir(folder):
        # fullmatch: a prefix match would also pick up e.g. "round_1.csv.bak".
        round_match = re.fullmatch(r'round_(\d+)\.csv', fname)
        if round_match:
            round_files.append((int(round_match.group(1)), fname))

    # os.listdir returns names in arbitrary order; sort numerically so the
    # column order is reproducible (round_2 before round_10).
    round_files.sort()

    result = None
    for round_num, fname in round_files:
        col_name = f"round_{round_num}"
        df = pd.read_csv(os.path.join(folder, fname))
        df = df[['doi', 'included']].copy()
        df[col_name] = (df['included'] == 1).astype(int)
        # A DOI listed several times in one round would multiply rows in the
        # merge below. Keep a single row per DOI, flagged 1 if any of its rows
        # was included, then restore the file's original row order.
        df = (
            df[['doi', col_name]]
            .sort_values(col_name, ascending=False, kind='mergesort')
            .drop_duplicates(subset='doi')
            .sort_index()
        )
        if result is None:
            result = df
        else:
            result = pd.merge(result, df, on='doi', how='outer')

    if result is None:
        return pd.DataFrame(columns=['doi'])
    return result

# Load and tag all rounds
all_rounds_df = load_and_tag_rounds(base_dir)

# Record the per-round flag columns now, before metadata is attached, so a
# metadata column whose name happens to start with "round_" is never
# zero-filled or cast to int below.
round_cols = [col for col in all_rounds_df.columns if col.startswith('round_')]

# Optionally, merge with metadata from round_1.csv
meta_path = os.path.join(base_dir, "round_1.csv")
if os.path.exists(meta_path):
    meta_df = pd.read_csv(meta_path)
    meta_cols = [col for col in meta_df.columns if col not in all_rounds_df.columns and col != 'included']
    # Keep exactly one metadata row per DOI (the first). Deduplicating on whole
    # rows would leave several rows for a DOI whose metadata differs slightly,
    # and the left merge would then duplicate that DOI in the result.
    meta_unique = meta_df[['doi'] + meta_cols].drop_duplicates(subset='doi')
    all_rounds_df = all_rounds_df.merge(meta_unique, on='doi', how='left')

# Fill NaN with 0 (not marked relevant in that round)
for col in round_cols:
    all_rounds_df[col] = all_rounds_df[col].fillna(0).astype(int)


In [12]:
# Per-round tally: how many DOIs carry a 1 in each round_<N> column
round_flag_names = [name for name in all_rounds_df.columns if name.startswith('round_')]
for name in round_flag_names:
    relevant_total = all_rounds_df[name].sum()
    print(f"{name}: {relevant_total} relevant")

# Number of distinct DOI values in the combined table (a missing DOI counts as one value)
distinct_doi_count = all_rounds_df['doi'].nunique(dropna=False)
print(f"Total unique DOIs: {distinct_doi_count}")


round_2: 25 relevant
round_1: 25 relevant
Total unique DOIs: 403


In [13]:
# Write the combined table as CSV into the rounds folder, leaving out the
# DataFrame index, then report where it went.
output_file = os.path.join(base_dir, "combined_results.csv")
all_rounds_df.to_csv(path_or_buf=output_file, index=False)
print(f"Combined dataset saved as {output_file}")

Combined dataset saved as rounds/combined_results.csv
