In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score)
from extractor import classify_replication 

In [2]:
# Load the labelled sample of abstracts from disk.
df = pd.read_csv('sample_abstracts.csv')

# Clean dataframe: keep only the relevant columns and remove rows with missing
# values in any required field (outcome_quote is kept but allowed to be empty).
required_cols = ['doi_r', 'abstract_r', 'outcome']
df_clean = df.loc[:, required_cols + ['outcome_quote']].dropna(subset=required_cols)

In [3]:
# Discard every row whose outcome label is not one of the three recognised values.
valid_outcomes = ['successful', 'failed', 'mixed']
has_valid_outcome = df_clean['outcome'].isin(valid_outcomes)
df_clean = df_clean.loc[has_valid_outcome].copy()

In [4]:
# Preview the first five cleaned rows as a sanity check.
df_clean.head(n=5)

Unnamed: 0,doi_r,abstract_r,outcome,outcome_quote
0,10.1007/s40732-024-00601-4,\nResponse disequilibrium theory suggests that...,successful,Results of the current study align with prior ...
1,10.1371/journal.pone.0313619,"A core feature of eating disorders, such as an...",successful,Low-BMI control participants overestimated the...
2,10.1080/10926771.2022.2038753,A wealth of research has consistently identifi...,successful,We aimed to replicate findings that individual...
3,10.1080/10926771.2022.2038753,A wealth of research has consistently identifi...,successful,We aimed to replicate findings that individual...
4,10.1177/0956797617734315,Abel and Kruger (2010) found that the smile in...,failed,"In both samples and for all three indicators, ..."


In [5]:
# Class balance of the labelled outcomes (number of rows per label).
outcome_counts = df_clean['outcome'].value_counts()
print(outcome_counts)

outcome
failed        96
successful    60
mixed         25
Name: count, dtype: int64


In [6]:
# Accumulators for the classification run; each name gets its own empty list.
predictions, model_outcomes, model_quotes, errors = [], [], [], []

In [7]:
# Classify every abstract and pair each prediction with its hand-coded outcome.
# The result lists are (re)initialised here so that re-running this cell starts
# from a clean slate instead of appending duplicates to an earlier run.
predictions = []
errors = []

# enumerate() supplies a gap-free 1-based progress counter. The DataFrame index
# label is not suitable for that: rows were dropped during cleaning, so labels
# skip numbers, and `label + 1` fails outright for a non-integer index.
for position, (_, row) in enumerate(df_clean.iterrows(), start=1):
    doi = row['doi_r']
    try:
        # Run the classification function
        result = classify_replication(row['abstract_r'])

        if result:
            predictions.append({
                'doi_r': doi,
                'predicted_outcome': result.outcome.value,
                'predicted_quote': result.proof,
                'confidence': result.confidence,
                'actual_outcome': row['outcome']
            })
            print(f"✓ {position}. {doi}")
        else:
            # A falsy result is recorded as a failure rather than as a prediction.
            errors.append({'doi_r': doi, 'error': 'Classification returned None'})
            print(f"✗ {position}. {doi} - Classification failed")

    except Exception as e:
        # Best-effort batch run: log the failure for this abstract and carry on
        # with the next one rather than aborting the whole evaluation.
        errors.append({'doi_r': doi, 'error': str(e)})
        print(f"✗ {position}. {doi} - Error: {str(e)}")

# Convert predictions to DataFrame. Naming the columns keeps the frame
# well-formed (and later column lookups valid) even if nothing was classified.
predictions_df = pd.DataFrame(
    predictions,
    columns=['doi_r', 'predicted_outcome', 'predicted_quote', 'confidence', 'actual_outcome'],
)

✓ 1. 10.1007/s40732-024-00601-4
✓ 2. 10.1371/journal.pone.0313619
✓ 3. 10.1080/10926771.2022.2038753
✓ 4. 10.1080/10926771.2022.2038753
✓ 5. 10.1177/0956797617734315
✓ 6. 10.1002/ejsp.2748
✓ 8. 10.1111/padm.12860
✓ 9. 10.1111/desc.13244
✓ 10. 10.3758/s13423-013-0549-2
✓ 11. 10.3758/s13423-013-0549-2
✓ 12. 10.1027/1864-1105/a000334
✓ 13. 10.1027/1618-3169/a000567
✓ 14. 10.1027/1618-3169/a000567
✓ 15. 10.1027/1618-3169/a000567
✓ 16. 10.3758/s13423-024-02602-4
✓ 17. 10.3758/s13423-024-02602-4
✓ 18. 10.1002/pros.21320
✓ 19. 10.1017/S0272263124000238
✓ 20. 10.1038/s41467-024-46936-y
✓ 21. 10.1093/applin/amae042
✓ 22. 10.1007/s00426-019-01203-4
✓ 23. 10.1017/XPS.2017.8
✓ 24. 10.1002/erv.3103
✓ 26. 10.1007/s11251-024-09670-y
✓ 27. 10.1007/s11251-024-09670-y
✓ 28. 10.1016/j.jom.2005.11.004
✓ 29. 10.1002/jeab.915
✓ 30. 10.17605/OSF.IO/RGE23
✓ 31. 10.1027/1864-9335/a000178
✗ 32. 10.1027/1864-9335/a000178 - Classification failed
✓ 33. 10.1027/1864-9335/a000178
✓ 34. 10.1027/1864-9335/a000178
✓ 35

MODEL EVALUATION

In [8]:
# Preview the first five predictions next to their hand-coded outcomes.
predictions_df.head(n=5)

Unnamed: 0,doi_r,predicted_outcome,predicted_quote,confidence,actual_outcome
0,10.1007/s40732-024-00601-4,successful,Results of the current study align with prior ...,high,successful
1,10.1371/journal.pone.0313619,mixed,Low-BMI control participants overestimated the...,high,successful
2,10.1080/10926771.2022.2038753,successful,We aimed to replicate findings that individual...,high,successful
3,10.1080/10926771.2022.2038753,successful,We aimed to replicate findings that individual...,high,successful
4,10.1177/0956797617734315,failed,"In both samples and for all three indicators, ...",high,failed


In [9]:
# Hand-coded and predicted labels, taken row-for-row from the same frame so the
# two arrays line up for the scikit-learn metric functions.
y_true = predictions_df.loc[:, 'actual_outcome'].values
y_pred = predictions_df.loc[:, 'predicted_outcome'].values

In [12]:
# Headline metrics. The three class-wise scores share the same settings:
# 'macro' takes the unweighted mean over classes, and zero_division=0 reports 0
# (instead of warning) for a class with no predicted or no true samples.
macro_kwargs = {'average': 'macro', 'zero_division': 0}

accuracy = accuracy_score(y_true, y_pred)
precision_macro = precision_score(y_true, y_pred, **macro_kwargs)
recall_macro = recall_score(y_true, y_pred, **macro_kwargs)
f1_macro = f1_score(y_true, y_pred, **macro_kwargs)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision_macro:.4f}")
print(f"Recall (macro): {recall_macro:.4f}")
print(f"F1-Score (macro): {f1_macro:.4f}")

Accuracy: 0.7654
Precision (macro): 0.7769
Recall (macro): 0.8102
F1-Score (macro): 0.7392


In [13]:
# Per-class precision / recall / F1, with the rows in a fixed label order.
separator = "-"*80
print("\n" + separator)
print("CLASSIFICATION REPORT")
print(separator)
print(classification_report(y_true, y_pred, labels=['successful', 'failed', 'mixed'], zero_division=0))


--------------------------------------------------------------------------------
CLASSIFICATION REPORT
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

  successful       0.96      0.74      0.83        58
      failed       1.00      0.73      0.84        96
       mixed       0.38      0.96      0.54        25

    accuracy                           0.77       179
   macro avg       0.78      0.81      0.74       179
weighted avg       0.90      0.77      0.80       179

