# Readers

> Readers are responsible for loading evaluation repositories and providing a unified interface for accessing their contents.

In [None]:
#| default_exp readers

In [None]:
#| export
import hashlib
import json
from abc import ABC, abstractmethod
from itertools import zip_longest
from pathlib import Path

import pandas as pd
from rich import print
from fastcore.all import *

The `EvalReader` interface defines a common contract that all evaluation repository readers (e.g. IOM, UNHCR) must implement:

In [None]:
#| exports
class EvalReader(ABC):
    """Abstract base class for evaluation repository readers.

    Subclasses implement `read`, `tfm` and `to_json`; calling an instance
    chains `read` and `tfm`.
    """

    def __init__(self, cfg):
        """Keep the reader configuration on the instance as `cfg`."""
        self.cfg = cfg

    @abstractmethod
    def read(self):
        """Load the raw repository data (implemented by subclasses)."""

    @abstractmethod
    def tfm(self, df):
        """Transform the data returned by `read` (implemented by subclasses)."""

    @abstractmethod
    def to_json(self, output_path):
        """Serialize the evaluations to `output_path` (implemented by subclasses)."""

    def __call__(self):
        """Run `read` and pass its result straight to `tfm`."""
        return self.tfm(self.read())

## IOM Reader

In [None]:
#| exports
def iom_input_cfg():
    """Return a fresh configuration dict describing the IOM excel export."""
    # Columns holding dates.
    date_cols = [
        'Date of Publication',
        'Evaluation Period From Date',
        'Evaluation Period To Date',
    ]
    # Columns holding several values in one cell, with their separator.
    list_fields = {'Countries Covered': {'separator': ',', 'clean': True}}
    # Columns describing the documents attached to an evaluation.
    document_fields = ['Document Subtype', 'File URL', 'File description']
    # How the evaluation identifier is derived.
    id_gen = {
        'method': 'md5',
        'fields': ['Title', 'Year', 'Project Code'],  # fields to hash
    }
    # Source column name -> output field name.
    field_mappings = {
        'Title': 'title',
        'Year': 'year',
        # other mappings
    }

    cfg = {}
    cfg['sheet_name'] = 'extract from 2005 to Aug 2024'
    cfg['date_cols'] = date_cols
    cfg['string_cols'] = ['Year']
    cfg['list_fields'] = list_fields
    cfg['document_fields'] = document_fields
    cfg['id_gen'] = id_gen
    cfg['field_mappings'] = field_mappings
    return cfg

In [None]:
#| eval: false
# Load the IOM configuration; only its sheet name is needed here.
cfg = iom_input_cfg()
fname = Path('files/test/eval_repo_iom.xlsx')
df = pd.read_excel(fname, sheet_name=cfg['sheet_name'])
# Preview the first two rows of the raw sheet.
df.head(2)

Unnamed: 0,Title,Year,Author,Best Practicesor Lessons Learnt,Date of Publication,Donor,Evaluation Brief,Evaluation Commissioner,Evaluation Coverage,Evaluation Period From Date,...,Type of Evaluator,Level of Evaluation,Document Subtype,File URL,File description,Management response,Date added,Metaevaluation,exclude,reason
0,EX-POST EVALUATION OF THE PROJECT: NIGERIA: S...,2023,Abderrahim El Moulat,Yes,2023-05-10,Government of Germany,Yes,"Donor, IOM",Country,NaT,...,Internal,Decentralized,"Evaluation report, Evaluation brief",https://evaluation.iom.int/sites/g/files/tmzbd...,"Evaluation Report , Evaluation Brief",No,"Fri, 07/07/2023 - 15:35",2020-24,,
1,FINAL EVALUATION OF THE PROJECT: STRENGTHEN BO...,2023,Abderrahim El Moulat,Yes,2023-02-14,Government of Canada,Yes,"Donor, IOM",Multi-country,NaT,...,Internal,Decentralized,"Evaluation report, Evaluation brief",https://evaluation.iom.int/sites/g/files/tmzbd...,"Evaluation Report , Evaluation Brief",No,"Fri, 05/19/2023 - 16:49",2020-24,,


In [None]:
#| exports
class IOMRepoReader(EvalReader):
    """Reader for the IOM evaluation repository excel export.

    The configuration comes from `iom_input_cfg()`. This reader consumes its
    `sheet_name`, `date_cols`, `list_fields` (separator only) and
    `id_gen['fields']` entries.
    """

    def __init__(self,
                 fname, # path to the excel file
                 max_n=None): # max number of rows to read
        cfg = iom_input_cfg()
        super().__init__(cfg)
        self.fname = fname
        self.max_n = max_n

    def read(self):
        """Read the excel file and return a dataframe.

        When `max_n` is not None only the first `max_n` rows are kept.
        """
        df = pd.read_excel(self.fname, sheet_name=self.cfg['sheet_name'])
        # Compare against None so that `max_n=0` yields an empty frame
        # instead of being treated as "no limit".
        if self.max_n is not None:
            df = df.head(self.max_n)
        return df

    def tfm(self, df):
        """Transform the dataframe into a list of evaluations.

        Each evaluation is a dict with keys `id`, `docs` and `meta`. `meta`
        holds every column of `df` except ones named `id` or `docs`. Missing
        values (NaN, NaT, None) become `None` and missing list fields become
        `[]`, so the result serializes to standard JSON.
        """
        df_proc = df.copy()

        # Process dates: stringify, but keep missing dates as None rather
        # than the literal string 'NaT'.
        for col in self.cfg['date_cols']:
            is_null = df_proc[col].isna()
            as_text = df_proc[col].astype(str)
            df_proc[col] = [None if null else text
                            for text, null in zip(as_text, is_null)]

        # Process list fields: a missing cell gives an empty list instead of
        # the bogus single item 'nan'.
        for col, opts in self.cfg['list_fields'].items():
            sep = opts['separator']
            df_proc[col] = df_proc[col].apply(
                lambda val, sep=sep: self._split(val, sep, drop_empty=True)
            )

        # Collect metadata columns
        meta_cols = [col for col in df_proc.columns if col not in ('id', 'docs')]

        # Build the records in a single pass over the rows. This also copes
        # with an empty dataframe, where a row-wise `DataFrame.apply` would
        # not return a Series.
        return [
            {
                'id': self._mk_id(row),
                'docs': self._mk_docs(row),
                'meta': {
                    col: None if self._is_missing(row[col]) else row[col]
                    for col in meta_cols
                },
            }
            for _, row in df_proc.iterrows()
        ]

    def to_json(self, out_path):
        """Read, transform and write the evaluations to `out_path` as JSON."""
        evals = self()
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(evals, f, indent=4, ensure_ascii=False)

    @staticmethod
    def _is_missing(val):
        """Return True if `val` is a scalar missing marker (None, NaN, NaT, ...)."""
        # Lists are never "missing": `pd.isna` would return an element-wise
        # array for them, which has no single truth value.
        return pd.api.types.is_scalar(val) and bool(pd.isna(val))

    @classmethod
    def _split(cls, val, sep, drop_empty=False):
        """Split a cell on `sep` and strip each part; a missing cell gives `[]`."""
        if cls._is_missing(val):
            return []
        parts = [part.strip() for part in str(val).split(sep)]
        return [part for part in parts if part] if drop_empty else parts

    def _mk_docs(self, row):
        """Build the list of documents described by one row.

        The three document columns are split on ', ' and paired by position.
        An entry is kept only if it has a non-empty URL. If a required column
        is absent the error is printed and `[]` is returned.
        """
        try:
            stypes = self._split(row['Document Subtype'], ', ')
            urls = self._split(row['File URL'], ', ')
            descs = self._split(row['File description'], ', ')
        except KeyError as e:
            print(f"Error processing documents for row: {e}")
            return []

        # `zip_longest` keeps URLs that have no matching subtype/description
        # (filled with ''), where plain `zip` would silently drop them.
        return [
            {
                'Document Subtype': stype,
                'File URL': url,
                'File description': desc,
            }
            for stype, url, desc in zip_longest(stypes, urls, descs, fillvalue='')
            if url
        ]

    def _mk_id(self, row):
        """Generate MD5 hash from specified fields"""
        fields = self.cfg['id_gen']['fields']

        # The fields are concatenated without a separator, and a missing
        # value contributes its string form. This is kept as is so that
        # existing identifiers do not change.
        id_str = ''.join(str(row[field]) for field in fields)

        return hashlib.md5(id_str.encode('utf-8')).hexdigest()

To use the reader:

In [None]:
# Keep only the first 10 rows of the test file.
reader = IOMRepoReader('files/test/eval_repo_iom.xlsx', max_n=10)

In [None]:
# Calling the reader runs `read()` and then `tfm()` on the result.
evaluations = reader()

The reader returns a list of Python dictionaries (serializable to JSON), where each dictionary represents an evaluation with:

- `id`: An MD5 hash identifier generated from the fields listed under `id_gen` in the configuration (`Title`, `Year` and `Project Code`)
- `docs`: A list of associated documents, each containing:
  - `Document Subtype`: Type of evaluation document (e.g. report, brief)
  - `File URL`: Direct link to download the document
  - `File description`: Brief description of the document contents
- `meta`: Additional metadata about the evaluation

In [None]:
# Top-level keys of the first evaluation record.
print(evaluations[0].keys())

In [None]:
# Full content of the first evaluation record.
print(evaluations[0])

Then serialize as `json` for further use:

In [None]:
#| eval: false
# `to_json` runs the reader again and writes the result to the given path.
reader.to_json('../_data/output/evaluations.json')