In [2]:
from itertools import chain
from collections import defaultdict, Counter
import requests, re, os, json, datetime, tqdm, unicodedata, ast
from typing import Union, List, Dict, Any, Tuple
from IPython.display import HTML
import fitz  # PyMuPDF

# Today's date (local time) and its YYYY-MM-DD rendering, available for stamping output names.
current_date = datetime.date.today()
date_string = f'{current_date:%Y-%m-%d}'

import pandas as pd
import numpy as np

from Bio import Entrez

from extralit.extraction.models import Observation, ITNCondition, EntomologicalOutcome, ClinicalOutcome
from extralit.metrics import *
from extralit.convert import *

import mendeley
from mendeley import Mendeley
from mendeley.auth import MendeleyLoginAuthenticator, MendeleySession, MendeleyClientCredentialsAuthenticator

In [434]:
# Your Mendeley API credentials
# SECURITY: the client secret used to be hard-coded in this cell. It is now read from the
# environment so it never ends up in version control or shared notebooks. The value that was
# previously written here should be considered leaked and rotated.
client_id = os.environ.get('MENDELEY_CLIENT_ID', '16777')
# None when MENDELEY_CLIENT_SECRET is unset.
# NOTE(review): presumably the implicit-grant flow used below works without a secret — confirm.
client_secret = os.environ.get('MENDELEY_CLIENT_SECRET')
redirect_uri = 'http://localhost/callback'

# Specify the "NetRecalibrationAutoExtract" group ID to download publications
group_id = 'dd44f670-adee-3872-be32-90f7bb5612c4'

## Authenticate

In [21]:
# session = mendeley.start_client_credentials_flow().authenticate()
# session

In [435]:
# Build the Mendeley API client from `client_id`, `client_secret` and `redirect_uri`.
# These values should match the ones supplied when registering your application.
# NOTE(review): this rebinds the name `mendeley`, which the import cell bound to the
# `mendeley` module; after this cell the name refers to the client object instead.
mendeley = Mendeley(client_id, client_secret, redirect_uri=redirect_uri)

# Start the implicit-grant flow on the client.
auth = mendeley.start_implicit_grant_flow()

# The user needs to visit this URL, and log in to Mendeley.
login_url = auth.get_login_url()

# After logging in, the user will be redirected to a URL, auth_response.
# Show the login URL in the cell output.
display(login_url)

In [None]:
# Fill in the value obtained after logging in through `login_url`, then run this cell to
# create `session`, which the rest of the notebook uses for every Mendeley API call.
# NOTE(review): confirm against the Mendeley SDK whether `authenticate` expects the bare
# token or the full redirect URL.
access_token = '' # Add the access token here
session = auth.authenticate(access_token)

## Functions

In [None]:
def first_non_stop_word(words, stop_words = {'the', 'a', 'an', 'is', 'was', 'in', 'to', 'and', 'it', 'of', 'on', 'journal'}):
    """Return the first purely alphanumeric word that is not a stop word.

    Args:
        words: List of word strings. Any other type (e.g. NaN from a missing title)
            yields None.
        stop_words: Lower-cased words to skip. The default set is only read, never mutated.

    Returns:
        The selected word, unchanged if it is all upper-case (acronyms), otherwise
        capitalized; None when no word qualifies.
    """
    if not isinstance(words, list):
        return None
    for word in words:
        # Skip non-string tokens instead of failing on them.
        if not isinstance(word, str):
            continue
        # Fixed: the original called the non-existent `str.isanum` and then read a
        # misspelled variable, so every call on a non-empty list raised.
        if word.isalnum() and word.lower() not in stop_words:
            first_word = re.sub(r'[^\w]', '', word)
            return first_word if first_word.isupper() else first_word.capitalize()

    return None  # return None if there is no non-stop word

def _first_author_last_name(authors):
    """Return the `last_name` of the first author entry, or None when unavailable."""
    if isinstance(authors, (list, tuple)) and authors and isinstance(authors[0], dict):
        return authors[0].get('last_name')
    return None


def _pick_column(df, preferred, fallback):
    """Return df[preferred] if present, else df[fallback]; raise KeyError if neither exists."""
    if preferred in df:
        return df[preferred]
    if fallback in df:
        return df[fallback]
    raise KeyError(f"DataFrame needs a '{preferred}' or '{fallback}' column")


def create_reference_index(df):
    """Build a `<lastname><year><firsttitleword>` reference key for every row of `df`.

    The last name comes from the `Author` column, or from the first entry of `authors`
    when `Author` is absent. The year comes from `Pub_year`, or from `year`. Rows whose
    last name is exactly 'PMI' use the last title word instead of the first non-stop word.

    Args:
        df: DataFrame with a `title` column plus the author and year columns above.

    Returns:
        A Series of lower-cased reference keys aligned with `df.index`. A row is NaN when
        its last name or selected title word is missing.

    Raises:
        KeyError: if no author column or no year column is present.
    """
    if 'Author' in df:
        last_name = df['Author']
    elif 'authors' in df:
        # Missing or empty author lists become None instead of raising.
        last_name = df['authors'].map(_first_author_last_name)
    else:
        raise KeyError("DataFrame needs an 'Author' or 'authors' column")
    # Keep only the first token of compound last names.
    last_name = convert_non_alphanumeric_to_ascii(last_name.str.split("-| ", regex=True, expand=True)[0])

    title_split = df['title'].str.split(' |-', regex=True)
    first_title = title_split.apply(first_non_stop_word).str.replace(r'\W', '', regex=True)
    last_title = title_split.str[-1].str.replace(r'\W', '', regex=True)

    # Years stored as floats (because of missing values) would render as '2010.0';
    # strip that suffix so keys match those built from integer years.
    pub_year = _pick_column(df, 'Pub_year', 'year').astype(str).str.replace(r'\.0$', '', regex=True)

    references = last_name.str.lower() + pub_year + first_title.str.lower()
    pmi_idx = last_name == 'PMI'
    references.loc[pmi_idx] = last_name.loc[pmi_idx].str.lower() + pub_year.loc[pmi_idx] + last_title.loc[pmi_idx].str.lower()

    return references

def convert_non_alphanumeric_to_ascii(series, custom_mapping = {'ø': 'o','Ø': 'O',}):
    """Transliterate a string Series to ASCII and keep only letters, digits and hyphens.

    Each non-missing value first has the `custom_mapping` replacements applied, is then
    NFKD-normalized and encoded to ASCII with unencodable characters dropped. Finally every
    character outside `A-Za-z0-9-` is removed. Missing values are passed through.
    """
    def _transliterate(text):
        if not pd.notna(text):
            return text
        for source_char, replacement in custom_mapping.items():
            text = text.replace(source_char, replacement)
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    ascii_series = series.apply(_transliterate)
    return ascii_series.str.replace(r'[^A-Za-z0-9-]', '', regex=True)

def get_unique_counter(df: pd.DataFrame, index: Union[str, List[str]], columns: List[str], prefix='-', suffix='', n_digits=2):
    """Number the distinct `columns` combinations within each `index` group.

    Args:
        df: Source frame.
        index: Column name(s) defining the outer groups.
        columns: Column name(s) whose distinct combinations are numbered inside each
            outer group. A single string is accepted.
        prefix, suffix: Text placed before/after the formatted number.
        n_digits: Minimum width of the zero-padded number.

    Returns:
        A Series of formatted ids sorted by index. Rows whose `columns` values are all
        missing (or the string 'NA') get the number 0.
    """
    # A bare string must become a one-element list; list('abc') would split it into letters.
    if isinstance(columns, str):
        columns = [columns]
    elif not isinstance(columns, list):
        columns = list(columns)

    # Number the combinations per outer group, then drop the group-key index level(s)
    # so the result is indexed like `df`.
    counter_series = df.groupby(index)\
        .apply(lambda df_: assign_group_id(df_, columns))\
        .reset_index(level=list(range(len(index))) if isinstance(index, list) else 0, drop=True)

    # Rows with no information in any of `columns` get the sentinel number 0.
    na_ids = df.index[df[columns].replace('NA', None).isna().all(axis=1)]
    counter_series.loc[na_ids] = 0

    counter_ids = counter_series.map(lambda x: f'{prefix}{int(x):0{n_digits}}{suffix}').sort_index()

    assert counter_ids.index.size == df.index.size

    return counter_ids


def assign_group_id(df, group_columns):
    """Return a 1-based group number per row for the combination of `group_columns`.

    Names in `group_columns` that are not columns of `df` are ignored.
    """
    present_columns = []
    for column in group_columns:
        if column in df.columns:
            present_columns.append(column)

    group_numbers = df.groupby(present_columns).ngroup()
    return group_numbers + 1


def check_unique_combinations(df, groupby_column, columns_to_check):
    """Assert that each group holds at most one distinct value per checked column.

    Args:
        df: Frame to validate; rows that are entirely missing are ignored.
        groupby_column: Column name or list of names to group by.
        columns_to_check: Column name or list of names to validate within each group.

    Returns:
        True when every group passes.

    Raises:
        AssertionError: if a group has more than one distinct value in a checked column.
    """
    # Accept a single name for either argument. Previously a string `groupby_column` was
    # split into characters and a string `columns_to_check` crashed on `.all()`.
    group_keys = [groupby_column] if isinstance(groupby_column, str) else [*groupby_column]
    check_columns = [columns_to_check] if isinstance(columns_to_check, str) else [*columns_to_check]

    # The checked columns are added to the grouping keys, as in the original code.
    # NOTE(review): grouping by the checked columns makes each group constant in them, so
    # the assertion below cannot fail — confirm whether grouping by `groupby_column` alone
    # was intended.
    group_keys = group_keys + [col for col in check_columns if col not in group_keys]

    grouped_df = df.dropna(how='all').groupby(group_keys)

    for name, group in grouped_df:
        check_df = group[check_columns]

        assert (check_df.nunique() <= 1).all(), \
            f'{name} has duplicates in {check_df[check_df.duplicated(keep=False)].head(4)}'

    return True

## Load Documents

### Review ITN master list with non-included

In [3]:
# Load the ITN master list (the file that also contains non-included entries), indexed by
# its `reference_key` column, and display it.
itn_master_list_w_non_incl = pd.read_csv('data/external/itn_master_list_with_non_inclusions.csv', index_col='reference_key')
itn_master_list_w_non_incl

In [525]:
# Split the ITN master-list entries by their `Include` flag and by whether their reference
# key also appears (case-insensitively) in the `papers` index.
# NOTE(review): `papers` is created in a later cell of this notebook, so this cell cannot run
# top-to-bottom on a fresh kernel — consider moving it below the cell that builds `papers`.
paper_refs_lower = set(papers.index.str.lower())


def _tally_by_inclusion(include_flag):
    """Bucket master-list rows carrying `include_flag` by presence in `papers`."""
    buckets = {'included': list(), 'not_included': list()}
    flagged = itn_master_list_w_non_incl.loc[
        itn_master_list_w_non_incl['Include'] == include_flag, ['Include', 'Notes']]
    for ref, s in flagged.iterrows():
        paper = s.to_frame().T
        bucket = 'included' if ref.lower() in paper_refs_lower else 'not_included'
        buckets[bucket].append(paper)
    return buckets


n_itn = _tally_by_inclusion('N')
y_itn = _tally_by_inclusion('Y')

In [526]:
# (rows, columns) of `papers`.
papers.shape

In [527]:
def _stack_tally(buckets):
    """Concatenate each non-empty bucket and stack them under their bucket name.

    Returns None when every bucket is empty; `pd.concat` skips None entries.
    """
    stacked = {name: pd.concat(frames) for name, frames in buckets.items() if frames}
    if not stacked:
        return None
    return pd.concat(stacked, names=['master_paper_list', 'reference'])


# One row per master-list entry; `master_paper_list` says whether it is in `papers`.
tally = pd.concat([_stack_tally(n_itn), _stack_tally(y_itn)]).reset_index(0)

# Written to the current user's Desktop instead of one hard-coded user directory.
tally_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'itn_master_list_vs_master_paper_list.csv')
tally.to_csv(tally_path)
tally.shape

### Download mendeley collections

In [362]:
# Map each group name returned by `session.groups.list()` to its group id.
{group.name: group.id for group in session.groups.list().items}

In [439]:
# Download the document metadata of the Mendeley group and index it by reference key.
# `group_id` is defined in the credentials cell (it was previously repeated here as a literal).
group = session.groups.get(group_id)
# NOTE(review): only one page of up to 500 documents is read — confirm the group is smaller.
docs = group.documents.list(page_size=500).items
docs_df = pd.DataFrame([doc.json for doc in docs])

docs_df['source'] = normalize_publisher_names(docs_df['source'])
docs_df['reference'] = create_reference_index(docs_df)
# reindex (rather than [[...]]) so an identifier type carried by no document becomes an
# all-NaN column instead of a KeyError.
identifier_columns = docs_df['identifiers'].apply(pd.Series).reindex(columns=['issn', 'pmid', 'doi'])
docs_df = docs_df.join(identifier_columns)
# A `sort_values(by='pmid')` call whose result was discarded has been removed; row order
# is unchanged. NOTE(review): decide whether duplicates should be resolved in pmid order.
docs_df = docs_df.drop_duplicates(subset=['reference']).set_index('reference')
docs_df['collections'] = None
docs_df

### Load mendeley collection of papers

In [440]:
def convert_obj(x):
    """Parse a non-empty string cell with `ast.literal_eval`; anything else becomes None."""
    return ast.literal_eval(x) if isinstance(x, str) and x else None


converters = {'identifiers': convert_obj, 'authors': convert_obj, 'keywords': convert_obj}

ento_docs_df = pd.read_csv('data/processed/ento_papers.csv',
                           converters=converters)
clinical_docs_df = pd.read_csv('data/processed/clinical_papers.csv',
                           converters=converters)
net_docs_df = pd.read_csv('data/processed/net_papers.csv',
                           converters=converters)
pmi_docs_df = pd.read_csv('data/processed/pmi_papers.csv',
                           converters=converters).fillna({'source': "PMI"})
update_docs_df = pd.read_csv('data/processed/updatemarch2023_papers.csv',
                           converters=converters)

collection_frames = {
    'Entomological Outcomes': ento_docs_df,
    'Human Health Outcomes': clinical_docs_df,
    'Net Outcomes': net_docs_df,
    'PMI Durability Monitoring Report': pmi_docs_df,
    'UpdateMarch2023': update_docs_df,
}


def _with_collection(existing, collection):
    """Return the collection list with `collection` added once."""
    if isinstance(existing, (list, np.ndarray)):
        # Leave the list alone if already tagged, so re-running the cell is harmless.
        return list(existing) if collection in existing else [*existing, collection]
    return [collection]


for collection, df in collection_frames.items():
    # `reference` stays a column on every frame; the counting cells below read it.
    df['reference'] = create_reference_index(df)
    # Only tag references that exist in docs_df: `.loc` raises on unknown labels.
    matched = docs_df.index.intersection(pd.Index(df['reference'].dropna().unique()))
    docs_df.loc[matched, 'collections'] = docs_df.loc[matched, 'collections'].map(
        lambda existing, collection=collection: _with_collection(existing, collection))

In [451]:
# Load the master paper list and give its columns identifier-friendly names.
papers = pd.read_csv('data/external/IDM ITN Recalibration_ Master paper list - Sheet1.csv', dtype={'pmid': str})\
    .rename(columns={
        'First Author': 'Author',
        'Publication year': 'Pub_year',
        'Title': 'title',
        'Reference (new)': 'reference',
        'Needs additional digitization? Y/N': 'Needs_digitization',
        'Measured Outcome': 'Measured_outcome',
        'Approximate Number of Rows to Extract': 'Approx_num_rows',
        'Table Numbers': 'Tables_count',
        'Check out: Initials': 'Check_out_by',
        'Check in: Initials': 'Check_in_by',
        # 'esimated' is kept as spelled because it must match the CSV header.
        'Notes on esimated extraction count': 'Notes_on_approx_num_rows',
        'Notes on extraction': 'Notes_on_extraction',
    })
papers = papers.assign(
    Needs_digitization=papers["Needs_digitization"].apply(normalize_need_digitalization),
)
# Trim surrounding whitespace from every string cell.
papers = papers.map(lambda x: x.strip() if isinstance(x, str) else x)
papers['title'] = papers['title'].str.strip("{}.")
# Generate a reference key only where the sheet has none. An explicit assignment replaces
# the chained `papers['reference'].update(...)`, which does not write back under
# copy-on-write.
papers['reference'] = papers['reference'].fillna(create_reference_index(papers))
papers = papers.set_index('reference')

papers

In [452]:
# Replace the sheet's own bibliographic columns with the ones from `docs_df`, matched on
# the index; papers without a match keep NaN in the joined columns (left join).
# NOTE(review): this overwrites `papers`, so running the cell twice joins `docs_df` again.
papers = papers.drop(columns=['Author', 'Pub_year', 'title', 'Journal'], errors='ignore').join(docs_df, how='left')
papers

### Count subgroups

In [360]:
# Number of master-list papers found in each Mendeley collection.
# `papers` is indexed by reference after the loading cell, so fall back to the index when
# there is no `reference` column.
paper_refs = pd.Index(papers['reference']) if 'reference' in papers.columns else papers.index
[paper_refs.intersection(collection_df['reference']).size
 for collection_df in (ento_docs_df, clinical_docs_df, net_docs_df, pmi_docs_df, update_docs_df)]

In [361]:
# All reference keys present in any Mendeley collection.
allmendeley_papers = pd.Index(ento_docs_df['reference'])\
    .union(clinical_docs_df['reference'])\
    .union(net_docs_df['reference'])\
    .union(pmi_docs_df['reference'])\
    .union(update_docs_df['reference'])
display(allmendeley_papers.size)

# Collection references missing from the master paper list. `papers` is indexed by
# reference after the loading cell, so fall back to the index when there is no column.
paper_refs = pd.Index(papers['reference']) if 'reference' in papers.columns else papers.index
allmendeley_papers.difference(paper_refs).size

In [362]:
# Master-list references that are in none of the Mendeley collections.
# `papers` is indexed by reference after the loading cell, so fall back to the index.
paper_refs = pd.Index(papers['reference']) if 'reference' in papers.columns else papers.index
paper_refs.difference(allmendeley_papers)

In [363]:
# Per collection: number of references that are absent from the master paper list.
# `papers` is indexed by reference after the loading cell, so fall back to the index.
paper_refs = pd.Index(papers['reference']) if 'reference' in papers.columns else papers.index
[pd.Index(collection_df['reference']).difference(paper_refs).size
 for collection_df in (ento_docs_df, clinical_docs_df, net_docs_df, pmi_docs_df, update_docs_df)]

### Create reference index

In [325]:
# Only needed when `docs_df` has no reference key yet (neither as a column nor as index).
if 'reference' not in docs_df.columns and 'reference' not in docs_df.index.names:

    references = create_reference_index(docs_df)
    # Show every document whose generated key collides with another one.
    dup_idx = references[references.duplicated(keep=False) & references.notna()].sort_values().index
    display(docs_df.loc[dup_idx])

    docs_df['reference'] = references
    # Keep the first document of each duplicated key.
    docs_df = docs_df.drop(docs_df['reference'][docs_df['reference'].duplicated()].index)

    # Bare expressions inside an `if` block are never rendered, so display them explicitly.
    display(docs_df['reference'].duplicated().sum())
    display(docs_df['reference'].head())

In [365]:
# doi duplicates: count missing and repeated DOIs, then list the documents sharing a DOI.
dois = docs_df['doi']
doi_summary = {
    'NA': dois.isna().sum(),
    'duplicated': dois.dropna().duplicated().sum(),
}
display(doi_summary)

repeated_doi = dois.notna() & dois.duplicated(keep=False)
dup_idx = dois[repeated_doi].sort_values().index
display(docs_df.loc[dup_idx])

# Empty-string identifiers are treated as missing.
docs_df['identifiers'] = docs_df['identifiers'].replace('', np.nan)

In [366]:
# pmid duplicates: count missing and repeated PMIDs, then list the documents sharing a PMID.
pmids = docs_df['pmid']
pmid_summary = {
    'NA': pmids.isna().sum(),
    'duplicated': pmids.dropna().duplicated().sum(),
}
display(pmid_summary)

repeated_pmid = pmids.notna() & pmids.duplicated(keep=False)
dup_idx = pmids[repeated_pmid].sort_values().index
display(docs_df.loc[dup_idx])

In [367]:
# Number of distinct index values in `docs_df`.
docs_df.index.nunique()

# Generate extraction queue

In [5]:
# Papers that need additional figure digitization, by Mendeley reference key.
addl_fig_dig_papers = [
    'Agossa2014', 'Bayili2017', 'Bayili2019', 'Birhanu2019',
    'Djenontin2010', 'Duchon2009', 'Etang2013', 'Etang2016', 'Kayedi2015',
    'Kolaczinski2000', 'Koudou2011', 'Lindblade2005', 'NGuessan2016',
    'Ngufor2017', 'Norris2011', 'Ohashi2012', 'Okumu2012', 'Randriamaherijaona2017',
    'Riveron2018', 'Sternberg2014', 'Tchakounte2019', 'Tungu2015',
    'Tungu2016', 'Wanjala2015', 'Yewhalaw2012',
]

# NOTE(review): `docs_df` must already carry the 'Mendeley Reference Key' and
# 'Needs_digitization' columns; no visible cell adds them to it — confirm where they come from.
reference_keys = docs_df['Mendeley Reference Key']

# Sanity check: (listed papers found, listed papers in total, listed papers not found).
# This used to be a bare mid-cell expression and was therefore never shown.
display((pd.Index(reference_keys.dropna()).intersection(addl_fig_dig_papers).size,
         len(addl_fig_dig_papers),
         set(addl_fig_dig_papers).difference(reference_keys.dropna())))

# First row flagged as needing digitization; everything from here on is the "later" part.
anchor_index = (docs_df['Needs_digitization']==True).idxmax()
print(anchor_index)
index_values = docs_df.index[reference_keys.isin(addl_fig_dig_papers)]

# Split the DataFrame into the part before the anchor and the part from the anchor onwards
before_anchor = docs_df.loc[:anchor_index].drop(anchor_index)
after_anchor = docs_df.loc[anchor_index:]

# Find rows in index_values that are before the anchor index
rows_to_move = before_anchor.loc[before_anchor.index.isin(index_values)]

# Remove rows from 'before_anchor' if they're in 'rows_to_move'
before_anchor = before_anchor.drop(rows_to_move.index)

# Concatenate the parts; 'rows_to_move' ends up directly before the anchor row
docs_df_rearranged = pd.concat([before_anchor, rows_to_move, after_anchor])
docs_df.shape, docs_df_rearranged.shape, rows_to_move.shape, (docs_df_rearranged['Needs_digitization']==True).idxmax()

In [10]:
# Flag every paper listed in `addl_fig_dig_papers` as needing digitization unless it is
# already flagged, then show the index label of the first flagged row.
docs_df_rearranged.loc[(docs_df_rearranged['Needs_digitization'] != True) & docs_df_rearranged['Mendeley Reference Key'].isin(addl_fig_dig_papers), 'Needs_digitization'] = True
(docs_df_rearranged['Needs_digitization']==True).idxmax()

In [27]:
# Load the cleaned extraction table, indexed by `reference`; the two year columns are read
# with pandas' nullable integer dtype.
extractions = pd.read_csv('data/processed/extractions_preJonny_cleaned.csv', index_col='reference',
                          dtype={'Start_year': pd.Int64Dtype(), 
                                 'End_year': pd.Int64Dtype()})
# Frequency of each `IDM_check1` value.
extractions['IDM_check1'].value_counts()

In [385]:
# Number of distinct references in the extraction table.
extractions.index.nunique()

In [405]:
# First round: papers that already have extractions, need no digitization and have a source.
first_round = docs_df.query('`Mendeley Reference Key` in @extractions.index and '
                            'Needs_digitization != True and source.notnull()')
is_net_outcome = first_round.collections.apply(lambda li: 'Net Outcomes' in li if li else False)
first_round_ento_human = first_round.loc[~is_net_outcome]
first_round_net = first_round.loc[is_net_outcome]
# First-round papers without a source, excluding the additional-digitization list.
first_round_pmi = docs_df.query('`Mendeley Reference Key` in @extractions.index and Needs_digitization != True '
                                'and `Mendeley Reference Key` not in @addl_fig_dig_papers and source.isnull()')
# Second round: papers that need digitization.
# Fixed: `addl_fig_dig_papers` lacked the `@` prefix, so `query` looked it up as a column.
second_round = docs_df.query('`Mendeley Reference Key` in @extractions.index and Needs_digitization == True '
                             'or `Mendeley Reference Key` in @addl_fig_dig_papers')
# Third round: papers without any extraction yet.
third_round = docs_df.query('`Mendeley Reference Key` not in @extractions.index')

# The rounds must partition docs_df.
# NOTE(review): only `first_round_pmi` excludes `addl_fig_dig_papers`, so the rounds can
# overlap — confirm the intended definitions if this assertion fails.
assert first_round_ento_human.shape[0] + first_round_net.shape[0] + first_round_pmi.shape[0] + second_round.shape[0] + third_round.shape[0] == docs_df.shape[0]

In [410]:
# Total number of rows in `docs_df`.
docs_df.shape[0]

In [409]:
# Number of rows in each extraction round.
first_round_ento_human.shape[0], first_round_net.shape[0], first_round_pmi.shape[0], second_round.shape[0], third_round.shape[0]

In [399]:
# Shuffle the papers within each round and stack the rounds in extraction order.
# A fixed seed makes the queue order reproducible across kernel restarts.
SHUFFLE_SEED = 42
docs_df = pd.concat([
    first_round_ento_human.sample(frac=1, random_state=SHUFFLE_SEED),
    first_round_net.sample(frac=1, random_state=SHUFFLE_SEED),
    first_round_pmi.sample(frac=1, random_state=SHUFFLE_SEED),
    second_round.sample(frac=1, random_state=SHUFFLE_SEED),
    third_round.sample(frac=1, random_state=SHUFFLE_SEED)])
docs_df['collections'].head(20)

# Get file manifest

In [456]:
# docs_df = pd.read_parquet('config/extraction_queue_198papers_2024-04-02.parquet')
# docs_df

In [443]:
def get_file_manifest(doc: pd.Series) -> Union[dict, None]:
    """Return the JSON manifest of the first file listed for a document, or None.

    Args:
        doc: Row whose `id` attribute is passed to `session.document_files`.
            NOTE(review): presumably the Mendeley document id — confirm.

    Returns:
        The `json` attribute of the first listed file, or None when no file is listed.
    """
    # Uses the notebook-level `session`.
    files = session.document_files(doc.id).list().items
    if not files:
        return None
    return files[0].__dict__['json']
    
# Fetch the first-file manifest of every paper and flatten the manifests into a table;
# the display keeps only rows without missing values.
file_manifests = papers.apply(get_file_manifest, axis=1)
files_df = pd.json_normalize(file_manifests)
files_df.dropna()

In [454]:
# Attach the file manifest columns to each paper through the document id.
file_columns = files_df.rename(columns={'id': 'file_id', 'document_id': 'id'})
# Drop manifests without a document id and keep one per document: the join matches null
# keys with each other and repeats rows for duplicated keys.
file_columns = file_columns.dropna(subset=['id']).drop_duplicates(subset=['id'])
file_columns = file_columns.set_index('id').drop(columns=['created'])
papers = papers.join(file_columns, on='id')
# Papers for which no file was found.
papers[papers['file_name'].isna()]

### Rename filenames

In [447]:
def normalize_filename(filename):
    """Make `filename` safe to use as a file name.

    Characters that are invalid in file names (backslash, slash, colon, asterisk, question
    mark, double quote, angle brackets, pipe) and a leading dot are replaced by '_', and
    the result is truncated to 255 characters.

    Args:
        filename: File name string. Non-string values (e.g. NaN for papers without a
            file) are returned unchanged instead of raising.

    Returns:
        The sanitized name, or the input itself when it is not a string.
    """
    if not isinstance(filename, str):
        return filename
    filename = re.sub(r'[\\/:*?"<>|]|\A\.', '_', filename)
    filename = filename[:255]

    return filename

In [462]:
# Papers sharing a file name get their reference key as file name instead, so downloads do
# not overwrite each other.
dup_file_names = papers.index[papers['file_name'].duplicated(keep=False) & papers['file_name'].notnull()]
papers.loc[dup_file_names, 'file_name'] = papers.loc[dup_file_names].index.values
# na_action='ignore' keeps papers without a file as NaN instead of passing NaN to the
# sanitizer.
papers['file_name'] = papers['file_name'].map(normalize_filename, na_action='ignore')
dup_file_names.size

In [465]:
# Append '.pdf' to every existing file name that does not already end with it.
file_names = papers['file_name']
lacks_pdf_suffix = ~file_names.fillna('').str.endswith('.pdf')
no_pdf_ext_files = file_names[lacks_pdf_suffix].dropna().index
renamed = papers.loc[no_pdf_ext_files, 'file_name'] + '.pdf'
papers.loc[no_pdf_ext_files, 'file_name'] = renamed
no_pdf_ext_files.size

### Download files

In [468]:
def download_file(filepath, document_id, file_id, access_token, timeout=60):
    """Download a Mendeley file to `filepath` unless it already exists.

    Args:
        filepath: Destination path. Anything but a non-empty string skips the download.
        document_id: Must be truthy; otherwise the download is skipped.
        file_id: Mendeley file id (string) used to build the request URL.
        access_token: Bearer token sent in the Authorization header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        `filepath` when the file already exists or was downloaded; None when the arguments
        are incomplete or the request failed (the error is printed).
    """
    if not (isinstance(filepath, str) and filepath and document_id and isinstance(file_id, str) and access_token):
        return None
    if os.path.exists(filepath):
        return filepath

    headers = {
        'Authorization': f'Bearer {access_token}',
        'Accept': 'application/pdf',
    }
    url = f'https://api.mendeley.com/files/{file_id}'

    try:
        # The request itself is inside the try so connection errors and timeouts are
        # reported like HTTP errors instead of aborting the whole download loop.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # check for request errors
    except requests.RequestException as e:
        print(e)
        return None

    # Create the target directory on first use.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(filepath, 'wb') as file:
        file.write(response.content)

    return filepath

In [469]:
download_dir = 'data/pdf/'


def _download_row(doc):
    """Download the file of one `papers` row; returns the local path or None."""
    file_name = doc['file_name']
    target = os.path.join(download_dir, file_name) if isinstance(file_name, str) else None
    return download_file(target, doc.name, doc['file_id'], session.access_token)


# One download attempt per paper; show the paths that were obtained.
downloaded_filenames = papers.apply(_download_row, axis=1)
downloaded_filenames.dropna()

In [484]:
# Attach the local download paths to `papers` as the `file_path` column.
# NOTE(review): this overwrites `papers`, so running the cell twice joins the column again.
downloaded_filenames.name = "file_path"
papers = papers.join(downloaded_filenames)
papers.head()

# Write files manifest

In [485]:
# Persist the extraction queue; the file name records the paper count and today's date.
n_papers = papers.shape[0]
date_stamp = current_date.strftime("%Y-%m-%d")
queue_path = f'config/extraction_queue_{n_papers}papers_{date_stamp}.parquet'
papers.to_parquet(queue_path)

### Update file_names

In [8]:
def normalize_filename(file_path):
    """Return `file_path` with its file name reduced to ASCII word characters and hyphens.

    The directory and the extension are kept as they are. In the file name, spaces become
    underscores, every character that is not a word character, whitespace or hyphen is
    removed, and remaining non-ASCII characters are dropped. Name plus extension are
    limited to 255 characters.

    NOTE: this definition replaces the `normalize_filename(filename)` defined in an earlier
    cell of this notebook, which sanitizes bare file names with different rules.

    Args:
        file_path: Path string. Non-string values (e.g. NaN for papers without a
            downloaded file) are returned unchanged instead of raising.

    Returns:
        The normalized path, or the input itself when it is not a string.
    """
    if not isinstance(file_path, str):
        return file_path

    # Extract directory path, file name, and extension
    directory, file_name = os.path.split(file_path)
    file_name, file_extension = os.path.splitext(file_name)

    # Replace spaces with underscores and remove special characters
    file_name = re.sub(r'[^\w\s-]', '', file_name.replace(' ', '_'))

    # `\w` also matches non-ASCII letters; drop whatever cannot be encoded as ASCII
    file_name = file_name.encode('ascii', 'ignore').decode()

    # Ensure the file name length does not exceed 255 characters including the extension
    max_length = 255 - len(file_extension)
    file_name = file_name[:max_length]

    # Reconstruct the full file path with the original extension
    return os.path.join(directory, file_name + file_extension)

def normalize_file_paths_in_df(df, column_name='file_path', rename_files=False):
    """Normalize the paths in `df[column_name]` and optionally rename the files on disk.

    Args:
        df: Frame holding the paths; it is modified in place and also returned.
        column_name: Column with the file paths. Non-string entries are left untouched.
        rename_files: When True, each file whose normalized path differs is renamed on
            disk. When False, only the column is rewritten.

    Returns:
        `df`, with `column_name` holding the normalized paths. When a requested rename
        fails or would overwrite another file, that row keeps its old path so the column
        never points at a file that does not exist.
    """
    def _normalize_or_keep(path):
        return normalize_filename(path) if isinstance(path, str) else path

    new_paths = df[column_name].apply(_normalize_or_keep)

    final_paths = []
    for old_path, new_path in zip(df[column_name], new_paths):
        if rename_files and isinstance(old_path, str) and old_path != new_path:
            if not os.path.exists(old_path) and os.path.exists(new_path):
                # Already renamed by an earlier run; just adopt the new path.
                pass
            elif os.path.exists(new_path):
                # os.rename may silently replace an existing file; refuse instead.
                print(f"Error renaming '{old_path}' to '{new_path}': target already exists")
                new_path = old_path
            else:
                try:
                    os.rename(old_path, new_path)
                    print(f"Renamed '{old_path}' to '{new_path}'")
                except OSError as e:
                    print(f"Error renaming '{old_path}' to '{new_path}': {e}")
                    new_path = old_path
        final_paths.append(new_path)

    # Positional assignment, so duplicated index labels cannot misalign the paths.
    df[column_name] = final_paths

    return df

In [13]:
# Reload a previously written extraction queue and normalize its `file_path` values,
# renaming the files on disk as well (rename_files=True).
# NOTE(review): the file name is hard-coded to one specific export.
papers = pd.read_parquet(f'config/extraction_queue_197papers_2024-04-03.parquet')
papers = normalize_file_paths_in_df(papers, column_name='file_path', rename_files=True)

In [20]:
# Write the queue back to the same parquet file it was loaded from.
# NOTE(review): the cell that refreshes `file_name` sits below this one, so a top-to-bottom
# run saves the queue before `file_name` is updated — confirm the intended order.
papers.to_parquet(f'config/extraction_queue_197papers_2024-04-03.parquet')


In [19]:
# Keep `file_name` in sync with the (possibly renamed) `file_path`. Rows without a path
# (NaN) are passed through instead of raising.
papers['file_name'] = papers['file_path'].map(
    lambda path: os.path.basename(path) if isinstance(path, str) else path)

# Draw annotations on PDFs

## Get all annotations

In [52]:
# Collect the JSON of every annotation, following the pagination to the last page, and
# build the frame once (concatenating inside the loop copies all previous rows each time).
annotation_records = []

# Get the first page of annotations
pages = session.annotations.list(page_size=200)

while pages is not None:
    annotation_records.extend(ann.json for ann in pages.items)

    # Move on to the next page
    pages = pages.next_page

anns_df = pd.DataFrame(annotation_records)
anns_df

## Reference document_id to the reference

In [56]:
def get_document_info(doc_id):
    """Fetch one document through `session` and return its JSON as a Series named `doc_id`."""
    fetched = session.documents.get(doc_id)
    return pd.Series(fetched.json, name=doc_id)

In [88]:
# Fetch the metadata of every annotated document once per kernel. The fetch used to be
# commented out, which left `anns_docs` undefined on a fresh kernel.
# NOTE(review): delete `anns_docs` to force a re-fetch after `anns_df` changes.
if 'anns_docs' not in globals():
    fetched_docs = anns_df['document_id'].dropna().drop_duplicates().map(get_document_info)
    anns_docs = pd.DataFrame(fetched_docs.tolist())
anns_docs.index.name = 'document_id'

# A reference key needs authors and a year; cast the year to int before building the key.
anns_docs = anns_docs.dropna(subset=['authors', 'year'])
anns_docs = anns_docs.assign(year=anns_docs['year'].astype(int))
anns_docs['reference'] = create_reference_index(anns_docs)
# Keep the first document of each duplicated reference.
anns_docs = anns_docs.drop(anns_docs.index[anns_docs['reference'].duplicated()])
anns_docs

## Join annotations to reference

In [90]:
# Add the `reference` of each annotation's document, matched on `document_id`.
# NOTE(review): this overwrites `anns_df`, so running the cell twice joins the column again.
anns_df = anns_df.join(anns_docs['reference'], on='document_id')
anns_df.head()

In [160]:
# Inspect the non-null `positions` values: normalize them, take column 0 and normalize again.
# NOTE(review): presumably column 0 holds the first position of each annotation — confirm.
pd.json_normalize(pd.json_normalize(anns_df['positions'].dropna())[0])

### Download annotations

In [228]:
def draw_annotations(pdf_file_path, annotations):
    """Draw the given annotations as highlights into the PDF at `pdf_file_path`.

    The PDF is modified in place through an incremental save.

    Args:
        pdf_file_path: Path of the PDF; nothing happens when it does not exist.
        annotations: Sequence of annotation dicts. Each needs a `positions` list whose
            entries provide `page` (1-based), `top_left` and `bottom_right` (each with `x`
            and `y`). An optional `color` dict holds channel values on a 0-255 scale.

    Returns:
        None.
    """
    if not (os.path.exists(pdf_file_path) and len(annotations)): return

    with fitz.open(pdf_file_path) as pdf_document:
        pages = {}

        for annotation in annotations:
            # Missing colors (e.g. NaN from a DataFrame record) keep the default color.
            color_spec = annotation.get('color')
            color = tuple(v/255 for v in color_spec.values()) if isinstance(color_spec, dict) else None

            for position in annotation['positions']:
                page_number = position['page'] - 1  # Pages are 0-indexed in PyMuPDF
                # Load each page once; `setdefault` would reload it on every call because
                # its default argument is evaluated eagerly.
                if page_number not in pages:
                    pages[page_number] = pdf_document.load_page(page_number)
                page = pages[page_number]

                page_height = page.rect[3]

                # invert the y-coordinates of the highlight rectangle
                rect = fitz.Rect(
                    position['top_left']['x'], page_height - position['bottom_right']['y'],
                    position['bottom_right']['x'], page_height - position['top_left']['y'],
                )

                highlight = page.add_highlight_annot(rect)
                if color is not None:
                    highlight.set_colors({"stroke": color, "fill": color})
                highlight.set_opacity(0.8)  # 80% opaque
                highlight.update()

        output_pdf_path = pdf_file_path #.replace("data/pdf", "data/pdf-annotated")
        pdf_document.save(output_pdf_path, incremental=True)

In [225]:
# Open one annotated PDF and save it.
# NOTE(review): `save()` is called without a target path and the document is never closed
# — confirm this cell still runs; it looks like a leftover manual check.
file = fitz.open('data/pdf-annotated/Corbel_2010_Field_efficacy_of_a_new_mosaic_long.pdf')
file.save()

In [229]:
# Group the positioned annotations by reference once instead of filtering `anns_df` again
# for every document.
positioned_anns = anns_df[anns_df['positions'].notnull()]
annotations_by_ref = {
    ref: group.to_dict(orient='records')
    for ref, group in positioned_anns.groupby('reference')
}
# The reference key may be a column or the index of `docs_df`.
reference_is_column = 'reference' in docs_df.columns

for index, row in tqdm.tqdm(docs_df.iterrows(), total=len(docs_df)):
    ref = row['reference'] if reference_is_column else index
    pdf_file_path = row['file_path']
    if not isinstance(pdf_file_path, str):
        continue

    # Annotations of this document, if any
    doc_annotations = annotations_by_ref.get(ref, [])
    if not len(doc_annotations):
        continue
    draw_annotations(pdf_file_path, doc_annotations)