# Processing the output of viriation

The output of the Viriation program is processed through the following steps:
1. Reading in the annotations of the mutations -> verifying/pushing changes to our database
2. Reading in user feedback for text chunk data and literature level data -> fine-tuning BERT and LightGBM models
3. Saving intermediary states including a) papers that have been screened through the classifier b) papers that have been annotated already c) user feedback from the annotation front-end

In [1]:
import pandas as pd
from pathlib import Path
import ast
import pickle

In [1]:
from datetime import datetime
from intervaltree import Interval, IntervalTree


class History:
    def __init__(self):
        self.dates = IntervalTree() # Interval search tree with dates that have been screened -> (start date, end date)
        self.papers = {
            'relevant' : set(), # papers that passed screening
            'irrelevant' : set() # papers that were screened out + papers users said were irrelevant
        }


    def checkDateRange(self, date_range):
        """ 
        Determines whether the given date range overlaps with any intervals in the cache of previous scraped dates
        
        Parameters: 
        date_range (tuple): Date range with start date and end date

        Returns:
        bool: Whether the given date range overlaps with any previous date ranges
        """

        start_dt, end_dt = date_range
        start_dt = datetime.strptime(start_dt, '%Y-%m-%d')
        end_dt = datetime.strptime(end_dt, '%Y-%m-%d')
        
        # Convert to timestamp (float) since intervaltree works on numeric values
        start_ts = start_dt.timestamp()
        end_ts = end_dt.timestamp()

        # Query for overlapping intervals in the given range
        overlapping_intervals = self.dates.overlap(start_ts, end_ts)

        return bool(overlapping_intervals)


    def addDateRange(self, date_range):
        """
        Adds new date range into the cache of already scraped dates

        Parameters:
        date_range (tuple): Date range with start date and end date
        """
        start_dt, end_dt = date_range
        start_dt = datetime.strptime(start_dt, '%Y-%m-%d')
        end_dt = end_dt = datetime.strptime(end_dt, '%Y-%m-%d')
        
        start_ts = start_dt.timestamp()
        end_ts = end_dt.timestamp()

        self.dates[start_ts:end_ts] = (start_dt, end_dt) # Add date range
    

    def getNonOverlap(self, date_range):
        """ 
        Returns all dates within the given date range that are not present in the cache of previous scraped dates
        
        Parameters: 
        date_range (tuple): Date range with start date and end date

        Returns:
        list: list of tuples consisting of date ranges that have not been scraped yet
        """
        start_dt, end_dt = date_range
        start_dt = datetime.strptime(start_dt, '%Y-%m-%d')
        end_dt = datetime.strptime(end_dt, '%Y-%m-%d')
        
        # Convert to timestamp (float) since intervaltree works on numeric values
        start_ts = start_dt.timestamp()
        end_ts = end_dt.timestamp()

        # Query for overlapping intervals in the given range
        overlapping_intervals = self.dates.overlap(start_ts, end_ts)

        if not overlapping_intervals:
            return [(start_dt, end_dt)]
        
        overlapping_intervals = sorted(overlapping_intervals)
        non_overlapping_ranges = []
        current_start = start_ts

        # Iterate over each overlapping interval and calculate gaps
        for interval in overlapping_intervals:
            if current_start < interval.begin:
                # There is a gap between the current start and the beginning of this interval
                non_overlapping_ranges.append((current_start, interval.begin))
            # Update current start to the end of the current interval
            current_start = max(current_start, interval.end)
        
        # Check if there's a gap after the last interval
        if current_start < end_ts:
            non_overlapping_ranges.append((current_start, end_ts))
        
        # Convert timestamps back to datetime
        non_overlapping_ranges_dt = [
            (datetime.fromtimestamp(start), datetime.fromtimestamp(end))
            for start, end in non_overlapping_ranges
        ]

        return non_overlapping_ranges_dt


    def updateTree(self):
        """ 
        Merges all overlapping date ranges within the current cache of scraped dates
        """

        self.dates.merge_overlaps() # merge together overlapping intervals
        

    def addPaper(self, paper, relevance):
        """ 
        Updates history of relevant and irrelevant papers that have been processed through the viriation program thus far
        
        Parameters: 
        relevant_papers (str): DOI of paper
        relevance (bool): Whether or not the paper is relevant
        """

        if relevance:
            self.papers['relevant'].add(paper)
        
        else:
            self.papers['irrelevant'].add(paper)
        
        

    def checkPaper(self, paper):
        """ 
        Checks whether or not a specific paper has been processed by our program before
        
        Parameters: 
        paper (str): DOI of paper

        Returns:
        bool: Whether the paper has been processed by our program before
        """

        return paper in self.papers['relevant'] or paper in self.papers['irrelevant']

In [2]:
import pickle
import dill
# Create intermediaries

# Hashtable for managing scraping history
# scraped_papers = {
#     'relevant': set(), # papers that passed screening
#     'irrelevant': set(), # papers that were screened out + papers users said were irrelevant
#     'dates': [] # Dates that have been screened -> (start date, end date)
# }

# scraped_papers.append(('2000-01-01', '2022-09-31'))

h = History()
h.addDateRange(('2000-01-01','2022-01-01'))

with open('../data/database/history.pkl', 'wb') as f:
    dill.dump(h, f)

# Hashtable for managing retrain data in the self-train feature 
retrain_data = {
    'relevant papers': set(), # Positive examples BERT
    'irrelevant papers': set(), # Negative examples BERT
    'relevant text': set(), # Positive examples LightGBM
    'irrelevant text': set() # Negative examples LightGBM
}

with open('../data/database/self_train.pkl', 'wb') as f:
    pickle.dump(retrain_data, f)

In [14]:
h = History()
h.addDateRange(('2000-01-01','2022-01-01'))
h.addDateRange(("2023-03-10", "2023-04-12"))
h.addDateRange(("2023-03-24", "2024-09-20"))

# Print before merging
print("Before merge:")
for interval in h.dates:
    print(f"Start: {datetime.fromtimestamp(interval.begin)}, End: {datetime.fromtimestamp(interval.end)}")

h.updateTree()

# Print after merging
print("After merge:")
for interval in h.dates:
    print(f"Start: {datetime.fromtimestamp(interval.begin)}, End: {datetime.fromtimestamp(interval.end)}")

Before merge:
Start: 2023-03-10 00:00:00, End: 2023-04-12 00:00:00
Start: 2023-03-24 00:00:00, End: 2024-09-20 00:00:00
Start: 2000-01-01 00:00:00, End: 2022-01-01 00:00:00
After merge:
Start: 2023-03-10 00:00:00, End: 2024-09-20 00:00:00
Start: 2000-01-01 00:00:00, End: 2022-01-01 00:00:00


In [12]:
h.addDateRange(("2023-03-10", "2023-04-12"))
h.addDateRange(("2023-03-24", "2024-09-20"))
print(h.dates)
h.updateTree()
intervals = list(h.dates)
print(intervals)

IntervalTree([Interval(946710000.0, 1641020400.0, (datetime.datetime(2000, 1, 1, 0, 0), datetime.datetime(2022, 1, 1, 0, 0))), Interval(1678431600.0, 1681279200.0, (datetime.datetime(2023, 3, 10, 0, 0), datetime.datetime(2023, 4, 12, 0, 0))), Interval(1678431600.0, 1726812000.0), Interval(1679637600.0, 1726812000.0, (datetime.datetime(2023, 3, 24, 0, 0), datetime.datetime(2024, 9, 20, 0, 0)))])
[Interval(1678431600.0, 1726812000.0), Interval(946710000.0, 1641020400.0, (datetime.datetime(2000, 1, 1, 0, 0), datetime.datetime(2022, 1, 1, 0, 0)))]


In [7]:
# STEP 1: Reading data
files = Path('../../data/database/annotations/').glob('*/*')

annotations_data = []

for file in files:
    with open(file, 'r') as f:
        # Read each line (which represents a list in string format)
        for line in f:
            # Convert the string representation of a list to a Python list
            record = ast.literal_eval(line.strip())  # Parse the list
            annotations_data.append(record)  # Add it to our data list

# Convert the list of lists to a DataFrame
# Assuming the data has these columns based on your example: ['Mutation', 'DOI', 'Unknown', 'Attributes', 'Text']
annotations_df = pd.DataFrame(annotations_data, columns=['Mutation', 'DOI', 'Location', 'Effect', 'Text'])

# Display the DataFrame
print(annotations_df.head())

  Mutation                        DOI Location       Effect  \
0    H655Y  10.1101/2023.04.17.536926     None  [Homoplasy]   
1    N679K  10.1101/2023.04.17.536926     None           []   
2    P681H  10.1101/2023.04.17.536926     None           []   

                                                Text  
0  [While most studies focus on receptor binding ...  
1  [While most studies focus on receptor binding ...  
2  [While most studies focus on receptor binding ...  


In [None]:
# STEP 1: Formating data


In [None]:
# STEP 1: Updating data


In [12]:
# STEP 2: Paper level feedback


with open('../../data/database/self-train/irrelevant_papers.pkl', 'rb') as f:
    papers = pickle.load(f)

# Create DataFrame with three columns
irrelevant_df = pd.DataFrame(
    list(papers.items()),  # Convert dictionary to list of tuples
    columns=['DOI', 'Classification']  # Specify column names
)

# Display the DataFrame
print(irrelevant_df)

{'10.1101_2023.07.02.547076': 'irrelevant', '10.1101_2023.04.17.536926': 'relevant'}


In [13]:
# STEP 2: Chunk level feedback
files = Path('../../data/database/self-train/').glob('*.txt')

chunks_data = [] # Negative examples

for file in files:
    with open(file, 'r') as f:
        # Read each line (which represents a list in string format)
        for line in f:
            # record = ast.literal_eval(line.strip())  # Parse the list
            # chunks_data.append(record)  # Add it to our data list
            chunks_data.append([line, "irrelevant"])  # Add it to our data list

chunks_df = pd.DataFrame(chunks_data, columns=["Text", "Classification"])