# Lab Rotation

In [None]:
import os
import re as re
from time import sleep
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests

In [None]:
# Read the GitHub API token from the environment so that no credential is
# stored in the notebook. Export GITHUB_TOKEN before starting Jupyter;
# github_token is None when the variable is not set.
# NOTE(review): the token that used to be hardcoded in this cell should be
# revoked on GitHub, because it remains visible in the notebook's history.
github_token = os.environ.get("GITHUB_TOKEN")

## Data Collection
@Misc{acl-ocl,
    author =       {Shaurya Rohatgi and Yanxia Qin and Benjamin Aw and Niranjana Unnithan and Min-Yen Kan},
    title =        {The ACL OCL Corpus: advancing Open science in Computational Linguistics},
    howpublished = {arXiv},
    year =         {2022},
    url =          {https://huggingface.co/datasets/ACL-OCL/ACL-OCL-Corpus}
}

In [None]:
# Load the publication table from its local parquet snapshot.
df = pd.read_parquet(path='data/acl-publication-info.74k.v2.parquet')

# Preview the first five rows.
df.head(5)

In [None]:
# Rename the bibliographic columns to "paper_"-prefixed names and, in the same
# chain, drop the columns listed below.
df = (
    df
    .rename(columns={
        "acl_id": "paper_ident",          # unique paper identifier
        "url": "paper_url",               # paper online abstract page URL
        "author": "paper_author",         # author list
        "title": "paper_title",           # paper title
        "journal": "paper_venue",         # venue abbreviation
        "year": "paper_year",             # publication year
        "month": "paper_month",           # publication month
        "booktitle": "paper_booktitle",   # BibTeX booktitle field
        "address": "paper_address",       # BibTeX address field
        "publisher": "paper_publisher",   # BibTeX publisher field
        "pages": "paper_pages",           # BibTeX pages
        "full_text": "paper_text",        # text that the later cells search
    })
    .drop(columns=[
        "abstract", "corpus_paper_id", "pdf_hash", "doi",
        "numcitedby", "number", "volume",
        "editor", "isbn", "ENTRYTYPE", "ID", "language", "note",
    ])
)

In [None]:
# Flag rows whose text is unusable: anything that is not a string, or a
# string that is empty once surrounding whitespace is stripped.
df['error_download'] = df['paper_text'].apply(
    lambda text: not (isinstance(text, str) and text.strip())
)

# Tally of usable (False) versus unusable (True) texts.
df['error_download'].value_counts()

# ROUGE Scores
## Reproducibility

### ROUGE Identification

In [None]:
df_rouge_prelim = df.copy()

# Case-insensitive check for the word "rouge" anywhere in the paper text.
# na=False records papers without text as False rather than as a missing
# value: a missing value is truthy in the later `if row['paper_rouge_prelim']`
# checks and would send a non-string text into the regex helpers.
df_rouge_prelim["paper_rouge_prelim"] = df_rouge_prelim["paper_text"].str.contains(
    "rouge", case=False, na=False
)

# Number of papers that do (True) and do not (False) mention ROUGE.
df_rouge_prelim['paper_rouge_prelim'].value_counts()

### Paper Review
#### ROUGE Parameters

In [None]:
def extract_parameters(text):
    """
    Return the first run of command-line style flags found in ``text``.

    A run is two or more consecutive flags. Each flag is a space, a hyphen and
    one character from ``a-z``, ``1``, ``2`` or ``3``, optionally followed by a
    space and an argument of one to four characters from ``a-z``, ``0-9`` or
    ``.`` (for example ``" -n 2 -m -c 95"``).

    Parameters:
    - text: The text to scan. Anything that is not a string (``None``, NaN)
      is treated as containing no flags.

    Returns:
    The matched run, including its leading space, or ``None`` when the text
    contains no such run.
    """
    if not isinstance(text, str):
        return None
    pattern = r"((?: -[a-z123](?: [a-z0-9.]{1,4})?){2,})"
    # Only the first match is needed, so stop scanning as soon as it is found.
    first_hit = re.search(pattern, text)
    return first_hit.group(1) if first_hit else None

In [None]:
# Work on a copy and add the flag string found in each ROUGE-mentioning paper;
# papers that were not flagged get None.
df_rouge_params = df_rouge_prelim.copy().assign(
    paper_rouge_params=lambda frame: frame.apply(
        lambda row: None if not row['paper_rouge_prelim'] else extract_parameters(row['paper_text']),
        axis=1,
    )
)

# Number of papers for which a flag string was found.
df_rouge_params['paper_rouge_params'].notna().sum()

#### ROUGE Protocol

In [None]:
# Patterns for evaluation-protocol details, keyed by the label that is stored
# in the results. They are applied case-insensitively by the search helper.
regex_rouge_protocol = {
    'stemming': r'\b(?:stems?|stemming|stemmer|porter)\b',
    'tokenization': r'\b(?:tokenized?|tokenizer|tokenization|pre-tokenized?|detokenized?)\b',
    'sentence_tokenization': r'sentence split|split sentence|sentence tokeniz|tokenize sentence',
    # Accept "stopwords", "stop words", "stop-words" and "stop -words".
    # The earlier "( -)?" only allowed the last spelling besides "stopwords".
    'stopword_removal': r'\b(?:stop[ -]{0,2}words?)\b',
    'bootstrapping': r'(?:bootstrap|confidence (?:level|interval))'
}

In [None]:
def search_terms_near_rouge(text, regex_dict):
    """
    Find which terms occur close to a mention of "rouge" in ``text``.

    Every case-insensitive occurrence of "rouge" defines a window that reaches
    500 characters before the match and 500 characters after it. A term is
    reported when its pattern matches, case-insensitively, inside at least one
    window.

    Parameters:
    - text: The text to scan. Non-string input (``None``, NaN) yields ``[]``.
    - regex_dict: Mapping from term label to regular-expression pattern.

    Returns:
    The labels of the matching terms, each at most once, in the iteration
    order of ``regex_dict``. The order is therefore identical on every run,
    which is not guaranteed for a list built from a set of strings.
    """
    if not isinstance(text, str):
        return []

    # Cut the windows once instead of re-scanning for "rouge" for every term.
    windows = [
        text[max(0, hit.start() - 500):hit.end() + 500]
        for hit in re.finditer(r'rouge', text, re.IGNORECASE)
    ]

    # any() stops at the first window that contains the term.
    return [
        term
        for term, pattern in regex_dict.items()
        if any(re.search(pattern, window, re.IGNORECASE) for window in windows)
    ]

In [None]:
df_rouge_protocol = df_rouge_params.copy()

# Only papers flagged as mentioning ROUGE are scanned; index alignment on
# assignment leaves the new column missing for every other row.
df_rouge_protocol['paper_rouge_protocol'] = (
    df_rouge_protocol
    .loc[df_rouge_protocol['paper_rouge_prelim'].eq(True), 'paper_text']
    .apply(lambda paper_text: search_terms_near_rouge(paper_text, regex_rouge_protocol))
)

# Twenty most frequent combinations of detected protocol terms.
df_rouge_protocol["paper_rouge_protocol"].value_counts().head(20)

#### ROUGE Variants

In [None]:
# Patterns for the score variant a paper reports, keyed by the label that is
# stored in the results.
regex_rouge_variants = {
    # the whole word "precision"
    "precision": r"\b(?:precision)\b",
    # the whole word "recall"
    "recall": r"\b(?:recall)\b",
    # "f-score(s)", "f1 measure(s)" and similar, or a free-standing " f1" / " f-1"
    "f-score": r"(?:\b(?:f1?[- ]scores?|f1?[- ]measures?)\b)| f-?1[^a-z0-9]",
}

In [None]:
df_rouge_variants=df_rouge_protocol.copy()
df_rouge_variants['paper_rouge_variants'] = df_rouge_variants[df_rouge_variants['paper_rouge_prelim'] == True]['paper_text'].apply(lambda x: search_terms_near_rouge(x, regex_rouge_variants))

df_rouge_variants['paper_rouge_variants'].value_counts()

#### ROUGE packages

In [None]:
def search_for_regex_pattern(text, regex_dict):
    """
    Report which patterns of ``regex_dict`` match anywhere in ``text``.

    Matching is case-insensitive and covers the whole text.

    Parameters:
    - text: The text to scan. Non-string input (``None``, NaN) yields ``[]``.
    - regex_dict: Mapping from label to regular-expression pattern.

    Returns:
    The labels whose pattern matched, in the iteration order of
    ``regex_dict``. Dictionary keys are already unique, so no de-duplication
    is needed, and the order is identical on every run (a list built from a
    set of strings has no such guarantee).
    """
    if not isinstance(text, str):
        return []
    return [
        term
        for term, pattern in regex_dict.items()
        if re.search(pattern, text, re.IGNORECASE)
    ]

In [None]:
# Patterns that identify a ROUGE implementation mentioned in a paper, keyed by
# the label that is stored in the results. They are applied case-insensitively.
regex_rouge_packages = {
    'DD/sacrerouge': r'sacrerouge',
    'ND/easyrouge': r'easy.rouge|neural.{0,3}dialogue.{0,3}metrics',
    'CW/sumeval': r'chakki.{0,3}works|sumeval',
    # NOTE(review): this label looks like a typo for "pyrougezh"; it is left
    # unchanged because the label itself is written into the exported data.
    'JG/pyrouegzh': r'py_rouge_zh',
    'AR/gingo': r'asahi-research.{0,5}Gingo',
    'DF/gerouge': r'gerouge',
    'GL/seq2seq': r'seq2seq.{0,5}metrics.{0,5}rouge',
    'GL/rougescore': r'rouge-score|google.research.{0,50}rouge',
    'PT/files2rouge': r'files?2rouge',
    'PC/pyrouge': r'pcyin',
    'KZ/rougepapier': r'rouge.papier',
    'DI/pyrouge': r'py-rouge|diego999',
    'PT/pyrouge': r'pltrdy.{0,5}pyrouge',
    'PT/rouge': r'pltrdy[^p]{0,5}rouge|pypi.{0,5}project.{0,5}rouge',
    'AJ/pyrouge': r'andersjo',
    'BZ/pyrouge': r'bheinzerling|pypi.{0,5}project.{0,5}pyrouge|pypi.{0,5}pyrouge',
    'TG/pythonrouge': r'tagucci|pythonrouge',
    'KG/rouge2': r'kavgan|rxnlp|rouge.2\.0|jrouge|java rouge|kavita.ganesan.com',
    # "\btylin\b" needs single backslashes in a raw string; the doubled form
    # matched literal backslash characters and therefore never found "tylin".
    'MS/rouge': r'nlg-eval|e2e-metrics|qgevalcap|nmtpytorch|pycocoevalcap|\btylin\b|coco-caption',
    'github rouge': r'github.com.{0,50}rouge',
    'unknown pyrouge': r'pyrouge',
    'ROUGE-1.5.5': r'official rouge|rouge toolkit|rouge-?1\.?5\.?5|rouge.{0,15}1.?5.?5.?|rougeeval|berouge\..{0,2}com|cly/.{0,2}rouge|isi\.edu/.{0,2}rouge|isi\.edu/.{0,2}licensed-sw/.{0,2}see/.{0,2}rouge'
}

In [None]:
df_rouge_packages = df_rouge_variants.copy()

# Look for known ROUGE packages in the full text of every flagged paper;
# rows that were not flagged stay missing in the new column.
df_rouge_packages['paper_rouge_packages'] = (
    df_rouge_packages
    .loc[df_rouge_packages['paper_rouge_prelim'].eq(True), 'paper_text']
    .apply(lambda paper_text: search_for_regex_pattern(paper_text, regex_rouge_packages))
)

# Twenty most frequent combinations of detected packages.
df_rouge_packages["paper_rouge_packages"].value_counts().head(20)

In [None]:
# Checkpoint the paper-review results so the code review can start from disk.
df_rouge_packages.to_pickle(path="rouge_paper_review.pkl")

### Code Review
#### URL of code repository cited in paper

In [None]:
# Reload the paper-review checkpoint written earlier in this notebook.
df_rouge_packages = pd.read_pickle(filepath_or_buffer="rouge_paper_review.pkl")

In [None]:
# Independent copy so that the URL columns do not alter df_rouge_packages.
df_rouge_url = df_rouge_packages.copy(deep=True)

# Regex for links to the supported code-hosting sites. A URL runs up to the
# next whitespace or ")" and the trailing lookbehind trims sentence
# punctuation (. , ; :) that directly follows a URL in running text, so that
# "…/user/repo," is not extracted with the comma attached.
regex_codebases = r'https?://(?:www\.)?(?:github\.com|gitlab\.com|bitbucket\.org|sourceforge\.net|google\.code|code\.google)[^\s)]*(?<![.,;:])'


def extract_codebases(text):
    """
    Extract every code-hosting URL from a text.

    Parameters:
    - text: The text to scan. Non-string input (``None``, NaN) yields ``[]``.

    Returns:
    A list with all matches of ``regex_codebases`` in order of appearance;
    a URL that occurs several times is listed several times.
    """
    if not isinstance(text, str):
        return []
    return re.findall(regex_codebases, text)

# Collect the code-hosting URLs of every flagged paper that has text; all
# other rows receive an empty list.
df_rouge_url["code_rouge_url"] = df_rouge_url.apply(
    lambda row: (
        []
        if not row['paper_rouge_prelim'] or pd.isnull(row['paper_text'])
        else extract_codebases(row['paper_text'])
    ),
    axis=1,
)

#### Does the code mention ROUGE?

In [None]:
def extract_github_repo_names(urls):
    """
    Reduce GitHub URLs to the first two segments of their path.

    URLs whose host does not contain "github.com" are skipped. For the
    remaining ones the path is stripped of leading and trailing slashes and
    its first two segments are joined with "/" (for example "owner/repo").
    A path with fewer segments yields a correspondingly shorter string.

    Parameters:
    - urls: Iterable of URL strings.

    Returns:
    A list with one entry per GitHub URL, in input order.
    """
    repo_names = []
    for url in urls:
        parsed = urlparse(url)
        if "github.com" not in parsed.netloc:
            continue
        segments = parsed.path.strip("/").split("/")
        repo_names.append("/".join(segments[:2]))
    return repo_names

# Derive the GitHub repository names from the collected URLs of every flagged
# paper; rows that were not flagged receive an empty list.
df_rouge_url['code_rouge_github'] = df_rouge_url.apply(
    lambda row: [] if not row['paper_rouge_prelim'] else extract_github_repo_names(row['code_rouge_url']),
    axis=1,
)

## Reproducibility

In [None]:
# Start from a copy of the URL table and add a 'reproducible' column that is
# pd.NA everywhere, meaning that no verdict has been reached yet.
df_rouge_reproducible = df_rouge_url.copy().assign(reproducible=pd.NA)

In [None]:
# R1: a paper counts as reproducible when both the package column and the
# parameter column are non-null. The parameter column is named
# 'paper_rouge_params', as created in the "ROUGE Parameters" section.
# NOTE(review): notna() is also True for an empty package list — confirm that
# papers in which no package was detected are meant to pass R1.
condition_r1 = (
    df_rouge_reproducible['paper_rouge_packages'].notna()
    & df_rouge_reproducible['paper_rouge_params'].notna()
)
df_rouge_reproducible.loc[condition_r1, 'reproducible'] = True

df_rouge_reproducible['reproducible'].value_counts()

In [None]:
# R2: packages that need no configuration make a paper reproducible.
# NOTE(review): the labels below look like METEOR package names and none of
# them is a key of regex_rouge_packages, so this rule presumably never fires
# for ROUGE — confirm the intended list of ROUGE packages.
no_config_packages = ['Meteor_coco', 'pymeteor', 'nlgeval_meteor', 'nltk_meteor']

def check_reproducibility(row):
    """
    Apply rule R2 to one row and return its 'reproducible' value.

    The existing value is returned unchanged unless it is False or missing.
    In that case the row becomes True when 'paper_rouge_packages' is a
    non-empty list containing at least one entry of ``no_config_packages``.

    Parameters:
    - row: Mapping with the keys 'reproducible' and 'paper_rouge_packages'.

    Returns:
    True when R2 applies, otherwise the row's current 'reproducible' value.
    """
    verdict = row['reproducible']

    # Rows that already carry a positive verdict are left alone.
    undecided = verdict is False or pd.isna(verdict)
    if not undecided:
        return verdict

    packages = row['paper_rouge_packages']
    if not isinstance(packages, list) or not packages:
        return verdict

    for pkg in packages:
        if pkg in no_config_packages:
            return True
    return verdict

# Re-evaluate every row with rule R2 and store the verdicts.
df_rouge_reproducible['reproducible'] = df_rouge_reproducible.apply(
    check_reproducibility,
    axis="columns",
)

# Distribution of the verdicts after R2.
df_rouge_reproducible['reproducible'].value_counts()

In [None]:
# Persist the table including the reproducibility verdicts.
# NOTE(review): this writes to the same file that the "Code Review" section
# reloads as df_rouge_packages, so a later run starting from that read_pickle
# cell loads this extended table instead of the paper-review checkpoint.
df_rouge_reproducible.to_pickle(path="rouge_paper_review.pkl")

# Export

In [None]:
def save_dataset(df: pd.DataFrame, filepath: str = "rouge_papers.jsonl.gz") -> None:
    """
    Write a DataFrame as gzip-compressed JSON Lines, one record per line.

    Failures are not raised: any exception is caught and reported with a
    printed message, and a confirmation is printed on success.

    Parameters:
    - df: The DataFrame to save.
    - filepath: Destination path of the .jsonl.gz file.
    """
    try:
        df.to_json(filepath, orient="records", lines=True, compression="gzip")
    except Exception as err:
        print(f"Could not save dataset: {err}")
    else:
        print(f"Dataset successfully saved to {filepath}")

# Export the final table as gzip-compressed JSON Lines.
save_dataset(df=df_rouge_reproducible, filepath="rouge_papers.jsonl.gz")