# Lab Rotation

In [None]:
import os
import re as re
from time import sleep
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests

In [None]:
# The GitHub API token is read from the GITHUB_TOKEN environment variable so that
# no credential is ever stored in the notebook or its version history.
# SECURITY: a personal access token used to be hardcoded in this cell; it must be
# treated as leaked and revoked in the GitHub account settings.
# An empty string means "no token configured" (unauthenticated requests).
github_token = os.environ.get("GITHUB_TOKEN", "")

## Data Collection
@Misc{acl-ocl,
    author =       {Shaurya Rohatgi and Yanxia Qin and Benjamin Aw and Niranjana Unnithan and Min-Yen Kan},
    title =        {The ACL OCL Corpus: advancing Open science in Computational Linguistics},
    howpublished = {arXiv},
    year =         {2022},
    url =          {https://huggingface.co/datasets/ACL-OCL/ACL-OCL-Corpus}
}

In [None]:
# Load the ACL OCL publication table from the local parquet snapshot and
# preview the first rows.
acl_parquet_path = 'data/acl-publication-info.74k.v2.parquet'
df = pd.read_parquet(acl_parquet_path)
df.head()

In [None]:
# Give every bibliographic column a "paper_" prefix.
paper_column_names = {
    "acl_id": "paper_ident",           # unique paper identifier
    "url": "paper_url",                # paper online abstract page URL
    "author": "paper_author",          # author list
    "title": "paper_title",            # paper title
    "journal": "paper_venue",          # venue abbreviation
    "year": "paper_year",              # publication year
    "month": "paper_month",            # publication month
    "booktitle": "paper_booktitle",    # BibTeX booktitle field
    "address": "paper_address",        # BibTeX address field
    "publisher": "paper_publisher",    # BibTeX publisher field
    "pages": "paper_pages",            # BibTeX pages
    "full_text": "paper_text",         # extracted full text of the paper
}

# Columns that the rest of the notebook never reads.
unused_columns = [
    "abstract", "corpus_paper_id", "pdf_hash", "doi",
    "numcitedby", "number", "volume",
    "editor", "isbn", "ENTRYTYPE", "ID", "language", "note",
]

df = df.rename(columns=paper_column_names)
df = df.drop(columns=unused_columns)

In [None]:
def _is_missing_text(value):
    """Return True when the value is not a string or is blank after stripping."""
    if not isinstance(value, str):
        return True
    return not value.strip()


# Flag rows whose full text is absent or whitespace-only.
df['error_download'] = df['paper_text'].apply(_is_missing_text)

df['error_download'].value_counts()

# BLEU
## Reproducibility
### BLEU identification

In [None]:
# Preliminary BLEU flag: True when the paper text mentions "bleu" in any casing.
# na=False makes rows without usable text count as False instead of NaN; NaN is
# truthy in the row-wise `if row['paper_bleu_prelim']` checks further down and
# would send a non-string text into the regex helpers.
df_bleu_prelim = df.copy()
df_bleu_prelim["paper_bleu_prelim"] = df_bleu_prelim["paper_text"].str.contains(
    "bleu", case=False, na=False
)

df_bleu_prelim['paper_bleu_prelim'].value_counts()

### Paper Review
#### BLEU Parameters

In [None]:
def extract_parameters(text):
    """
    Return the first run of command-line style flags found in a text.

    A run is two or more consecutive " -x" flags (x is a lowercase letter or
    one of the digits 1, 2, 3), each optionally followed by a space and a short
    value of one to four characters from [a-z0-9.].

    Parameters:
    - text: The text to scan. Anything that is not a string yields None.

    Returns:
    - The matched flag run, including its leading space, or None if no run is found.
    """
    # Missing texts (None / NaN) have no parameters; do not let re raise on them.
    if not isinstance(text, str):
        return None
    pattern = r"((?: -[a-z123](?: [a-z0-9.]{1,4})?){2,})"
    # Only the first run is needed, so stop at the first match.
    first_hit = re.search(pattern, text)
    return first_hit.group(1) if first_hit else None

In [None]:
def _bleu_params_for_row(row):
    """Return the extracted flag run for rows flagged as BLEU papers, else None."""
    if row['paper_bleu_prelim']:
        return extract_parameters(row['paper_text'])
    return None


# Look for command-line style parameters only in papers that mention BLEU.
df_bleu_params = df_bleu_prelim.copy()
df_bleu_params["paper_bleu_params"] = df_bleu_params.apply(_bleu_params_for_row, axis=1)

df_bleu_params['paper_bleu_params'].notna().sum()

#### BLEU Protocol

In [None]:
# Protocol aspect -> regular expression that signals the aspect is discussed.
# Insertion order is the order in which the aspects are checked.
bleu_regex_protocol = dict(
    # "n-gram", "ngram", "n-grams", "ngrams"
    ngrams=r"\bn-?grams?\b",
    # "n-gram precision" / "ngram precision"
    precision=r"\bn-?gram\sprecision\b",
    clipping=r"\bclipping\b",
    # spelled out or abbreviated as a standalone "BP"
    brevity_penalty=r"\bbrevity\spenalty\b|\bBP\b",
    weights=r"\bweighting\sof\sn-?grams\b",
    smoothing=r"\bsmoothing\b",
    # any of the tokenization-related word forms
    tokenization=r'\b(?:tokenized?|tokenizer|tokenization|pre-tokenized?|detokenized?)\b',
    # statements about case handling
    case_normalization=r'\b(?:case normalization|lowercased|case-insensitive|case sensitive)\b',
)

In [None]:
def search_terms_near_bleu(text, regex_dict, window=500):
    """
    Find which terms occur close to a mention of "bleu" in a text.

    Every case-insensitive occurrence of "bleu" defines a window reaching
    `window` characters before and after it. A term is reported when its
    pattern matches (case-insensitively) inside at least one such window.

    Parameters:
    - text: The text to scan. Anything that is not a string yields [].
    - regex_dict: Mapping of term name -> regular expression.
    - window: Number of characters kept on each side of a "bleu" mention.

    Returns:
    - List of unique term names, in the iteration order of regex_dict.
      (A set would give an order that changes with the interpreter's hash
      seed, so identical papers could produce differently ordered lists.)
    """
    if not isinstance(text, str):
        return []

    # Locate the windows once instead of re-scanning the text for every term.
    # Slicing clamps the upper bound, so only the lower bound needs max().
    spans = [
        (max(0, hit.start() - window), hit.end() + window)
        for hit in re.finditer(r'bleu', text, re.IGNORECASE)
    ]

    found_terms = []
    for term, pattern in regex_dict.items():
        compiled = re.compile(pattern, re.IGNORECASE)
        # any() stops at the first window that contains the term.
        if any(compiled.search(text[lo:hi]) for lo, hi in spans):
            found_terms.append(term)
    return found_terms

In [None]:
# Detect protocol terms near BLEU mentions, for BLEU papers only; all other
# rows are left without a value by index alignment.
df_bleu_protocol = df_bleu_params.copy()
protocol_rows = df_bleu_protocol['paper_bleu_prelim'] == True
df_bleu_protocol['paper_bleu_protocol'] = (
    df_bleu_protocol.loc[protocol_rows, 'paper_text']
    .apply(lambda text: search_terms_near_bleu(text, bleu_regex_protocol))
)

df_bleu_protocol["paper_bleu_protocol"].value_counts().head(20)

#### BLEU Variants

In [None]:
# BLEU variant component -> regular expression that signals it is discussed.
bleu_regex_variants = {
    # generic or explicit 1- to 4-gram precision
    'n_gram_precision': r'\b(?:n-?gram precision|1-gram precision|2-gram precision|3-gram precision|4-gram precision)\b',
    # "BP" needs a word boundary on both sides; without the leading one the
    # pattern also matches the tail of longer tokens such as "RBP".
    'brevity_penalty': r'\bbrevity penalty\b|\bBP\b',
}

In [None]:
# Detect variant terms near BLEU mentions, for BLEU papers only; all other
# rows are left without a value by index alignment.
df_bleu_variants = df_bleu_protocol.copy()
variant_rows = df_bleu_variants['paper_bleu_prelim'] == True
df_bleu_variants['paper_variants'] = (
    df_bleu_variants.loc[variant_rows, 'paper_text']
    .apply(lambda text: search_terms_near_bleu(text, bleu_regex_variants))
)

df_bleu_variants['paper_variants'].value_counts()

#### BLEU Packages

In [None]:
def search_for_regex_pattern(text, regex_dict):
    """
    Report which patterns of a dictionary match anywhere in a text.

    Parameters:
    - text: The text to scan. Anything that is not a string yields [].
    - regex_dict: Mapping of term name -> regular expression, matched
      case-insensitively against the whole text.

    Returns:
    - List of the term names whose pattern matched, in the iteration order of
      regex_dict. Dictionary keys are already unique, so no de-duplication is
      needed, and keeping dictionary order makes the result identical on
      every run.
    """
    if not isinstance(text, str):
        return []
    return [
        term
        for term, pattern in regex_dict.items()
        if re.search(pattern, text, re.IGNORECASE)
    ]

In [None]:
# BLEU implementation / package name -> regular expression that identifies it.
# Patterns are applied case-insensitively to the whole paper text.
# NOTE(review): the "a.*?b" patterns match whenever both words occur on the same
# line, however far apart; texts stored as one long line will over-match.
regex_bleu_versions = {
    'BLEU_original': r'\bBLEU\b.*?Papineni.*?ACL.*?2002',
    'multi_bleu': r'multi-?bleu|multi_bleu',
    'sacreBLEU': r'sacrebleu',
    'nltk_bleu': r'nltk.*?bleu',
    # The NIST scripts are named "mteval-v13a.pl" / "mteval-v14.pl"; accept the
    # hyphenated spelling as well as the underscore one.
    'mteval_v13a': r'mteval[-_]v?13a',
    'mteval_v14': r'mteval[-_]v?14',
    'BLEU_moses': r'bleu.*?moses',
    'BLEU_nematus': r'bleu.*?nematus',
    'BLEU_coco': r'bleu.*?coco',
    'BLEU_pytorch': r'bleu.*?pytorch',
    'BLEU_tensorflow': r'bleu.*?tensorflow',
    'BLEU_fairseq': r'fairseq.*?bleu',
    'BLEU_sacremoses': r'sacremoses.*?bleu',
    'nematus_bleu': r'nematus.*?bleu',
    # Underscore spellings ("subword_nmt", "sentence_bleu", "corpus_bleu") are
    # accepted too, as is already done for multi_bleu.
    'subword_nmt_bleu': r'subword[-_]?nmt.*?bleu',
    'sentence_bleu': r'sentence[-_]?bleu',
    'corpus_bleu': r'corpus[-_]?bleu',
    'smoothing_bleu': r'smoothing.*?bleu',
    "coco": r'coco.*?bleu',
    "pybleu": r'pybleu|py-bleu',
    "google_bleu": r'google.*?bleu',
    "yisi_bleu": r'yisi.*?bleu',
    "bertscore_bleu": r'bertscore.*?bleu',
}

In [None]:
# Detect BLEU implementations named anywhere in the text, for BLEU papers only;
# all other rows are left without a value by index alignment.
df_bleu_packages = df_bleu_variants.copy()
package_rows = df_bleu_packages['paper_bleu_prelim'] == True
df_bleu_packages['paper_bleu_packages'] = (
    df_bleu_packages.loc[package_rows, 'paper_text']
    .apply(lambda text: search_for_regex_pattern(text, regex_bleu_versions))
)

df_bleu_packages["paper_bleu_packages"].value_counts().head(20)

## Code Review
#### URL of code repository cited in paper

In [None]:
df_bleu_url = df_bleu_packages.copy()

# Regex for URLs that point at a code-hosting site. It is assembled from three
# adjacent raw-string pieces: scheme and optional "www.", the host alternatives,
# and the rest of the URL up to whitespace or ")" with no trailing period.
regex_codebases = (
    r'https?://(?:www\.)?'
    r'(?:github\.com|gitlab\.com|bitbucket\.org|sourceforge\.net|google\.code|code\.google)'
    r'[^\s)]*(?<!\.)'
)

def extract_codebases(text):
    """Return every code-hosting URL found in text, in order of appearance."""
    # regex_codebases has no capturing groups, so each match's full text is
    # exactly what re.findall would return for it.
    return [hit.group(0) for hit in re.finditer(regex_codebases, text)]

def _codebase_urls_for_row(row):
    """Return the code URLs of a BLEU paper that has text; [] for any other row."""
    if row['paper_bleu_prelim'] and pd.notnull(row['paper_text']):
        return extract_codebases(row['paper_text'])
    return []


# One list of code-hosting URLs per paper, stored in a single cell.
df_bleu_url["code_bleu_url"] = df_bleu_url.apply(_codebase_urls_for_row, axis=1)

#### Does the code mention BLEU?

In [None]:
def extract_github_repo_names(urls):
    """
    Turn GitHub URLs into "owner/repo" names.

    Parameters:
    - urls: Iterable of URL strings.

    Returns:
    - List with one "owner/repo" string per usable GitHub URL, in input order
      (duplicates are kept). URLs are skipped when they cannot be parsed, when
      their host is not exactly github.com / www.github.com, or when their
      path does not contain both an owner and a repository segment.
    """
    github_hosts = ("github.com", "www.github.com")
    repo_names = []
    for url in urls:
        try:
            parsed = urlparse(url)
            host = parsed.hostname  # lower-cased, without port or credentials
        except ValueError:
            # Text extracted from PDFs can contain malformed URLs
            # (e.g. unbalanced "[" in the host part); ignore those.
            continue
        # Exact host comparison: a substring test would also accept look-alike
        # domains and subdomains whose paths are not owner/repo.
        if host not in github_hosts:
            continue
        segments = [segment for segment in parsed.path.split("/") if segment]
        if len(segments) < 2:
            continue  # profile or landing page, not a repository
        owner = segments[0]
        # Drop sentence punctuation glued to the URL and the clone suffix.
        repo = segments[1].rstrip(".,;:")
        if repo.endswith(".git"):
            repo = repo[:-len(".git")]
        if repo:
            repo_names.append(f"{owner}/{repo}")
    return repo_names

def _github_repos_for_row(row):
    """Return the GitHub repository names of a BLEU paper; [] for any other row."""
    if row['paper_bleu_prelim']:
        return extract_github_repo_names(row['code_bleu_url'])
    return []


# GitHub repository names are derived only for rows flagged as BLEU papers.
df_bleu_url['code_bleu_github'] = df_bleu_url.apply(_github_repos_for_row, axis=1)

# Reproducibility

In [None]:
# New frame for the reproducibility verdicts; every row starts as pd.NA
# ("not decided yet") in the 'reproducible' column.
df_bleu_reproducible = df_bleu_url.assign(reproducible=pd.NA)

In [None]:
# R1: a paper is reproducible when it names at least one BLEU package AND
# reports parameters.
# The package column holds an empty list for BLEU papers in which no package
# was detected, so a plain .notna() test would accept them; require a
# non-empty list instead.
has_packages = df_bleu_reproducible['paper_bleu_packages'].map(
    lambda packages: isinstance(packages, list) and len(packages) > 0
).astype(bool)
has_params = df_bleu_reproducible['paper_bleu_params'].notna()
condition_r1 = has_packages & has_params
df_bleu_reproducible.loc[condition_r1, 'reproducible'] = True

df_bleu_reproducible['reproducible'].value_counts()

In [None]:
# R2: Check for no configuration packages
# Packages whose BLEU score needs no further configuration to be reproduced.
# Entries must be keys of regex_bleu_versions, because they are compared with
# the values stored in 'paper_bleu_packages'. The list previously held METEOR
# identifiers, none of which can occur in that column, so R2 never applied.
# NOTE(review): the BLEU names below are the by-name counterparts of the old
# METEOR entries (coco, py*, nltk) - confirm this is the intended set.
no_config_packages = ['BLEU_coco', 'coco', 'pybleu', 'nltk_bleu']

def check_reproducibility(row):
    """
    Apply rule R2 to one row and return its (possibly updated) verdict.

    A row whose 'reproducible' value is neither False nor missing is returned
    unchanged. Otherwise the verdict becomes True when 'paper_bleu_packages'
    is a non-empty list containing at least one entry of no_config_packages;
    in every other case the current value is returned as it is.
    """
    current = row['reproducible']
    undecided = current is False or pd.isna(current)
    if not undecided:
        return current

    packages = row['paper_bleu_packages']
    if not isinstance(packages, list) or not packages:
        return current

    for pkg in packages:
        if pkg in no_config_packages:
            return True
    return current

# Run rule R2 over every row, then store the resulting verdicts.
r2_verdicts = df_bleu_reproducible.apply(check_reproducibility, axis=1)
df_bleu_reproducible['reproducible'] = r2_verdicts

df_bleu_reproducible['reproducible'].value_counts()

In [None]:
# Persist the reviewed frame so later sessions can start from this point.
review_pickle_path = "bleu_paper_review.pkl"
df_bleu_reproducible.to_pickle(review_pickle_path)

# Export

In [None]:
def save_dataset(df: pd.DataFrame, filepath: str = "bleu_papers.jsonl.gz") -> None:
    """
    Write a DataFrame as gzip-compressed JSON Lines (one record per line).

    A failure is reported on standard output and is not raised to the caller.

    Parameters:
    - df: The DataFrame to save.
    - filepath: Destination path of the .jsonl.gz file.
    """
    try:
        df.to_json(filepath, orient="records", lines=True, compression="gzip")
    except Exception as exc:
        print(f"Could not save dataset: {exc}")
    else:
        print(f"Dataset successfully saved to {filepath}")

# Export the final review table as gzip-compressed JSON Lines.
save_dataset(df=df_bleu_reproducible, filepath="bleu_papers.jsonl.gz")