# Lab Rotation

In [1]:
import pandas as pd
import re as re
import numpy as np
import requests
from urllib.parse import urlparse
from time import sleep

In [2]:
# TODO 
github_token = "ADD_YOUR_GITHUB_TOKEN"

## Data Collection
@Misc{acl-ocl,
    author =       {Shaurya Rohatgi, Yanxia Qin, Benjamin Aw, Niranjana Unnithan, Min-Yen Kan},
    title =        {The ACL OCL Corpus: advancing Open science in Computational Linguistics},
    howpublished = {arXiv},
    year =         {2022},
    url =          {https://huggingface.co/datasets/ACL-OCL/ACL-OCL-Corpus}
}

In [3]:
df = pd.read_parquet('data/acl-publication-info.74k.v2.parquet')

df.head()

Unnamed: 0,acl_id,abstract,full_text,corpus_paper_id,pdf_hash,numcitedby,url,publisher,address,year,...,doi,number,volume,journal,editor,isbn,ENTRYTYPE,ID,language,note
0,O02-2002,There is a need to measure word similarity whe...,There is a need to measure word similarity whe...,18022704,0b09178ac8d17a92f16140365363d8df88c757d0,14,https://aclanthology.org/O02-2002,,,2002,...,,,,,,,inproceedings,chen-you-2002-study,,
1,L02-1310,,,8220988,8d5e31610bc82c2abc86bc20ceba684c97e66024,93,http://www.lrec-conf.org/proceedings/lrec2002/...,European Language Resources Association (ELRA),"Las Palmas, Canary Islands - Spain",2002,...,,,,,,,inproceedings,mihalcea-2002-bootstrapping,,
2,R13-1042,Thread disentanglement is the task of separati...,Thread disentanglement is the task of separati...,16703040,3eb736b17a5acb583b9a9bd99837427753632cdb,10,https://aclanthology.org/R13-1042,"INCOMA Ltd. Shoumen, BULGARIA","Hissar, Bulgaria",2013,...,,,,,,,inproceedings,jamison-gurevych-2013-headerless,,
3,W05-0819,"In this paper, we describe a word alignment al...","In this paper, we describe a word alignment al...",1215281,b20450f67116e59d1348fc472cfc09f96e348f55,15,https://aclanthology.org/W05-0819,Association for Computational Linguistics,"Ann Arbor, Michigan",2005,...,,,,,,,inproceedings,aswani-gaizauskas-2005-aligning,,
4,L02-1309,,,18078432,011e943b64a78dadc3440674419821ee080f0de3,12,http://www.lrec-conf.org/proceedings/lrec2002/...,European Language Resources Association (ELRA),"Las Palmas, Canary Islands - Spain",2002,...,,,,,,,inproceedings,suyaga-etal-2002-proposal,,


In [4]:
df = df.rename(columns={"acl_id": "paper_ident", # unique paper identifier
                   "url": "paper_url", # Paper online abstract page URL.
                   "author": "paper_author", # Author list.
                   "title": "paper_title", # Paper title
                   "journal": "paper_venue", # Venue abbreviation.
                   "year": "paper_year", # Publication year
                   "month": "paper_month", # Publication month.
                   "booktitle": "paper_booktitle", # BibTeX booktitle field.
                   "address": "paper_address", # BibTeX adress field
                   "publisher": "paper_publisher", # BibTeX publisher field      
                   "pages": "paper_pages", # BibTeX pages.
                   "full_text": "paper_text",
                   })

df = df.drop(columns=["abstract", "corpus_paper_id", "pdf_hash", "doi",
                              "numcitedby", "number", "volume",  
                              "editor", "isbn", "ENTRYTYPE","ID", "language", "note"])

In [5]:
df['error_download'] = df['paper_text'].apply(lambda x: not x.strip() if isinstance(x, str) else True)

df['error_download'].value_counts()

error_download
False    67414
True      5871
Name: count, dtype: int64

# ROUGE Scores
## Reproducibility

### ROUGE Identification

In [6]:
df_rouge_prelim = df.copy()
df_rouge_prelim["paper_rouge_prelim"] = df_rouge_prelim["paper_text"].str.contains("rouge", case=False)

df_rouge_prelim['paper_rouge_prelim'].value_counts()

paper_rouge_prelim
False    64862
True      2593
Name: count, dtype: int64

### Paper Review
#### ROUGE Parameters

In [7]:
def extract_parameters(text):
    pattern = r"((?: -[a-z123](?: [a-z0-9.]{1,4})?){2,})"
    matches = re.findall(pattern, text)
    return matches[0] if matches else None

In [8]:
df_rouge_params = df_rouge_prelim.copy()
df_rouge_params["paper_rouge_params"] = df_rouge_params.apply(lambda row: extract_parameters(row['paper_text']) if row['paper_rouge_prelim'] else None, axis=1)


df_rouge_params['paper_rouge_params'].notna().sum()

25

#### ROUGE Protocol

In [9]:
regex_rouge_protocol = {
    'stemming': r'\b(?:stems?|stemming|stemmer|porter)\b',
    'tokenization': r'\b(?:tokenized?|tokenizer|tokenization|pre-tokenized?|detokenized?)\b',
    'sentence_tokenization': r'sentence split|split sentence|sentence tokeniz|tokenize sentence',
    'stopword_removal': r'\b(?:stop( -)?words?)\b',
    'bootstrapping': r'(?:bootstrap|confidence (?:level|interval))'
}

In [10]:
def search_terms_near_rouge(text, regex_dict):
    results = []
    for term, pattern in regex_dict.items():
        # Find all occurrences of 'rouge' (case insensitive)
        for match in re.finditer(r'rouge', text, re.IGNORECASE):
            start, end = match.start(), match.end()
            # Define a 500-character window around 'rouge'
            window_start, window_end = max(0, start - 500), min(len(text), end + 500)
            # Search for the term within this window
            if re.search(pattern, text[window_start:window_end], re.IGNORECASE):
                results.append(term)
    return list(set(results))  # Return unique terms

In [11]:
df_rouge_protocol=df_rouge_params.copy()
df_rouge_protocol['paper_rouge_protocol'] = df_rouge_protocol[df_rouge_protocol['paper_rouge_prelim'] == True]['paper_text'].apply(lambda x: search_terms_near_rouge(x, regex_rouge_protocol))

df_rouge_protocol["paper_rouge_protocol"].value_counts().head(20)

paper_rouge_protocol
[]                                                      2134
[stemming]                                               145
[bootstrapping]                                           97
[tokenization]                                            67
[stemming, stopword_removal]                              41
[stopword_removal]                                        40
[stemming, bootstrapping]                                 15
[stemming, bootstrapping, stopword_removal]               11
[stemming, tokenization]                                  10
[tokenization, sentence_tokenization]                      8
[tokenization, bootstrapping]                              7
[stemming, tokenization, stopword_removal]                 6
[sentence_tokenization]                                    3
[bootstrapping, stopword_removal]                          3
[tokenization, bootstrapping, sentence_tokenization]       2
[stemming, bootstrapping, sentence_tokenization]           2
[st

#### ROUGE Variants

In [12]:
regex_rouge_variants = {
    'precision': r'\b(?:precision)\b',
    'recall': r'\b(?:recall)\b',
    'f-score': r'(?:\b(?:f1?[- ]scores?|f1?[- ]measures?)\b)| f-?1[^a-z0-9]'
}

In [13]:
df_rouge_variants=df_rouge_protocol.copy()
df_rouge_variants['paper_rouge_variants'] = df_rouge_variants[df_rouge_variants['paper_rouge_prelim'] == True]['paper_text'].apply(lambda x: search_terms_near_rouge(x, regex_rouge_variants))

df_rouge_variants['paper_rouge_variants'].value_counts()

paper_rouge_variants
[]                              1425
[f-score]                        374
[recall, precision, f-score]     242
[recall]                         215
[recall, precision]              160
[recall, f-score]                103
[precision]                       50
[precision, f-score]              24
Name: count, dtype: int64

#### ROUGE packages

In [14]:
def search_for_regex_pattern(text, regex_dict):
    results = []
    for term, pattern in regex_dict.items():
        # Search for the pattern in the entire text
        if re.search(pattern, text, re.IGNORECASE):
            results.append(term)
    return list(set(results))  # Return unique terms

In [15]:
regex_rouge_packages = {
    'DD/sacrerouge': r'sacrerouge',
    'ND/easyrouge': r'easy.rouge|neural.{0,3}dialogue.{0,3}metrics',
    'CW/sumeval': r'chakki.{0,3}works|sumeval',
    'JG/pyrouegzh': r'py_rouge_zh',
    'AR/gingo': r'asahi-research.{0,5}Gingo',
    'DF/gerouge': r'gerouge',
    'GL/seq2seq': r'seq2seq.{0,5}metrics.{0,5}rouge',
    'GL/rougescore': r'rouge-score|google.research.{0,50}rouge',
    'PT/files2rouge': r'files?2rouge',
    'PC/pyrouge': r'pcyin',   
    'KZ/rougepapier': r'rouge.papier',
    'DI/pyrouge': r'py-rouge|diego999',    
    'PT/pyrouge': r'pltrdy.{0,5}pyrouge',
    'PT/rouge': r'pltrdy[^p]{0,5}rouge|pypi.{0,5}project.{0,5}rouge',
    'AJ/pyrouge': r'andersjo',
    'BZ/pyrouge': r'bheinzerling|pypi.{0,5}project.{0,5}pyrouge|pypi.{0,5}pyrouge',
    'TG/pythonrouge': r'tagucci|pythonrouge',
    'KG/rouge2': r'kavgan|rxnlp|rouge.2\.0|jrouge|java rouge|kavita.ganesan.com',
    'MS/rouge': r'nlg-eval|e2e-metrics|qgevalcap|nmtpytorch|pycocoevalcap|\\btylin\\b|coco-caption',
    'github rouge': r'github.com.{0,50}rouge',
    'unknown pyrouge': r'pyrouge',
    'ROUGE-1.5.5': r'official rouge|rouge toolkit|rouge-?1\.?5\.?5|rouge.{0,15}1.?5.?5.?|rougeeval|berouge\..{0,2}com|cly/.{0,2}rouge|isi\.edu/.{0,2}rouge|isi\.edu/.{0,2}licensed-sw/.{0,2}see/.{0,2}rouge'
}

In [16]:
df_rouge_packages = df_rouge_variants.copy()
# Applying the function to the DataFrame
df_rouge_packages['paper_rouge_packages'] = df_rouge_packages[df_rouge_packages['paper_rouge_prelim'] == True]\
    ['paper_text'].apply(lambda x: search_for_regex_pattern(x, regex_rouge_packages))

df_rouge_packages["paper_rouge_packages"].value_counts().head(20)

paper_rouge_packages
[]                                             2319
[ROUGE-1.5.5]                                   111
[MS/rouge]                                       32
[unknown pyrouge]                                29
[GL/rougescore]                                  15
[KG/rouge2]                                      14
[DI/pyrouge]                                     10
[github rouge]                                    5
[unknown pyrouge, github rouge, BZ/pyrouge]       5
[PT/files2rouge]                                  5
[ROUGE-1.5.5, unknown pyrouge]                    4
[PT/rouge, DI/pyrouge]                            3
[unknown pyrouge, BZ/pyrouge]                     3
[github rouge, PT/files2rouge]                    3
[github rouge, GL/rougescore]                     3
[ND/easyrouge]                                    2
[github rouge, PT/rouge]                          2
[DD/sacrerouge]                                   2
[CW/sumeval]                               

In [17]:
df_rouge_packages.to_pickle("rouge_paper_review.pkl")

### Code Review
#### URL of code repository cited in paper

In [18]:
df_rouge_packages = pd.read_pickle("rouge_paper_review.pkl")

In [19]:
df_rouge_url = df_rouge_packages.copy()

# regex for ALL codebases
regex_codebases = r'https?://(?:www\.)?(?:github\.com|gitlab\.com|bitbucket\.org|sourceforge\.net|google\.code|code\.google)[^\s)]*(?<!\.)'

# Function to extract URLs from a text
def extract_codebases(text):
    return re.findall(regex_codebases, text)

# Apply extract_codebases function to 'paper_text', store URLs in a list within each cell
df_rouge_url["code_rouge_url"] = df_rouge_url.apply(
    lambda row: extract_codebases(row['paper_text']) if row['paper_rouge_prelim'] and pd.notnull(row['paper_text']) else [],
    axis=1
)

In [20]:
df_rouge_url["code_rouge_url"].value_counts()

code_rouge_url
[]                                                                                                                               72788
[https://github.com/]                                                                                                               92
[https://github.com/UKPLab/]                                                                                                         4
[https://github.com/google-research/]                                                                                                4
[https://github.com/huggingface/]                                                                                                    4
                                                                                                                                 ...  
[https://github.com/passeul/]                                                                                                        1
[https://github.com/manikbhandari/Revisi

#### Does the code mention ROUGE?

In [None]:
# Extract GitHub repository names from URLs
def extract_github_repo_names(urls):
    return ["/".join(urlparse(url).path.strip("/").split("/")[:2]) for url in urls if "github.com" in urlparse(url).netloc]

# Apply the function to extract GitHub repository names only if 'paper_rouge_prelim' is True
df_rouge_url['code_rouge_github'] = df_rouge_url.apply(lambda row: extract_github_repo_names(row['code_rouge_url']) if row['paper_rouge_prelim'] else [], axis=1)

## Reproducibility

In [22]:
df_rouge_reproducible = df_rouge_url.copy()

# Initialize the 'reproducible' column as a nullable boolean
df_rouge_reproducible['reproducible'] = pd.NA

In [24]:
# R1: Check if both packages and params are not null
condition_r1 = df_rouge_reproducible['paper_rouge_packages'].notna() & df_rouge_reproducible['paper_rouge_params'].notna()
df_rouge_reproducible.loc[condition_r1, 'reproducible'] = True

df_rouge_reproducible['reproducible'].value_counts()

reproducible
True    25
Name: count, dtype: int64

In [26]:
# R2: Check for no configuration packages
# Define the no configuration packages list
no_config_packages = ['MS/rouge', 'GL/seq2seq']

def check_reproducibility(row):
    # Only modify if reproducible is False or pd.NA
    if row['reproducible'] is False or pd.isna(row['reproducible']):
        # Check if paper_meteor_packages is a list and not empty or NA
        if isinstance(row['paper_rouge_packages'], list) and row['paper_rouge_packages']:
            # Check if any package in the list requires no configuration
            if any(pkg in no_config_packages for pkg in row['paper_rouge_packages']):
                return True
    return row['reproducible']

# Apply the function to update the 'reproducible' column
df_rouge_reproducible['reproducible'] = df_rouge_reproducible.apply(check_reproducibility, axis=1)

df_rouge_reproducible['reproducible'].value_counts()

reproducible
True    62
Name: count, dtype: int64

In [None]:
df_rouge_reproducible.to_pickle("rouge_paper_review.pkl")

# Export

In [None]:
def save_dataset(df: pd.DataFrame, filepath: str = "meteorscores/data/rouge_papers.jsonl.gz") -> None:
    """
    Save the DataFrame to a .jsonl.gz file.
    
    Parameters:
    - df: The DataFrame to save.
    - filepath: The file path where the DataFrame should be saved.
    """
    try:
        df.to_json(filepath, orient="records", lines=True, compression="gzip")
        print(f"Dataset successfully saved to {filepath}")
    except Exception as e:
        print(f"Could not save dataset: {e}")

save_dataset(df_rouge_reproducible, "meteorscores/data/rouge_papers.jsonl.gz")