###  Filtering Scientific Papers Based on Specific Keywords in Title or Abstract - final 

##### Including regex: replaces `\n` and `\r` with spaces for matching purposes.

In [18]:
import json
import re

"""
    This Python script filters scientific papers from a JSON dataset based on the presence of specified 
    keywords in either the title or abstract. The filtering is performed using regular expressions for 
    efficient matching. The filtered papers are then exported to a new JSON file for further analysis.
"""
#keywords to filter the papers.
keywords = [
    "llm", "llms", "language model", "large language", "chatgpt", "llama", "openai", "gpt",
    "large-scale pre-trained language", "large-scale languag", "deepmind", "bert", "few shot learning", 
    "zero shot learning", "t5"
]
#regex
keyword_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
# Read the collected paper records from disk and decode the JSON document.
with open('data_collected_Jan2022-July2022.json', encoding='utf-8') as source_file:
    papers = json.loads(source_file.read())

def normalize_text(text):
    """Return `text` on a single line with whitespace runs collapsed.

    Every run of whitespace (newlines, carriage returns, tabs, repeated
    spaces) becomes exactly one space and leading/trailing whitespace is
    removed. Collapsing matters for keyword matching: replacing "\\r\\n"
    character by character would leave two spaces and a multi-word keyword
    written with a single space would no longer match.

    Args:
        text: The string to normalise; `None` or an empty string is accepted.

    Returns:
        The normalised string, or '' when `text` is `None` or empty.
    """
    if not text:
        return ''
    # str.split() without arguments splits on any whitespace run and drops
    # empty leading/trailing pieces.
    return ' '.join(text.split())
# Keep every paper whose title or abstract mentions at least one keyword.
# `or ''` covers records where the field is missing or present but null, which
# would otherwise raise a TypeError during string concatenation.
filtered_papers = [
    paper for paper in papers
    if keyword_pattern.search(
        normalize_text(paper.get('title') or '')
        + ' '
        + normalize_text(paper.get('summary') or '')
    )
]
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 output
# file instead of writing \uXXXX escapes.
with open('data_collected_Jan2022-July2022_filtered.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_papers, f, ensure_ascii=False, indent=4)
print(f"Filtered {len(filtered_papers)} papers mentioning the keywords.")


Filtered 1155 papers mentioning the keywords.


###### Requiring at least two keyword matches per paper to reduce false positives

In [4]:
import json
import re
import os

"""
    This Python script filters scientific papers from a JSON dataset, keeping only papers in which the 
    specified keywords are matched at least twice across the title and abstract combined (repeated 
    occurrences of the same keyword count). Matching uses a case-insensitive regular expression, and the 
    filtered papers are then exported to a new JSON file for further analysis.
"""
# Terms that mark a paper as LLM-related; singular and plural forms are listed
# separately because the pattern below only matches whole words/phrases.
keywords = [
    "llm", "llms",
    "large language model", "large language models",
    "chatgpt", "llama", "openai", "gpt",
    "large-scale pre-trained language model", "large-scale pre-trained language models",
    "large-scale language model", "large-scale language models",
    "deepmind", "bert",
    "few-shot learning", "zero-shot learning",
    "t5",
    "transformer model", "transformer models",
    "roberta", "xlm",
    "generative model", "generative models",
]
# One case-insensitive alternation over the escaped terms, wrapped in word
# boundaries on both sides.
keyword_pattern = re.compile(
    r'\b(?:{})\b'.format('|'.join(re.escape(term) for term in keywords)),
    flags=re.IGNORECASE,
)
# Source dataset and destination for the filtered subset; '~' is expanded to
# the current user's home directory.
input_path = os.path.expanduser('~/Master Thesis Draft/acl_data/naacl2024.json')
output_path = os.path.expanduser('~/Master Thesis Draft/acl_data/naacl2024_filtered.json')

# Read the input file as UTF-8 text and decode the JSON document.
with open(input_path, encoding='utf-8') as source_file:
    papers = json.loads(source_file.read())
def normalize_text(text):
    """Return `text` on a single line with whitespace runs collapsed.

    Every run of whitespace (newlines, carriage returns, tabs, repeated
    spaces) becomes exactly one space and leading/trailing whitespace is
    removed. Collapsing matters for keyword matching: replacing "\\r\\n"
    character by character would leave two spaces and a multi-word keyword
    written with a single space would no longer match.

    Args:
        text: The string to normalise; `None` or an empty string is accepted.

    Returns:
        The normalised string, or '' when `text` is `None` or empty.
    """
    if not text:
        return ''
    # str.split() without arguments splits on any whitespace run and drops
    # empty leading/trailing pieces.
    return ' '.join(text.split())
def filter_papers_by_keywords(papers, keyword_pattern, min_matches=2):
    """Select the papers whose title and abstract contain enough keyword hits.

    Matches are counted separately in the normalised title and the normalised
    summary and then added together. Every occurrence counts, so the same
    keyword appearing twice satisfies a threshold of 2.

    Args:
        papers: Iterable of dict-like paper records; the 'title' and
            'summary' entries are read, and a missing or null entry is
            treated as an empty string.
        keyword_pattern: Compiled regular expression used to find keywords.
        min_matches: Minimum total number of matches (title + summary)
            required to keep a paper. Defaults to 2.

    Returns:
        A list with the records that reach `min_matches`, in input order.
    """
    filtered_papers = []
    for paper in papers:
        # `or ''` also covers fields that are present but null.
        title = normalize_text(paper.get('title') or '')
        summary = normalize_text(paper.get('summary') or '')
        match_count = len(keyword_pattern.findall(title)) + len(keyword_pattern.findall(summary))

        if match_count >= min_matches:
            filtered_papers.append(paper)

    return filtered_papers
# Apply the keyword filter to the loaded papers.
filtered_papers = filter_papers_by_keywords(papers, keyword_pattern)

# Persist the selection as indented JSON; non-ASCII characters are written
# verbatim rather than as \uXXXX escapes.
with open(output_path, 'w', encoding='utf-8') as sink:
    json.dump(filtered_papers, sink, ensure_ascii=False, indent=4)

print(f"Filtered {len(filtered_papers)} papers mentioning the keywords.")


Filtered 271 papers mentioning the keywords.
