## Download the dataset (title + abstract)

The titles are in the paper folder while the abstracts are in the abstract folder.


In [None]:
"""
This code downloads and processes academic paper data from the Semantic Scholar API. Here's what it does:

1. Authentication:
- Uses an API key to authenticate with Semantic Scholar API
- Creates a local output directory './s2ag_dataset_title_abstracts'

2. Data Download:
- Gets the latest data release ID
- Downloads two datasets: 'papers' and 'abstracts' 
- Saves the downloaded files as gzipped files in dataset-specific subdirectories:
  - ./s2ag_dataset_title_abstracts/papers/papers_*.gz
  - ./s2ag_dataset_title_abstracts/abstracts/abstracts_*.gz

3. Data Processing:
- Extracts titles and abstracts specifically for Computer Science papers
- Matches papers with their abstracts using corpusid
- Creates a pandas DataFrame with matched title-abstract pairs

4. Output Files:
- Gzipped raw data files in the papers/ and abstracts/ subdirectories
- Final CSV file (csv_output_path) containing:
  - Column 'title': Paper titles
  - Column 'abstract': Corresponding paper abstracts
  - Only includes Computer Science papers that have both title and abstract

Note: The csv_output_path variable appears to be undefined in the code snippet,
but the final CSV would contain the processed title-abstract pairs.
"""

import requests
import json
import os
import gzip
import pandas as pd

# Set your S2 API key
api_key = ""

# Headers for authentication
headers = {
    "x-api-key": api_key
}

# Base output directory locally
base_output_dir = "./s2ag_dataset_title_abstracts"
os.makedirs(base_output_dir, exist_ok=True)


# Step 1: Get the latest release ID
latest_release_url = "https://api.semanticscholar.org/datasets/v1/release/latest"
response = requests.get(latest_release_url, headers=headers)
if response.status_code != 200:
    print(f"Failed to fetch latest release: {response.status_code} - {response.text}")
    exit()

latest_release = response.json()
release_id = latest_release["release_id"]
print("Latest release ID:", release_id)

# Step 2: Datasets to download
datasets = ["papers", "abstracts"]

# Dictionary to store data
data = {"title": [], "abstract": []}

for dataset_name in datasets:
    # Create dataset-specific directory
    output_dir = os.path.join(base_output_dir, dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    # Step 3: Get metadata for the dataset
    dataset_url = f"https://api.semanticscholar.org/datasets/v1/release/{release_id}/dataset/{dataset_name}"
    response = requests.get(dataset_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch metadata for {dataset_name}: {response.status_code} - {response.text}")
        continue

    dataset_info = response.json()
    print(f"\n{dataset_name.capitalize()} dataset metadata:")
    print(json.dumps(dataset_info, indent=2))

    # Step 4: Download all dataset files with simplified file names
    for index, file_url in enumerate(dataset_info["files"]):  # Process all files
        # Generate a shorter file name (e.g., papers_0.gz, abstracts_1.gz)
        file_name = f"{dataset_name}_{index}.gz"
        output_path = os.path.join(output_dir, file_name)

        print(f"Downloading {dataset_name}/{file_name}...")
        file_response = requests.get(file_url, headers=headers, stream=True)

        if file_response.status_code == 200:
            with open(output_path, "wb") as f:
                for chunk in file_response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            print(f"Saved {file_name} to {output_path}")
        else:
            print(f"Failed to download {file_name}: {file_response.status_code} - {file_response.text}")

# Step 5: Process the datasets to extract titles and abstracts for Computer Science papers
print("\nProcessing datasets to extract titles and abstracts for Computer Science papers...")

# Dictionary to store titles and abstracts by corpusid
title_dict = {}
abstract_dict = {}

# Process papers dataset (for titles and field of study)
papers_dir = os.path.join(base_output_dir, "papers")
if os.path.exists(papers_dir):
    for file_name in os.listdir(papers_dir):
        if file_name.endswith(".gz"):
            file_path = os.path.join(papers_dir, file_name)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            corpusid = record.get("corpusid")
                            title = record.get("title")
                            fields_of_study = record.get("fieldsOfStudy", [])
                            # Filter for Computer Science
                            if corpusid and title and "Computer Science" in fields_of_study:
                                title_dict[corpusid] = title
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON in {file_name}: {e}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
else:
    print("Papers directory not found. Skipping paper processing.")

# Process abstracts dataset
abstracts_dir = os.path.join(base_output_dir, "abstracts")
if os.path.exists(abstracts_dir):
    for file_name in os.listdir(abstracts_dir):
        if file_name.endswith(".gz"):
            file_path = os.path.join(abstracts_dir, file_name)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            corpusid = record.get("corpusid")
                            abstract = record.get("abstract")
                            if corpusid and abstract:
                                abstract_dict[corpusid] = abstract
                        except json.JSONDecodeError as e:
                            print(f"Error decoding JSON in {file_name}: {e}")
            except Exception as e:
                print(f"Error reading {file_name}: {e}")
else:
    print("Abstracts directory not found. Skipping abstract processing.")

# Step 6: Combine titles and abstracts by corpusid for Computer Science papers
for corpusid in title_dict:
    if corpusid in abstract_dict:
        data["title"].append(title_dict[corpusid])
        data["abstract"].append(abstract_dict[corpusid])

# Step 7: Create DataFrame and save to CSV
if data["title"]:
    df = pd.DataFrame(data)
    print("\nSample of DataFrame (Computer Science papers):")
    print(df.head())

    # Save DataFrame to CSV
    df.to_csv(csv_output_path, index=False, encoding="utf-8")
    print(f"\nSaved DataFrame to {csv_output_path}")
else:
    print("\nNo matching title and abstract pairs found for Computer Science papers. CSV not created.")

print("\nProcessing complete!")

## DATASET CONSTRUCTION
Combine the datasets and filter English Only papers


In [None]:
import gzip
import json
import os
import re

# --- Config ---
papers_dir = "./s2ag_dataset_title_abstracts/papers/test_title"
abstracts_dir = "./s2ag_dataset_title_abstracts/abstracts/test_abstract"
output_dir = r"C://Users//Faisal Ramzan//Desktop//kmi_project_cso//paper_dataset"

titles_output_file = os.path.join(output_dir, "clean_titles.txt")
abstracts_output_file = os.path.join(output_dir, "clean_abstracts.txt")
merged_output_file = os.path.join(output_dir, "paper_dataset.txt")

buffer_size = 5000  # Optimal for performance

# --- Regex Patterns ---
LATEX_PATTERN = re.compile(r'\\usepackage|\\documentclass|\\begin\{document\}|\\cite\{|\\ref\{')
NON_ENGLISH_UNICODE_PATTERN = re.compile(r'[٠-٩\u0600-\u06FF\u4e00-\u9fff\uac00-\ud7af]')

def is_strictly_english(text: str, min_length: int = 10, min_words: int = 2) -> bool:
    if not text or len(text) < min_length:
        return False
    if NON_ENGLISH_UNICODE_PATTERN.search(text):
        return False
    if LATEX_PATTERN.search(text.lower()):
        return False
    text_sample = f" {text.lower()[:200]} "
    common_words = {
        "the", "this", "that", "we", "our", "an", "a", "and", "is", "are",
        "for", "with", "from", "by", "on", "of", "in", "to", "using", "can",
        "have", "has", "as", "be", "based", "new", "approach", "method",
        "study", "paper", "research", "results", "present", "analysis",
        "model", "data", "system", "algorithm", "network", "learning",
        "detection", "classification", "prediction", "optimization",
        "propose", "show", "demonstrate", "evaluate", "performance",
        "experimental", "implementation", "framework", "problem", "solution"
    }
    return sum(1 for word in common_words if f" {word} " in text_sample) >= min_words

# --- Step 1: Process Titles ---
def process_titles():
    print(" Processing English titles...")
    os.makedirs(output_dir, exist_ok=True)
    files = [f for f in os.listdir(papers_dir) if f.endswith(".gz")]
    total_written = 0
    buffer = []

    with open(titles_output_file, "w", encoding="utf-8", buffering=16384) as out_f:
        for file in files:
            file_path = os.path.join(papers_dir, file)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            title = record.get("title")
                            if title and isinstance(title, str):
                                title = title.strip()
                                if title and is_strictly_english(title):
                                    buffer.append(title + "\n")
                                    total_written += 1
                        except json.JSONDecodeError:
                            continue

                        if len(buffer) >= buffer_size:
                            out_f.writelines(buffer)
                            buffer.clear()
            except Exception as e:
                print(f" Error reading {file}: {e}")
                continue

        if buffer:
            out_f.writelines(buffer)

    print(f" Step 1 complete: {total_written:,} titles written to {titles_output_file}")

# --- Step 2: Process Abstracts ---
def process_abstracts():
    print("\n Processing English abstracts...")
    os.makedirs(output_dir, exist_ok=True)
    files = [f for f in os.listdir(abstracts_dir) if f.endswith(".gz")]
    total_written = 0
    buffer = []

    with open(abstracts_output_file, "w", encoding="utf-8", buffering=16384) as out_f:
        for file in files:
            file_path = os.path.join(abstracts_dir, file)
            try:
                with gzip.open(file_path, "rt", encoding="utf-8") as f:
                    for line in f:
                        try:
                            record = json.loads(line.strip())
                            abstract = record.get("abstract")
                            if abstract and isinstance(abstract, str):
                                abstract = abstract.strip()
                                if abstract and is_strictly_english(abstract, min_length=20, min_words=3):
                                    buffer.append(abstract + "\n")
                                    total_written += 1
                        except json.JSONDecodeError:
                            continue

                        if len(buffer) >= buffer_size:
                            out_f.writelines(buffer)
                            buffer.clear()
            except Exception as e:
                print(f" Error reading {file}: {e}")
                continue

        if buffer:
            out_f.writelines(buffer)

    print(f" Step 2 complete: {total_written:,} abstracts written to {abstracts_output_file}")

# --- Step 3: Merge Titles and Abstracts ---
def merge_titles_and_abstracts(titles_file_path, abstracts_file_path, output_file_path):
    print("\n Merging titles and abstracts into final output...")
    with open(titles_file_path, "r", encoding="utf-8") as title_f, \
         open(abstracts_file_path, "r", encoding="utf-8") as abstract_f, \
         open(output_file_path, "w", encoding="utf-8", buffering=16384) as output_f:

        count = 0
        while True:
            title = title_f.readline()
            abstract = abstract_f.readline()

            if not title or not abstract:
                break

            title_clean = title.replace('\n', ' ').strip()
            abstract_clean = abstract.replace('\n', ' ').strip()

            output_f.write(f"{title_clean}\r\n")
            output_f.write(f"{abstract_clean}\r\n")
            count += 1

    print(f" Step 3 complete: {count:,} title-abstract pairs written to {output_file_path}")

# --- Main ---
if __name__ == "__main__":
    print("Starting 3-step title + abstract processing pipeline...\n")
    process_titles()
    process_abstracts()
    merge_titles_and_abstracts(titles_output_file, abstracts_output_file, merged_output_file)
    print("\n Completed!")


Starting 3-step title + abstract processing pipeline...

 Processing English titles...


In [9]:
def print_first_100_lines(filepath):
    """
    Print the first 100 lines from a text file as-is.

    Args:
        filepath (str): Full path to the merged dataset file
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for i in range(100):
                line = f.readline()
                if not line:
                    print(f"\n[Reached end of file after {i} lines]\n")
                    break
                print(line.strip())
    except FileNotFoundError:
        print(f"⚠️ File not found: {filepath}")
    except Exception as e:
        print(f"⚠️ Error reading file: {e}")


In [10]:
merged_output_file = r"C://Users//Faisal Ramzan//Desktop//kmi_project_cso//paper_dataset//paper_dataset.txt"
print_first_100_lines(merged_output_file)


Vendor-Purchaser Coordination and Quantity Discount Pricing Model in Supply Chain
This paper presents a method to derive the Dyadic Green’s Function(DGF)ofa loaded rectangular waveguide by using the image method.In the calculation of the DGF,we use the integral transformation and replace the multi-infinite summation by a single one;thus it greatly simplifies the calculation and saves computer time.As an example of the DGF’sapplication,we give the moment method’s scattering field calculation of a metal sphere resting onthe broad wall of the loaded rectangular waveguide.Results of our calculations well agree withboth data of experiments performed in our laboratory and those are published.It is easy to seethat the method used in this paper can be expanded to other related waveguide problems.
A Comparative Analysis of Immigration Laws: Case Studies of the Canada, Germany, the United States of America (USA) and the United Kingdom (UK)
A study on the notion of covariant derivatives in flat a


Previewing the first 5 title-abstract pairs from:
C://Users//Faisal Ramzan//Desktop//kmi_project_cso//paper_dataset//paper_dataset.txt

📘 Title 1:
Vendor-Purchaser Coordination and Quantity Discount Pricing Model in Supply Chain
📄 Abstract 1:
This paper presents a method to derive the Dyadic Green’s Function(DGF)ofa loaded rectangular waveguide by using the image method.In the calculation of the DGF,we use the integral transformation and replace the multi-infinite summation by a single one;thus it greatly simplifies the calculation and saves computer time.As an example of the DGF’sapplication,we give the moment method’s scattering field calculation of a metal sphere resting onthe broad wall of the loaded rectangular waveguide.Results of our calculations well agree withboth data of experiments performed in our laboratory and those are published.It is easy to seethat the method used in this paper can be expanded to other related waveguide problems.
--------------------------------------