In [1]:
!pip install requests beautifulsoup4



## Read the file and check the data information

In [4]:
import pandas as pd

# Location of the arXiv 2023 metadata CSV on the local machine.
file_path = "C:/Users/romeo/AAAI'25/datasets/arXiv_2023.csv"

# Load the complete CSV into a DataFrame.
data = pd.read_csv(file_path)
data.head()  # NOTE(review): not the cell's last expression, so Jupyter will not render it

# Gather every value of the 'title' column into a plain Python list,
# then show (at most) the first 1000 entries.
paper_title = list(data['title'])
print(paper_title[:1000])

On non-trivial $\Lambda$-submodules with finite index of the plus/minus
  Selmer group over anticyclotomic $\mathbb{Z}_{p}$-extension at inert primes


## Scrape citation data from Google Scholar

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import random
import os

# --- ScraperAPI Configuration ---
# The API key is read from the SCRAPERAPI_API_KEY environment variable so the
# secret never lives in the notebook (the previously hard-coded key should be
# treated as leaked and rotated). When the variable is unset, the placeholder
# below is used; make_scraperapi_request() and main() both refuse to run with it.
SCRAPERAPI_API_KEY = os.environ.get("SCRAPERAPI_API_KEY", "YOUR_SCRAPERAPI_KEY_HERE")
# The base URL for ScraperAPI's proxy endpoint. HTTPS is used because the API
# key is sent as a query parameter and would otherwise travel in clear text.
# NOTE(review): confirm the HTTPS endpoint against your ScraperAPI plan/docs.
SCRAPERAPI_BASE_URL = "https://api.scraperapi.com/"

# --- Helper Function for Making ScraperAPI Requests ---

def make_scraperapi_request(target_url, params=None, max_retries=1, timeout=70):
    """
    Fetch ``target_url`` through the ScraperAPI proxy endpoint, retrying on failure.

    Args:
        target_url: URL of the page ScraperAPI should fetch.
        params: Optional dict of extra ScraperAPI query parameters. They are
            merged over the defaults (``api_key``, ``url``, ``render``), so a
            caller can override any of them.
        max_retries: Total number of attempts to make.
        timeout: Seconds to wait for each HTTP request before treating the
            attempt as failed. NOTE(review): 70 s is chosen to leave room for
            rendered pages; tune to ScraperAPI's current recommendation.

    Returns:
        The ``requests.Response`` of the first attempt that completes with a
        non-error status, or ``None`` when the API key is missing/placeholder
        or every attempt failed.
    """
    if not SCRAPERAPI_API_KEY or SCRAPERAPI_API_KEY == "YOUR_SCRAPERAPI_KEY_HERE":
        print("ERROR: ScraperAPI key is missing or is the placeholder. Please set SCRAPERAPI_API_KEY.")
        return None

    scraperapi_params = {
        "api_key": SCRAPERAPI_API_KEY,
        "url": target_url,
        "render": "true"
    }

    if params:
        scraperapi_params.update(params)

    for attempt in range(max_retries):
        try:
            print(f"  Attempt {attempt + 1}/{max_retries} for ScraperAPI URL: {target_url[:100]}...")
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(SCRAPERAPI_BASE_URL, params=scraperapi_params, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"  ScraperAPI request failed for {target_url[:100]}...: {e}")
            # Read the response from the exception itself: the local `response`
            # is unbound when requests.get() raised (connection error, timeout)
            # and would be stale from a previous attempt otherwise.
            failed_response = getattr(e, "response", None)
            if failed_response is not None:
                print(f"  ScraperAPI Response Status: {failed_response.status_code}")
                print(f"  ScraperAPI Response Content (first 200 chars): {failed_response.text[:200]}")

            # Randomised delay that grows linearly with the attempt number.
            # There is nothing left to wait for after the final attempt.
            if attempt + 1 < max_retries:
                time.sleep(random.uniform(10 * (attempt + 1), 20 * (attempt + 1)))
    print(f"Failed to retrieve {target_url} via ScraperAPI after {max_retries} attempts.")
    return None

# --- Scraping Functions (Now using ScraperAPI) ---

def scrape_cited_by_articles_via_scraperapi(cited_by_gs_url, max_citations=50):
    """
    Collect articles from a Google Scholar "Cited by" listing, following pagination.

    Args:
        cited_by_gs_url: URL of the first "Cited by" results page. A falsy
            value or the sentinel "N/A" yields an empty list.
        max_citations: Upper bound on the number of citing articles collected.

    Returns:
        A list of dicts with the keys "title", "authors", "abstract" and
        "keywords", in page order. The list is shorter than ``max_citations``
        when a request fails, a page has no result containers, or no "Next"
        link is found.
    """
    from urllib.parse import urljoin  # local import keeps this function self-contained

    if not cited_by_gs_url or cited_by_gs_url == "N/A":
        return []

    citing_articles_data = []
    next_page_gs_url = cited_by_gs_url

    while next_page_gs_url and len(citing_articles_data) < max_citations:
        print(f"  Scraping cited-by page: {next_page_gs_url[:100]}...")
        response = make_scraperapi_request(next_page_gs_url)

        if response is None:
            print(f"  Failed to get response for cited articles from {next_page_gs_url}. Stopping.")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        cited_article_containers = soup.find_all('div', class_='gs_ri')

        if not cited_article_containers:
            print(f"  DEBUG: No article containers ('gs_ri') found on cited-by page: {next_page_gs_url}")
            # Check for CAPTCHA if no containers are found, even though ScraperAPI should handle it
            if soup.find('div', id='gs_captcha_c') or soup.find('div', class_='g-recaptcha'):
                print("  DEBUG: Possible CAPTCHA page detected on cited-by page.")
            break  # No articles means no more pages or an issue occurred

        for container in cited_article_containers:
            if len(citing_articles_data) >= max_citations:
                break

            title_element = container.find('h3', class_='gs_rt')
            title = title_element.text.strip() if title_element else "N/A"

            # The 'gs_a' line is split on ' - ' and only the first part is kept as authors.
            authors_element = container.find('div', class_='gs_a')
            authors = authors_element.text.strip().split(' - ')[0] if authors_element else "N/A"

            abstract_element = container.find('div', class_='gs_rs')
            abstract = abstract_element.text.strip() if abstract_element else "N/A"

            keywords = "N/A (often not directly available on results page)"

            citing_articles_data.append({
                "title": title,
                "authors": authors,
                "abstract": abstract,
                "keywords": keywords
            })
            print(f"    Collected Citing Article {len(citing_articles_data)}:")
            print(f"      Title: {title}")
            print(f"      Authors: {authors}")
            print(f"      Abstract: {abstract}")
            print(f"      Keywords: {keywords}")

        # Quota reached: skip the pagination lookup and its politeness delay.
        if len(citing_articles_data) >= max_citations:
            break

        # Find the "Next" page link. The exact-class lookup is tried first; it
        # only matches when the class attribute is exactly this string and the
        # anchor's sole child is the text 'Next'. The fallback accepts any
        # anchor with an href whose visible text is 'Next'.
        next_button = soup.find('a', class_='gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb', string='Next')
        if next_button is None:
            next_button = soup.find(
                lambda tag: tag.name == 'a'
                and tag.has_attr('href')
                and tag.get_text(strip=True) == 'Next'
            )
        next_href = next_button.get('href') if next_button is not None else None

        if next_href:
            # urljoin handles both relative and already-absolute hrefs.
            next_page_gs_url = urljoin("https://scholar.google.com", next_href)
            print(f"  DEBUG: Found Next button. Next page URL: {next_page_gs_url[:100]}...")
            time.sleep(random.uniform(3, 7))
        else:
            next_page_gs_url = None
            print("  DEBUG: No 'Next' button found on current cited-by page. Ending pagination.")

    return citing_articles_data

def scrape_main_scholar_details_via_scraperapi(query):
    """
    Search Google Scholar for ``query`` (a paper title) via ScraperAPI and
    extract details of the first result container.

    Args:
        query: Title to search for. ``None``, NaN and blank values are rejected
            before any request is made; other values are converted with ``str``.

    Returns:
        A dict with the keys "scraped_main_title_from_scholar",
        "scraped_main_authors", "scraped_main_abstract",
        "scraped_main_keywords" and "cited_by_url" ("N/A" when no "Cited by"
        link exists), or ``None`` when the query is unusable, the request
        failed, or no result container was found.
    """
    import math
    from urllib.parse import urljoin

    # Missing titles come through as None or float NaN (e.g. from pandas).
    # Quoting them would raise or search for the literal text "nan".
    is_nan = isinstance(query, float) and math.isnan(query)
    query_str = "" if query is None or is_nan else str(query).strip()
    if not query_str:
        print("  Warning: Empty or missing query received for Google Scholar search.")
        return None

    google_scholar_search_url = f"https://scholar.google.com/scholar?q={requests.utils.quote(query_str)}"

    response = make_scraperapi_request(google_scholar_search_url)

    if response is None:
        print(f"  Failed to get response for main paper '{query_str}'.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first 'gs_ri' container (the top search hit) is inspected.
    container = soup.find('div', class_='gs_ri')

    if container:
        title_element = container.find('h3', class_='gs_rt')
        scraped_title = title_element.text.strip() if title_element else "N/A"

        # The 'gs_a' line is split on ' - ' and only the first part is kept as authors.
        authors_element = container.find('div', class_='gs_a')
        authors = authors_element.text.strip().split(' - ')[0] if authors_element else "N/A"

        abstract_element = container.find('div', class_='gs_rs')
        scraped_abstract = abstract_element.text.strip() if abstract_element else "N/A"

        keywords = "N/A (often not directly available on Google Scholar search results)"

        cited_by_link_element = container.find('a', string=lambda t: t and 'Cited by' in t)

        cited_by_url = "N/A"
        if cited_by_link_element and 'href' in cited_by_link_element.attrs:
            # urljoin handles both relative and already-absolute hrefs.
            cited_by_url = urljoin("https://scholar.google.com", cited_by_link_element['href'])
        else:
            print(f"  DEBUG: 'Cited by' link element not found for '{query_str}'.")
            if scraped_title != "N/A":
                print(f"  DEBUG: Main article '{scraped_title}' found, but 'Cited by' link is missing.")

        return {
            "scraped_main_title_from_scholar": scraped_title,
            "scraped_main_authors": authors,
            "scraped_main_abstract": scraped_abstract,
            "scraped_main_keywords": keywords,
            "cited_by_url": cited_by_url
        }
    else:
        print(f"  No expected search results container ('gs_ri') found for query: '{query_str}' on the page returned by ScraperAPI. This might mean the paper wasn't found or the page structure changed.")
        if soup.find('div', id='gs_captcha_c') or soup.find('div', class_='g-recaptcha'):
            print("  Likely CAPTCHA detected by ScraperAPI. ScraperAPI should handle this, if it persists, check ScraperAPI dashboard.")
        return None

# --- Main Execution Logic ---

def _is_missing(value):
    """Return True when ``value`` is None, NaN, or blank after ``str().strip()``."""
    if value is None:
        return True
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return True
    return not str(value).strip()


def _save_results(records, output_path):
    """Write ``records`` to ``output_path`` as indented UTF-8 JSON; return True on success."""
    try:
        # Create the target directory if it doesn't exist.
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=4)
        return True
    except (OSError, TypeError, ValueError) as e:
        print(f"❌ Error saving data to JSON file: {e}")
        return False


def main(input_path="C:/Users/romeo/AAAI'25/raw_dataset/elsevier.json",
         output_dir=r"C:/Users/romeo/AAAI'25/citation_datasets",
         output_filename="Elsevier_citation_dataset.json",
         max_papers=1000,
         max_citations=50):
    """
    Scrape Google Scholar details and citing articles for papers in a JSON Lines file.

    Args:
        input_path: JSON Lines file that must provide 'title' and 'abstract' columns.
        output_dir: Directory in which the result file is written (created if needed).
        output_filename: Name of the JSON result file inside ``output_dir``.
        max_papers: Number of leading records to process; ``None`` processes all.
        max_citations: Maximum citing articles collected per paper.

    Results are rewritten to disk after every scraped paper, so interrupting a
    long run keeps everything collected up to that point.
    """
    if SCRAPERAPI_API_KEY == "YOUR_SCRAPERAPI_KEY_HERE":
        print("ERROR: Please update SCRAPERAPI_API_KEY with your actual ScraperAPI key.")
        return

    # --- Load the input records ---
    try:
        data = pd.read_json(input_path, lines=True)
        papers_from_file = data[['title', 'abstract']].to_dict(orient='records')
        print(f"Successfully loaded {len(papers_from_file)} papers from '{input_path}' as JSON Lines file.")
    except FileNotFoundError:
        print(f"Error: File not found at '{input_path}'. Please check the file path.")
        return
    except KeyError as e:
        print(f"Error: Column {e} not found in the JSON data. Please ensure 'title' and 'abstract' columns exist (case-sensitive).")
        return
    except ValueError as e:
        # pandas reports malformed JSON as ValueError (json.JSONDecodeError is a subclass).
        print(f"Error decoding JSON from '{input_path}': {e}. Check if the file is correctly formatted JSON.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        return

    papers_to_process = papers_from_file[:max_papers]
    output_path = os.path.join(output_dir, output_filename)

    all_scraped_data = []

    for idx, paper_info in enumerate(papers_to_process):
        raw_title = paper_info.get('title')
        raw_abstract = paper_info.get('abstract')

        print(f"\n--- Processing Paper {idx + 1}/{len(papers_to_process)} from CSV ---")

        # A missing title cannot be searched for; skip it without spending a request.
        if _is_missing(raw_title):
            print(f"  Skipping paper {idx + 1} due to missing or invalid title.")
            continue

        csv_title = str(raw_title).strip()
        # Missing abstracts are stored as "N/A" so the output stays valid JSON (no NaN).
        csv_abstract = "N/A" if _is_missing(raw_abstract) else raw_abstract

        print(f"  CSV Title: {csv_title}")
        print(f"  CSV Abstract: {csv_abstract}")

        scholar_details = scrape_main_scholar_details_via_scraperapi(csv_title)

        if scholar_details:
            combined_paper_data = {
                "original_csv_title": csv_title,
                "original_csv_abstract": csv_abstract,
                "scraped_main_authors": scholar_details['scraped_main_authors'],
                "scraped_main_keywords": scholar_details['scraped_main_keywords'],
                "scraped_main_title_from_scholar": scholar_details['scraped_main_title_from_scholar'],
                "scraped_main_abstract_from_scholar": scholar_details['scraped_main_abstract'],
                "cited_by_url": scholar_details['cited_by_url'],
                "citing_articles": []
            }

            if combined_paper_data['cited_by_url'] != "N/A":
                print(f"  Initiating scrape for cited articles (max {max_citations}) from: {combined_paper_data['cited_by_url']}")
                cited_articles = scrape_cited_by_articles_via_scraperapi(
                    combined_paper_data['cited_by_url'], max_citations=max_citations
                )
                combined_paper_data['citing_articles'] = cited_articles
                print(f"  Collected {len(cited_articles)} citing articles in total for '{csv_title}'.")
                time.sleep(random.uniform(5, 10))
            else:
                print("  No 'Cited by' link found for this main article on Google Scholar.")

            all_scraped_data.append(combined_paper_data)

            # Incremental save: an interrupt or crash later in the run loses nothing.
            if _save_results(all_scraped_data, output_path):
                print(f"  Saved {len(all_scraped_data)} papers so far to: {output_path}")
        else:
            print(f"  Could not find Google Scholar details for '{csv_title}'. Skipping cited articles for this paper.")

        # Politeness delay between papers; pointless after the last one.
        if idx + 1 < len(papers_to_process):
            time.sleep(random.uniform(5, 15))

    # Final save (also covers the case where no paper produced any data).
    if _save_results(all_scraped_data, output_path):
        print(f"\n✅ Scraping complete! All data saved to: {output_path}")


if __name__ == "__main__":
    main()

Successfully loaded 32072 papers from 'C:/Users/romeo/AAAI'25/raw_dataset/elsevier.json' as JSON Lines file.

--- Processing Paper 1/1000 from CSV ---
  CSV Title: Nonlinear non-collinear ultrasonic detection and characterisation of kissing bonds
  CSV Abstract: The development of cost effective and reliable bonded structures ideally requires an NDT method to detect the presence of poor quality, weak bonds or kissing bonds. If these bonds are more compliant in tension than in compression stress-strain nonlinearities provide a possible route to detection with the use of nonlinear ultrasonic techniques. This paper focuses on the kissing bond case and the resulting contact acoustic nonlinearity of the interface. A kissing bond is created by compression loading of two aluminium blocks. Non-collinear mixing of two shear waves producing a sum frequency longitudinal wave is the method of stimulation of contact acoustic nonlinearity in this research. The parametric space of the nonlinear mixin

KeyboardInterrupt: 

In [7]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# import json
# import time
# import random
# import os
# import re

# # --- ScraperAPI Configuration ---
# # Replace with your actual ScraperAPI key (never commit a real key)
# SCRAPERAPI_API_KEY = "YOUR_SCRAPERAPI_KEY_HERE"
# # The base URL for ScraperAPI's proxy endpoint
# SCRAPERAPI_BASE_URL = "http://api.scraperapi.com/"

# # --- Helper Function for Making ScraperAPI Requests ---

# def make_scraperapi_request(target_url, params=None, max_retries=3):
#     """
#     Makes a request to a target URL via ScraperAPI, with retries and exponential backoff.
#     """
#     if not SCRAPERAPI_API_KEY or SCRAPERAPI_API_KEY == "YOUR_SCRAPERAPI_KEY_HERE":
#         print("ERROR: ScraperAPI key is missing or is the placeholder. Please set SCRAPERAPI_API_KEY.")
#         return None

#     scraperapi_params = {
#         "api_key": SCRAPERAPI_API_KEY,
#         "url": target_url,
#         "render": "true"
#     }

#     if params:
#         scraperapi_params.update(params)

#     for attempt in range(max_retries):
#         try:
#             print(f"  Attempt {attempt + 1}/{max_retries} for ScraperAPI URL: {target_url[:100]}...")
#             response = requests.get(SCRAPERAPI_BASE_URL, params=scraperapi_params)
#             response.raise_for_status()
#             return response
#         except requests.exceptions.RequestException as e:
#             print(f"  ScraperAPI request failed for {target_url[:100]}...: {e}")
#             if response is not None:
#                 print(f"  ScraperAPI Response Status: {response.status_code}")
#                 print(f"  ScraperAPI Response Content (first 200 chars): {response.text[:200]}")

#             time.sleep(random.uniform(10 * (attempt + 1), 20 * (attempt + 1)))
#     print(f"Failed to retrieve {target_url} via ScraperAPI after {max_retries} attempts.")
#     return None

# # --- Function to parse author/publication/year string ---
# def parse_author_info(author_string):
#     authors = "N/A"
#     publish_site = "N/A"
#     year = "N/A"

#     if author_string and author_string != "N/A": # Ensure string is not empty or "N/A"
#         parts = author_string.split(' - ')
#         authors = parts[0].strip()

#         if len(parts) > 1:
#             publication_info = parts[1].strip()
#             # Try to extract year from the end of the publication info
#             year_match = re.search(r'(\d{4})$', publication_info)
#             if year_match:
#                 year = year_match.group(1)
#                 # Remove the year from the publication info to get just the site
#                 publish_site = publication_info[:year_match.start()].strip(',').strip()
#             else:
#                 publish_site = publication_info
#     return authors, publish_site, year

# # --- Scraping Functions (Now using ScraperAPI) ---

# def scrape_cited_by_articles_via_scraperapi(cited_by_gs_url, max_citations=50):
#     if not cited_by_gs_url or cited_by_gs_url == "N/A":
#         return []

#     citing_articles_data = []
#     current_citations_count = 0
#     next_page_gs_url = cited_by_gs_url

#     while next_page_gs_url and current_citations_count < max_citations:
#         print(f"  Scraping cited-by page: {next_page_gs_url[:100]}...")
#         response = make_scraperapi_request(next_page_gs_url)

#         if response is None:
#             print(f"  Failed to get response for cited articles from {next_page_gs_url}. Stopping.")
#             break

#         soup = BeautifulSoup(response.text, 'html.parser')

#         cited_article_containers = soup.find_all('div', class_='gs_ri')

#         if not cited_article_containers:
#             print(f"  DEBUG: No article containers ('gs_ri') found on cited-by page: {next_page_gs_url}")
#             # Check for CAPTCHA if no containers are found, even though ScraperAPI should handle it
#             if soup.find('div', id='gs_captcha_c') or soup.find('div', class_='g-recaptcha'):
#                 print("  DEBUG: Possible CAPTCHA page detected on cited-by page.")
#             break # No articles means no more pages or an issue occurred

#         for container in cited_article_containers:
#             if current_citations_count >= max_citations:
#                 break

#             title_element = container.find('h3', class_='gs_rt')
#             title = title_element.text.strip() if title_element else "N/A"

#             # Parse author information
#             authors_element = container.find('div', class_='gs_a')
#             author_string = authors_element.text.strip() if authors_element else "N/A"
#             authors, publish_site, year = parse_author_info(author_string)

#             abstract_element = container.find('div', class_='gs_rs')
#             abstract = abstract_element.text.strip() if abstract_element else "N/A"

#             keywords = "N/A (often not directly available on results page)"

#             citing_articles_data.append({
#                 "title": title,
#                 "authors": authors,
#                 "publish_site": publish_site,
#                 "year": year,
#                 "abstract": abstract,
#                 "keywords": keywords
#             })
#             current_citations_count += 1
#             print(f"    Collected Citing Article {current_citations_count}:")
#             print(f"      Title: {title}")
#             print(f"      Authors: {authors}")
#             print(f"      Publish Site: {publish_site}")
#             print(f"      Year: {year}")
#             print(f"      Abstract: {abstract[:100]}...")
#             print(f"      Keywords: {keywords}")

#         # Find the "Next" page link for pagination
#         next_button = soup.find('a', class_='gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb', string='Next')

#         if next_button:
#             next_page_gs_url = "https://scholar.google.com" + next_button['href']
#             print(f"  DEBUG: Found Next button. Next page URL: {next_page_gs_url[:100]}...")
#             time.sleep(random.uniform(3, 7))
#         else:
#             next_page_gs_url = None
#             print("  DEBUG: No 'Next' button found on current cited-by page. Ending pagination.")

#     return citing_articles_data

# def scrape_main_scholar_details_via_scraperapi(query):
#     """
#     Searches Google Scholar for a given query (title) and extracts details
#     for the top result using ScraperAPI.
#     Returns: A dictionary with scraped details or None if not found/error.
#     """
#     query_str = str(query).strip()
#     if not query_str:
#         print("  Warning: Empty query string received for Google Scholar search.")
#         return None

#     google_scholar_search_url = f"https://scholar.google.com/scholar?q={requests.utils.quote(query_str)}"

#     response = make_scraperapi_request(google_scholar_search_url)

#     if response is None:
#         print(f"  Failed to get response for main paper '{query_str}'.")
#         return None

#     soup = BeautifulSoup(response.text, 'html.parser')

#     container = soup.find('div', class_='gs_ri')

#     if container:
#         title_element = container.find('h3', class_='gs_rt')
#         scraped_title = title_element.text.strip() if title_element else "N/A"

#         authors_element = container.find('div', class_='gs_a')
#         author_string = authors_element.text.strip() if authors_element else "N/A"
#         scraped_main_authors, scraped_main_publish_site, scraped_main_year = parse_author_info(author_string)

#         abstract_element = container.find('div', class_='gs_rs')
#         scraped_abstract = abstract_element.text.strip() if abstract_element else "N/A"

#         keywords = "N/A (often not directly available on Google Scholar search results)"

#         cited_by_link_element = container.find('a', string=lambda t: t and 'Cited by' in t)

#         cited_by_url = "N/A"
#         if cited_by_link_element and 'href' in cited_by_link_element.attrs:
#             cited_by_url = "https://scholar.google.com" + cited_by_link_element['href']
#         else:
#             print(f"  DEBUG: 'Cited by' link element not found for '{query_str}'.")
#             if scraped_title != "N/A":
#                 print(f"  DEBUG: Main article '{scraped_title}' found, but 'Cited by' link is missing.")

#         return {
#             "scraped_main_title_from_scholar": scraped_title,
#             "scraped_main_authors": scraped_main_authors,
#             "scraped_main_publish_site": scraped_main_publish_site,
#             "scraped_main_year": scraped_main_year,
#             "scraped_main_abstract": scraped_abstract,
#             "scraped_main_keywords": keywords,
#             "cited_by_url": cited_by_url
#         }
#     else:
#         print(f"  No expected search results container ('gs_ri') found for query: '{query_str}' on the page returned by ScraperAPI. This might mean the paper wasn't found or the page structure changed.")
#         if soup.find('div', id='gs_captcha_c') or soup.find('div', class_='g-recaptcha'):
#             print("  Likely CAPTCHA detected by ScraperAPI. ScraperAPI should handle this, if it persists, check ScraperAPI dashboard.")
#         return None

# # --- Main Execution Logic ---

# def main():
#     if SCRAPERAPI_API_KEY == "YOUR_SCRAPERAPI_KEY_HERE":
#         print("ERROR: Please update SCRAPERAPI_API_KEY with your actual ScraperAPI key.")
#         return

#     try:
#         csv_path = "C:/Users/romeo/AAAI'25/raw_dataset/elsevier.json" 
#         #data = pd.read_csv(csv_path, usecols=['Title', 'Abstract'])
#         data = pd.read_csv(csv_path, usecols=['title', 'abstract'])
#         papers_from_csv = data.to_dict(orient='records')
#         print(f"Successfully loaded {len(papers_from_csv)} papers from '{csv_path}'.")
#     except FileNotFoundError:
#         print(f"Error: CSV file not found at '{csv_path}'. Please check the file path.")
#         return
#     except KeyError as e:
#         print(f"Error: Column {e} not found in CSV. Please ensure 'Title' and 'Abstract' columns exist and are spelled correctly (case-sensitive).")
#         return
#     except Exception as e:
#         print(f"Error loading CSV file: {e}")
#         return

#     papers_to_process = papers_from_csv[836:1010]
#     print(f"Processing {len(papers_to_process)} papers from the CSV slice (indices 836 to 1009).")

#     all_scraped_data = []

#     output_directory = r"C:/Users/romeo/AAAI'25/citation_datasets"
#     output_filename = "Elsevier_citation_dataset.json"
#     output_filepath = os.path.join(output_directory, output_filename)

#     try:
#         os.makedirs(output_directory, exist_ok=True)
#         print(f"\nEnsured output directory '{output_directory}' exists for saving.")
#     except Exception as e:
#         print(f"Error creating output directory '{output_directory}': {e}")
#         print("Attempting to save to current working directory instead.")
#         output_filepath = output_filename

#     for idx, paper_info in enumerate(papers_to_process):
#         csv_title = str(paper_info.get('Title', 'N/A')).strip()
#         csv_abstract = str(paper_info.get('Abstract', 'N/A')).strip()

#         print(f"\n--- Processing Paper {idx + 1}/{len(papers_to_process)} from CSV Slice ---")
#         print(f"  CSV Title: {csv_title[:100]}...")
#         print(f"  CSV Abstract: {csv_abstract[:100]}...")

#         # --- MODIFIED: Skip if csv_title or csv_abstract is empty/invalid ---
#         if not csv_title or csv_title == 'N/A':
#             print(f"  Skipping paper {idx + 1} due to missing or invalid CSV Title.")
#             continue # Skip to the next paper
#         if not csv_abstract or csv_abstract == 'N/A':
#             print(f"  Skipping paper {idx + 1} due to missing or invalid CSV Abstract.")
#             continue # Skip to the next paper
#         # --- END MODIFIED ---

#         scholar_details = scrape_main_scholar_details_via_scraperapi(csv_title)

#         if scholar_details:
#             combined_paper_data = {
#                 "original_csv_title": csv_title,
#                 "original_csv_abstract": csv_abstract,
#                 "scraped_main_authors": scholar_details['scraped_main_authors'],
#                 "scraped_main_publish_site": scholar_details['scraped_main_publish_site'],
#                 "scraped_main_year": scholar_details['scraped_main_year'],
#                 "scraped_main_keywords": scholar_details['scraped_main_keywords'],
#                 "scraped_main_title_from_scholar": scholar_details['scraped_main_title_from_scholar'],
#                 "scraped_main_abstract_from_scholar": scholar_details['scraped_main_abstract'],
#                 "cited_by_url": scholar_details['cited_by_url'],
#                 "citing_articles": []
#             }

#             if combined_paper_data['cited_by_url'] != "N/A":
#                 print(f"  Initiating scrape for cited articles (max 50) from: {combined_paper_data['cited_by_url']}")
#                 cited_articles = scrape_cited_by_articles_via_scraperapi(combined_paper_data['cited_by_url'], max_citations=50)
#                 combined_paper_data['citing_articles'] = cited_articles
#                 print(f"  Collected {len(cited_articles)} citing articles in total for '{csv_title[:50]}...'.")
#                 time.sleep(random.uniform(5, 10))
#             else:
#                 print("  No 'Cited by' link found for this main article on Google Scholar.")

#             all_scraped_data.append(combined_paper_data)
#         else:
#             print(f"  Could not find Google Scholar details for '{csv_title[:50]}...'. Skipping cited articles for this paper.")

#         # Incremental Save - Save after each paper
#         try:
#             with open(output_filepath, 'w', encoding='utf-8') as f:
#                 json.dump(all_scraped_data, f, ensure_ascii=False, indent=4)
#             print(f"  Successfully saved {len(all_scraped_data)} papers incrementally to '{output_filepath}'")
#         except Exception as e:
#             print(f"  ERROR: Problem saving data incrementally to '{output_filepath}': {e}")

#         time.sleep(random.uniform(5, 15))

#     print(f"\nScraping process complete for all {len(papers_to_process)} papers in the slice.")
#     try:
#         with open(output_filepath, 'w', encoding='utf-8') as f:
#             json.dump(all_scraped_data, f, ensure_ascii=False, indent=4)
#         print(f"\n✅ Final data saved to: {output_filepath}")
#     except Exception as e:
#         print(f"❌ Error saving final data to JSON file: {e}")

# if __name__ == "__main__":
#     main()

Successfully loaded 931817 papers from 'C:/Users/romeo/AAAI'25/raw_dataset/pubmed_2023.csv'.
Processing 174 papers from the CSV slice (indices 836 to 1009).

Ensured output directory 'C:/Users/romeo/AAAI'25/citation_datasets' exists for saving.

--- Processing Paper 1/174 from CSV Slice ---
  CSV Title: nan...
  CSV Abstract: nan...
  Attempt 1/3 for ScraperAPI URL: https://scholar.google.com/scholar?q=nan...
  Initiating scrape for cited articles (max 50) from: https://scholar.google.com/scholar?cites=17666499735183724620&as_sdt=5,39&sciodt=0,39&hl=en
  Scraping cited-by page: https://scholar.google.com/scholar?cites=17666499735183724620&as_sdt=5,39&sciodt=0,39&hl=en...
  Attempt 1/3 for ScraperAPI URL: https://scholar.google.com/scholar?cites=17666499735183724620&as_sdt=5,39&sciodt=0,39&hl=en...
    Collected Citing Article 1:
      Title: NTIRE 2023 challenge on efficient super-resolution: Methods and results
      Authors: Y Li, Y Zhang, R Timofte, L Van Gool… - Proceedings of the 

## Processing raw citation data

In [None]:
import re
import json

# Matches the banner the scraper prints before each paper; the optional
# " Slice" covers the "from CSV Slice" variant of the same banner.
PAPER_BANNER_RE = re.compile(r'--- Processing Paper \d+/\d+ from CSV(?: Slice)? ---')
# The "Title: " prefix is optional so that both "CSV Title: Title: X" and
# "CSV Title: X" yield X.
CSV_TITLE_RE = re.compile(r'CSV Title: (?:Title: )?(.+?)(?:\n|$)')
CSV_ABSTRACT_RE = re.compile(r'CSV Abstract: (.+?)(?:\n|$)')
# \S+ stops at any whitespace; the previous [^ ]+ also swallowed the newline
# that ends the log line, leaving a trailing "\n" on the captured URL.
CITES_URL_RE = re.compile(r'Initiating scrape for cited articles.*from: (https://\S+)')
# DOTALL lets a field (e.g. a multi-line abstract) span several log lines.
# The final alternative also accepts end-of-text, so the last entry of a log
# without a trailing newline is not dropped.
CITING_ARTICLE_RE = re.compile(
    r'Collected Citing Article \d+:.*?\n\s+Title: (.+?)\n\s+Authors: (.+?)\n\s+Abstract: (.+?)\n\s+Keywords: (.+?)(?:\n|$)',
    re.DOTALL
)


def parse_paper_block(block):
    """
    Turn the log text that follows one "Processing Paper" banner into a record.

    Args:
        block: Log text between two consecutive banners.

    Returns:
        A dict with "original_csv_title", "original_csv_abstract" and
        "cited_by_url" (each ``None`` when absent from the block) plus
        "citing_articles", a list of dicts with "title", "authors",
        "abstract" and "keywords".
    """
    title_match = CSV_TITLE_RE.search(block)
    abstract_match = CSV_ABSTRACT_RE.search(block)
    cites_match = CITES_URL_RE.search(block)

    citing = [
        {
            "title": match.group(1).strip(),
            "authors": match.group(2).strip(),
            "abstract": match.group(3).strip(),
            "keywords": match.group(4).strip()
        }
        for match in CITING_ARTICLE_RE.finditer(block)
    ]

    return {
        "original_csv_title": title_match.group(1).strip() if title_match else None,
        "original_csv_abstract": abstract_match.group(1).strip() if abstract_match else None,
        "cited_by_url": cites_match.group(1) if cites_match else None,
        "citing_articles": citing
    }


# Read the raw PubMed scrape log (the captured console output of the scraper).
with open('/content/Pubmed-1.txt', 'r', encoding='utf-8') as f:
    raw = f.read()

# Split into individual paper blocks; the text before the first banner is discarded.
blocks = PAPER_BANNER_RE.split(raw)[1:]

papers = [parse_paper_block(block) for block in blocks]

# Write to JSON file
output_path = 'PubMed_1.json'
with open(output_path, 'w', encoding='utf-8') as out:
    json.dump(papers, out, indent=2)

# Display a preview of the first few entries
import pandas as pd
df_preview = pd.DataFrame([
    {
        "title": p["original_csv_title"],
        "cited_by_url": p["cited_by_url"],
        "num_citing": len(p["citing_articles"])
    }
    for p in papers[:10]
])
df_preview  # last expression of the cell, so Jupyter renders the table

