In [None]:
##SCRAPING CODES

#1.GOOGLE SCHOLAR: 
#This code extract data from HTML pages
#Consider selecting keywords carefully, as the code can break if you abuse it too many times in one day.
#2.PUBMED
#You need install pymed libraries
#3.SCOPUS
##Note: to use scopus you must have acces first.
#You can use my APIKey, otherwise first you have to access Elsevier and create an APIKey.
#consider that in Scopus there is a maximum number of papers that you can register also

In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import quote
from datetime import datetime



# determine key words

search_terms1 = ["roots AND phenology", "belowground AND phenology","roots AND growth",'"roots production"']

# List of keywords to exclude (tittle or abstract)
exclude_keywords = [
    "animals", "human", "diseases", "insects"
]  

# Parameters
pages = 30   # Number of pages to scrape per search term
delay = 30  # Delay in seconds between page requests (try not to go below 20 seconds!)

def google_scholar_screening(search_terms, exclude_keywords, pages=3, delay=10):
    base_url = "https://scholar.google.com/scholar"
    all_results = []

    for term in search_terms: #loop for each keyword
        query = quote(term)
        print(f"Starting search for: {term}")

        for page in range(pages):
            start = page * 10
            url = f"{base_url}?q={query}&start={start}"
            print(f"Fetching: {url}")

            # Simulate browser user-agent to reduce blocking risk
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }

            try:
                response = requests.get(url, headers=headers)
                response.raise_for_status()
                webpage = BeautifulSoup(response.text, "html.parser")
            except requests.RequestException as e:
                print(f"Failed to fetch page: {url}")
                print(e)
                continue

            # Extract article details
            #NOTE: gs_ri is the broadest class in the HTML structure and within it are the other classes (gs_rt, gs_rs, gs_a, etc.)
            
            articles = webpage.select(".gs_ri")  # Contenedor de cada artículo
            if not articles:
                print("No articles found on the page. Check the HTML structure.")
                continue

            for article in articles:
                title_elem = article.select_one(".gs_rt a")
                snippet_elem = article.select_one(".gs_rs")
                pub_info_elem = article.select_one(".gs_a")
                cited_by_container = article.select_one(".gs_fl.gs_flb")  # Contenedor de citas

                # Skip if essential elements are missing
                if not title_elem or not snippet_elem or not pub_info_elem:
                    continue

                # Extract title and link
                title = title_elem.text
                link = title_elem["href"]

                # Extract abstract
                abstract = snippet_elem.text

                # Extract publication info (authors, year, journal, editorial)
                pub_info = pub_info_elem.text

                # We consider the format(pattern) to be "authors - journal, year - publisher"
                pub_info_pattern = re.compile(r"^(.*?)\s*-\s*(.*?),\s*(\d{4})\s*-\s*(.*)$")
                match = pub_info_pattern.match(pub_info)

                #Now we get each part of the pattern

                if match:
                    authors = match.group(1).strip()
                    journal = match.group(2).strip()
                    year = int(match.group(3))
                    editorial = match.group(4).strip()
                else:
                    authors = pub_info.strip()
                    journal = ""
                    year = None
                    editorial = ""

                # Extract citations
                citation_count = 0
                if cited_by_container:
                    cited_by_text = cited_by_container.get_text()
                # We search for the words "Cited by" in this part of the HTML content
                    if "Cited by" in cited_by_text or "Cité" in cited_by_text: #or in French!
                        citation_count = int(re.search(r"\d+", cited_by_text).group(0))

                # Print the citation text to verify
                print(f"Citation text for '{title}': {cited_by_text if cited_by_container else 'No citation found'}")

                # Check if keywords are excluded
                exclude_in_title = any(keyword.lower() in title.lower() for keyword in exclude_keywords)
                exclude_in_abstract = any(keyword.lower() in abstract.lower() for keyword in exclude_keywords)

                # Add to results
                if not (exclude_in_title or exclude_in_abstract):
                    all_results.append({
                        "Title": title,
                        "Link": link,
                        "Abstract": abstract,
                        "Authors": authors,
                        "Journal": journal,
                        "Year": year,
                        "Citations": citation_count,
                        "SearchTerm": term
                    })

            # Pause between requests to avoid detection
            time.sleep(delay)

    
    results = pd.DataFrame(all_results)
    return results

# Fetch results
results1 = google_scholar_screening(search_terms1, exclude_keywords, pages, delay)

# Remove duplicate rows (based on the title)
results = results1.drop_duplicates(subset="Title")

# Save results to a CSV file
results.to_csv("ScreepingGoogleScholarPC_mars.csv", index=False)

# Display the results
print(results)

Starting search for: roots AND phenology
Fetching: https://scholar.google.com/scholar?q=roots%20AND%20phenology&start=0
Failed to fetch page: https://scholar.google.com/scholar?q=roots%20AND%20phenology&start=0
429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fq%3Droots%2520AND%2520phenology%26start%3D0&q=EgSdiEk6GLP4lr4GIiwv2EU2bMOHvC-r0Yyq_tMvnPC2-MWRuKtje5Ncn1boIYdPS1h5YG8IsWw-XDIBcloBQw
Fetching: https://scholar.google.com/scholar?q=roots%20AND%20phenology&start=10
Citation text for 'Plant phenology: a critical controller of soil resource acquisition': Enregistrer Citer Cité 350 fois Autres articles Les 14 versions  
Citation text for 'The mycorrhizal status, root anatomy, and phenology of plants in a sugar maple forest': Enregistrer Citer Cité 312 fois Autres articles Les 8 versions  
Citation text for 'Root phenology unresponsive to earlier snowmelt despite advanced above‐ground phenology in two subarctic

In [57]:
import pandas as pd
from pymed import PubMed
import time

# Configuración de palabras clave y rango de años
keywords = ["roots AND phenology", "fine roots AND growth"]
start_year = 1990
end_year = 2025
max_results = 50  # Número máximo de resultados

# Función para buscar en PubMed
def search_pubmed(keywords, start_year, end_year, max_results=50):
    pubmed = PubMed(tool="PubMedSearcher", email="tu@email.com")  # Reemplaza con tu email
    query = ' AND '.join(keywords)
    results = pubmed.query(query, max_results=max_results)
    papers = []

    for result in results:
        # Obtiene la fecha de publicación
        pub_date = result.publication_date
        if pub_date:
            # Extrae el año de la fecha de publicación
            pub_year = int(pub_date[:4]) if isinstance(pub_date, str) else pub_date.year
        else:
            pub_year = None

        # Filtra por año de publicación
        if pub_year and start_year <= pub_year <= end_year:
            paper = {
                'title': result.title,
                'abstract': result.abstract if result.abstract else "Resumen no disponible",
                'keywords': ', '.join(result.keywords) if result.keywords else ', '.join(keywords),
                'authors': ', '.join([author['lastname'] + ' ' + author['initials'] for author in result.authors]) if result.authors else "Autores no disponibles",
                'journal': result.journal if result.journal else "Revista no disponible",
                'date': pub_year,
                'source': 'PubMed'
            }
            papers.append(paper)
            print(f"Publicación encontrada: {result.title}")  # Imprime cada publicación

    return papers

# Realizar búsqueda en PubMed
try:
    pubmed_papers = search_pubmed(keywords, start_year, end_year, max_results)
except Exception as e:
    print(f"Error durante la búsqueda: {e}")
    pubmed_papers = []

# Mostrar resultados
if pubmed_papers:
    df = pd.DataFrame(pubmed_papers)
    print(df)

    # Guardar la tabla en un archivo CSV
    df.to_csv('pubmed_papers.csv', index=False)
else:
    print("No se encontraron resultados o hubo un error en la búsqueda.")


Publicación encontrada: Above- and belowground phenology responses of subtropical Chinese fir (Cunninghamia lanceolata) to soil warming, precipitation exclusion and their interaction.
Publicación encontrada: Effects of warming on fine root lifespan of forests: A review.
Publicación encontrada: Divergent seasonal responses of above- and below-ground to environmental factors in alpine grassland.
Publicación encontrada: Effects of Soil Water Shortage on Seedling Shoot and Root Growth of Saragolle Lucana Tetraploid Wheat (
Publicación encontrada: Above- and belowground interplay: Canopy CO
Publicación encontrada: Separating the effects of air and soil temperature on silver birch. Part I. Does soil temperature or resource competition determine the timing of root growth?
Publicación encontrada: Eco-archaeological excavation techniques reveal snapshots of subterranean truffle growth.
Publicación encontrada: Responses of root phenology in ecotypes of Eriophorum vaginatum to transplantation and

In [59]:
import pandas as pd
from pymed import PubMed
import time

# Configuración de palabras clave y rango de años
keywords = ["roots AND phenology", "fine roots AND growth"]
start_year = 1990
end_year = 2025
max_results = 50  # Número máximo de resultados

# Función para buscar en PubMed
def search_pubmed(keywords, start_year, end_year, max_results=50):
    pubmed = PubMed(tool="PubMedSearcher", email="tu@email.com")  # Reemplaza con tu email
    query = ' AND '.join(keywords)
    results = pubmed.query(query, max_results=max_results)
    papers = []

    for result in results:
        # Obtiene la fecha de publicación
        pub_date = result.publication_date
        if pub_date:
            # Extrae el año de la fecha de publicación
            pub_year = int(pub_date[:4]) if isinstance(pub_date, str) else pub_date.year
        else:
            pub_year = None

        # Filtra por año de publicación
        if pub_year and start_year <= pub_year <= end_year:
            paper = {
                'Title': result.title,
                'Link': f"https://pubmed.ncbi.nlm.nih.gov/{result.pubmed_id}",  # Enlace al artículo
                'Abstract': result.abstract if result.abstract else "Resumen no disponible",
                'Authors': ', '.join([author['lastname'] + ' ' + author['initials'] for author in result.authors]) if result.authors else "Autores no disponibles",
                'Journal': result.journal if result.journal else "Revista no disponible",
                'Year': pub_year,
                'Citations': '',  # PubMed no proporciona citaciones directamente
                'SearchTerm': query  # Término de búsqueda utilizado
            }
            papers.append(paper)
            print(f"Publicación encontrada: {result.title}")  # Imprime cada publicación

    return papers

# Realizar búsqueda en PubMed
try:
    pubmed_papers = search_pubmed(keywords, start_year, end_year, max_results)
except Exception as e:
    print(f"Error durante la búsqueda: {e}")
    pubmed_papers = []

# Mostrar resultados
if pubmed_papers:
    df = pd.DataFrame(pubmed_papers)
    
    # Reordenar las columnas en el orden solicitado
    df = df[['Title', 'Link', 'Abstract', 'Authors', 'Journal', 'Year', 'Citations', 'SearchTerm']]
    
    print(df)

    # Guardar la tabla en un archivo CSV
    df.to_csv('pubmed_papers2.csv', index=False)
else:
    print("No se encontraron resultados o hubo un error en la búsqueda.")

Publicación encontrada: Above- and belowground phenology responses of subtropical Chinese fir (Cunninghamia lanceolata) to soil warming, precipitation exclusion and their interaction.
Publicación encontrada: Effects of warming on fine root lifespan of forests: A review.
Publicación encontrada: Divergent seasonal responses of above- and below-ground to environmental factors in alpine grassland.
Publicación encontrada: Effects of Soil Water Shortage on Seedling Shoot and Root Growth of Saragolle Lucana Tetraploid Wheat (
Publicación encontrada: Above- and belowground interplay: Canopy CO
Publicación encontrada: Separating the effects of air and soil temperature on silver birch. Part I. Does soil temperature or resource competition determine the timing of root growth?
Publicación encontrada: Eco-archaeological excavation techniques reveal snapshots of subterranean truffle growth.
Publicación encontrada: Responses of root phenology in ecotypes of Eriophorum vaginatum to transplantation and

In [63]:
import pandas as pd
from pymed import PubMed
import time

# Configuración de palabras clave y rango de años
keywords = ["roots AND phenology", "fine roots AND growth"]
start_year = 1990
end_year = 2025
max_results = 50  # Número máximo de resultados

# Función para buscar en PubMed
def search_pubmed(keywords, start_year, end_year, max_results=50):
    pubmed = PubMed(tool="PubMedSearcher", email="tu@email.com")  # Reemplaza con tu email
    query = ' AND '.join(keywords)
    results = pubmed.query(query, max_results=max_results)
    papers = []

    for result in results:
        # Obtiene la fecha de publicación
        pub_date = result.publication_date
        if pub_date:
            # Extrae el año de la fecha de publicación
            pub_year = int(pub_date[:4]) if isinstance(pub_date, str) else pub_date.year
        else:
            pub_year = None

        # Filtra por año de publicación
        if pub_year and start_year <= pub_year <= end_year:
            # Obtener el DOI o el ID de PubMed
            doi = result.doi if hasattr(result, 'doi') and result.doi else None
            link = doi if doi else result.pubmed_id  # Mostrar solo el DOI o el ID de PubMed

            paper = {
                'Title': result.title,
                'Link': link,  # Solo el DOI o el ID de PubMed
                'Abstract': result.abstract if result.abstract else "Resumen no disponible",
                'Authors': ', '.join([author['lastname'] + ' ' + author['initials'] for author in result.authors]) if result.authors else "Autores no disponibles",
                'Journal': result.journal if result.journal else "Revista no disponible",
                'Year': pub_year,
                'Citations': '',  # PubMed no proporciona citaciones directamente
                'SearchTerm': query  # Término de búsqueda utilizado
            }
            papers.append(paper)
            print(f"Publicación encontrada: {result.title}")  # Imprime cada publicación

    return papers

# Realizar búsqueda en PubMed
try:
    pubmed_papers = search_pubmed(keywords, start_year, end_year, max_results)
except Exception as e:
    print(f"Error durante la búsqueda: {e}")
    pubmed_papers = []

# Mostrar resultados
if pubmed_papers:
    df = pd.DataFrame(pubmed_papers)
    
    # Reordenar las columnas en el orden solicitado
    df = df[['Title', 'Link', 'Abstract', 'Authors', 'Journal', 'Year', 'Citations', 'SearchTerm']]
    
    print(df)

    # Guardar la tabla en un archivo CSV
    df.to_csv('pubmed_papers4.csv', index=False)
else:
    print("No se encontraron resultados o hubo un error en la búsqueda.")

Publicación encontrada: Above- and belowground phenology responses of subtropical Chinese fir (Cunninghamia lanceolata) to soil warming, precipitation exclusion and their interaction.
Publicación encontrada: Effects of warming on fine root lifespan of forests: A review.
Publicación encontrada: Divergent seasonal responses of above- and below-ground to environmental factors in alpine grassland.
Publicación encontrada: Effects of Soil Water Shortage on Seedling Shoot and Root Growth of Saragolle Lucana Tetraploid Wheat (
Publicación encontrada: Above- and belowground interplay: Canopy CO
Publicación encontrada: Separating the effects of air and soil temperature on silver birch. Part I. Does soil temperature or resource competition determine the timing of root growth?
Publicación encontrada: Eco-archaeological excavation techniques reveal snapshots of subterranean truffle growth.
Publicación encontrada: Responses of root phenology in ecotypes of Eriophorum vaginatum to transplantation and

In [75]:
import pandas as pd
from pymed import PubMed
import time

# Configuración de palabras clave y rango de años
keywords = ["roots AND phenology", "fine roots AND growth"]
start_year = 1990
end_year = 2025
max_results = 50  # Número máximo de resultados

# Función para buscar en PubMed
def search_pubmed(keywords, start_year, end_year, max_results=50):
    pubmed = PubMed(tool="PubMedSearcher", email="tu@email.com")  # Reemplaza con tu email
    query = ' AND '.join(keywords)
    results = pubmed.query(query, max_results=max_results)
    papers = []

    for result in results:
        # Obtiene la fecha de publicación
        pub_date = result.publication_date
        if pub_date:
            # Extrae el año de la fecha de publicación
            pub_year = int(pub_date[:4]) if isinstance(pub_date, str) else pub_date.year
        else:
            pub_year = None

        # Filtra por año de publicación
        if pub_year and start_year <= pub_year <= end_year:
            # Obtener el DOI (solo el primero si es una lista)
            doi = result.doi if hasattr(result, 'doi') and result.doi else None
            if isinstance(doi, list):
                doi = doi[0]  # Tomar solo el primer DOI

            # Obtener el ID de PubMed (solo el primero si es una lista)
            pubmed_id = result.pubmed_id if hasattr(result, 'pubmed_id') else None
            if isinstance(pubmed_id, list):
                pubmed_id = pubmed_id[0]  # Tomar solo el primer ID

            # Seleccionar el enlace (prioridad al DOI)
            if doi:
                link = f"https://doi.org/{doi}"
            elif pubmed_id:
                link = f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}"
            else:
                link = ""  # Dejar vacío si no hay DOI ni PubMed ID

            paper = {
                'Title': result.title,
                'Link': link,  # Solo un enlace por artículo
                'Abstract': result.abstract if result.abstract else "Resumen no disponible",
                'Authors': ', '.join([author['lastname'] + ' ' + author['initials'] for author in result.authors]) if result.authors else "Autores no disponibles",
                'Journal': result.journal if result.journal else "Revista no disponible",
                'Year': pub_year,
                'Citations': '',  # PubMed no proporciona citaciones directamente
                'SearchTerm': query  # Término de búsqueda utilizado
            }
            papers.append(paper)
            print(f"Publicación encontrada: {result.title}")  # Imprime cada publicación

    return papers

# Realizar búsqueda en PubMed
try:
    pubmed_papers = search_pubmed(keywords, start_year, end_year, max_results)
except Exception as e:
    print(f"Error durante la búsqueda: {e}")
    pubmed_papers = []

# Mostrar resultados
if pubmed_papers:
    df = pd.DataFrame(pubmed_papers)
    
    # Reordenar las columnas en el orden solicitado
    df = df[['Title', 'Link', 'Abstract', 'Authors', 'Journal', 'Year', 'Citations', 'SearchTerm']]
    
    print(df)

    # Guardar la tabla en un archivo CSV
    df.to_csv('pubmed_papers9.csv', index=False)
else:
    print("No se encontraron resultados o hubo un error en la búsqueda.")

Publicación encontrada: Above- and belowground phenology responses of subtropical Chinese fir (Cunninghamia lanceolata) to soil warming, precipitation exclusion and their interaction.
Publicación encontrada: Effects of warming on fine root lifespan of forests: A review.
Publicación encontrada: Divergent seasonal responses of above- and below-ground to environmental factors in alpine grassland.
Publicación encontrada: Effects of Soil Water Shortage on Seedling Shoot and Root Growth of Saragolle Lucana Tetraploid Wheat (
Publicación encontrada: Above- and belowground interplay: Canopy CO
Publicación encontrada: Separating the effects of air and soil temperature on silver birch. Part I. Does soil temperature or resource competition determine the timing of root growth?
Publicación encontrada: Eco-archaeological excavation techniques reveal snapshots of subterranean truffle growth.
Publicación encontrada: Responses of root phenology in ecotypes of Eriophorum vaginatum to transplantation and

In [55]:
import requests
import pandas as pd

# API  Scopus
api_key = '9dd60b7a5ea00b072d913b64dc700aaa'  
base_url = 'https://api.elsevier.com/content/search/scopus'


keywords = ["roots AND phenology", "fine roots AND growth"]
start_year = 1990
end_year = 2025
query = ' AND '.join(keywords)

# Parameters
params = {
    'query': f'TITLE-ABS-KEY({query}) AND PUBYEAR > {start_year} AND PUBYEAR < {end_year}',
    'apiKey': api_key,
    'count': 25,  
    'start': 0     
}

def search_scopus(params, max_results=300):
    papers = []
    while len(papers) < max_results:
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code} - {response.text}")
        
        data = response.json()
        results = data.get('search-results', {}).get('entry', [])
        
        for result in results:
            # 
            doi = result.get('prism:doi', '')
            link = f'https://doi.org/{doi}' if doi else result.get('prism:url', '')  
            paper = {
                'Title': result.get('dc:title', ''),
                'Link': link,  
                'Abstract': result.get('dc:description', ''),  
                'Authors': result.get('dc:creator', ''),
                'Journal': result.get('prism:publicationName', ''),
                'Year': result.get('prism:coverDate', '')[:4],  
                'Citations': result.get('citedby-count', 0),  
                'SearchTerm': query  
            }
            papers.append(paper)
        
        
        if len(results) < params['count']:
            break
        
        
        params['start'] += params['count']
    
    return papers[:max_results]  #limits


scopus_papers = search_scopus(params, max_results=200)


df_scopus = pd.DataFrame(scopus_papers)

df_scopus = df_scopus[['Title', 'Link', 'Abstract', 'Authors', 'Journal', 'Year', 'Citations', 'SearchTerm']]


print(df_scopus)

# Save results
df_scopus.to_csv('scopus_papers_4.csv', index=False)

                                                 Title  \
0    In situ soil imaging, a tool for monitoring th...   
1    Above- and belowground phenology responses of ...   
2    Monitoring Grassland Variation in a Typical Ar...   
3    A systematic review of studies on fine and coa...   
4    Effects of warming on fine root phenology of f...   
..                                                 ...   
117  The effect of nitrogen fertilization on the ph...   
118  Root proliferation characteristics of seven pe...   
119  Fine root growth phenology, production, and tu...   
120  Environmental constraints on the structure and...   
121  The Dutch Acidification Systems (DAS) model: T...   

                                                  Link Abstract  \
0           https://doi.org/10.1007/s00374-024-01851-8            
1      https://doi.org/10.1016/j.scitotenv.2024.173147            
2                   https://doi.org/10.3390/rs16071222            
3            https://doi.org/10.338