In [None]:
pip install selenium

In [None]:
pip install requests

In [None]:
import csv
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:

def scrape_google_scholar_author_selenium(url, timeout=10, ready_selector="#gsc_prf_in"):
    """Load ``url`` in headless Chrome and return the page's HTML source.

    Args:
        url: Address of the page to open.
        timeout: Maximum number of seconds to wait for ``ready_selector``.
        ready_selector: CSS selector whose presence signals that the page is
            usable. The default is the element the profile parser reads the
            author name from. Pass ``None`` or ``""`` to skip waiting.

    Returns:
        The HTML of the page as a string (``driver.page_source``).

    The browser is always shut down, even when navigation fails.
    """
    # Configure Chrome to run in headless mode
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        if ready_selector:
            # An implicit wait only affects later find_element calls, so it never
            # delayed reading page_source. Wait explicitly for a known element.
            try:
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ready_selector))
                )
            except TimeoutException:
                # Best effort: hand back whatever was loaded so the caller can
                # still try to parse it, but make the timeout visible.
                print(f"Timed out after {timeout}s waiting for {ready_selector!r} on {url}")

        return driver.page_source
    finally:
        # Close the Chrome browser instance
        driver.quit()


In [None]:

def _stripped_text(node):
    """Return the whitespace-stripped text of a parsed node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


def _select_text(scope, selector):
    """Return the stripped text of the first match for ``selector`` under ``scope`` (None if no match)."""
    return _stripped_text(scope.select_one(selector))


def _stat_pair(soup, row_selector):
    """Read the two value cells of one statistics row selected by ``row_selector``."""
    return {
        'all': _select_text(soup, f"{row_selector} .gsc_rsb_sc1+ .gsc_rsb_std"),
        # NOTE(review): the key says 2018 but the code simply reads the second
        # value column; confirm which period that column covers.
        'since_2018': _select_text(soup, f"{row_selector} .gsc_rsb_std+ .gsc_rsb_std"),
    }


def get_author_profile(html_content):
    """Parse an author profile page into a dictionary.

    Args:
        html_content: HTML source of the profile page.

    Returns:
        A dict with the keys ``name``, ``position``, ``email``, ``departments``,
        ``h_index``, ``i_index``, ``citations`` (each of the last three is a
        dict with ``all`` and ``since_2018``) and ``co_authors`` (a list of
        dicts with ``name``, ``title``, ``link``, ``email``, ``thumbnail``).
        Any value whose element is absent from the page is ``None`` rather
        than aborting the whole extraction. An empty dict is returned only
        when parsing fails unexpectedly.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Basic author information
        author_results = {
            'name': _select_text(soup, "#gsc_prf_in"),
            'position': _select_text(soup, "#gsc_prf_inw+ .gsc_prf_il"),
            'email': _select_text(soup, "#gsc_prf_ivh"),
            'departments': _select_text(soup, "#gsc_prf_int"),
        }

        # Citation statistics: each entry is one row of the statistics table
        author_results.update({
            'h_index': _stat_pair(soup, "tr:nth-child(2)"),
            'i_index': _stat_pair(soup, "tr~ tr+ tr"),
            'citations': _stat_pair(soup, "tr:nth-child(1)"),
        })

        # Co-authors
        co_authors = []
        for card in soup.select('.gsc_rsb_aa'):
            anchor = card.select_one('.gsc_rsb_a_desc a')
            if anchor is None:
                # Without the profile link there is nothing to identify the co-author by.
                continue

            href = anchor.get('href') or ''
            # Capture only the value of the ``user`` parameter, wherever it sits
            # in the query string (a greedy ``.*`` would swallow later parameters).
            user_match = re.search(r'[?&]user=([^&]+)', href)
            thumbnail = (
                f"https://scholar.googleusercontent.com/citations?view_op=view_photo&user={user_match.group(1)}"
                if user_match else None
            )

            co_authors.append({
                'name': _stripped_text(anchor),
                'title': _select_text(card, '.gsc_rsb_a_ext'),
                'link': f"https://scholar.google.com{href}",
                'email': _select_text(card, '.gsc_rsb_a_ext.gsc_rsb_a_ext2'),
                'thumbnail': thumbnail,
            })

        author_results['co_authors'] = co_authors

        return author_results

    except Exception as e:
        print(f"Error extracting author profile: {e}")
        return {}



In [None]:
def scrape_and_save_author_info(author_url, writer_author, writer_coauthors):
    """Scrape one author profile and write it through the two given CSV writers.

    Args:
        author_url: URL of the author's profile page.
        writer_author: Object with ``writerow(dict)`` receiving one row of
            author details (``Name``, ``Position``, ... ``Citations (Since 2018)``).
        writer_coauthors: Object with ``writerow(dict)`` receiving one row that
            carries only the main author's name, followed by one row per co-author.

    Nothing is written when no profile could be extracted (the parser returned
    an empty result); a message is printed instead.
    """
    html_content = scrape_google_scholar_author_selenium(author_url)
    author_profile = get_author_profile(html_content)

    if not author_profile:
        # An empty profile has no 'h_index'/'i_index'/'citations' entries;
        # report and skip rather than fail with a KeyError.
        print(f"No author profile could be extracted from {author_url}; nothing written")
        return

    author_name = author_profile.get('name', '')
    h_index = author_profile.get('h_index') or {}
    i_index = author_profile.get('i_index') or {}
    citations = author_profile.get('citations') or {}

    # Write author information to the author CSV file
    writer_author.writerow({
        'Name': author_name,
        'Position': author_profile.get('position', ''),
        'Email': author_profile.get('email', ''),
        'Departments': author_profile.get('departments', ''),
        'H-Index (All)': h_index.get('all', ''),
        'H-Index (Since 2018)': h_index.get('since_2018', ''),
        'i-Index (All)': i_index.get('all', ''),
        'i-Index (Since 2018)': i_index.get('since_2018', ''),
        'Citations (All)': citations.get('all', ''),
        'Citations (Since 2018)': citations.get('since_2018', '')
    })

    # The co-author section for each author starts with a row holding only the main author's name
    writer_coauthors.writerow({'Main Author': author_name})

    for co_author in author_profile.get('co_authors') or []:
        writer_coauthors.writerow({
            'Main Author': author_name,
            'Name': co_author.get('name', ''),
            'Title': co_author.get('title', ''),
            'Link': co_author.get('link', ''),
            'Email': co_author.get('email', ''),
            'Thumbnail': co_author.get('thumbnail', '')
        })

    print(f"Author information and co-authors information saved for {author_name}")


In [None]:
def extract_articles(soup):
    """Collect the articles listed on a parsed author profile page.

    Args:
        soup: Parsed page offering ``select``; each article row must offer
            ``select_one``.

    Returns:
        A tuple ``(articles, count)``. Each article is a dict with the keys
        ``title``, ``link``, ``authors`` and ``publication``; keys whose value
        is empty are omitted, except that ``link`` is always present. Rows
        without a title link are skipped individually instead of discarding
        the whole list. ``([], 0)`` is returned if the page cannot be
        processed at all.
    """
    try:
        articles = []

        for row in soup.select("#gsc_a_b .gsc_a_t"):
            title_link = row.select_one(".gsc_a_at")
            href = title_link.get('href') if title_link is not None else None
            if not href:
                # Callers follow 'link' to fetch article details, so a row
                # without one is unusable.
                continue

            authors = row.select_one(".gsc_a_at+ .gs_gray")
            publication = row.select_one(".gs_gray+ .gs_gray")

            fields = {
                'title': title_link.text,
                'link': "https://scholar.google.com" + href,
                'authors': authors.text if authors is not None else "",
                'publication': publication.text if publication is not None else "",
            }
            # Drop empty values so each dict only carries what the page provided
            articles.append({key: value for key, value in fields.items() if value})

        return articles, len(articles)

    except Exception as e:
        # Print the exception message for debugging purposes
        print(e)

        # Return an empty list and 0 in case of an error
        return [], 0


In [None]:

def extract_info_from_html(url, timeout=30):
    """Download an article page and extract its labelled details.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive request cannot stall the whole run.

    Returns:
        A dict mapping each label found on the page (from the fixed list
        below) to its stripped value, or ``None`` when the request fails,
        the status code is not 200, or the details table is missing.
    """
    # Specify user-agent in the request headers to mimic a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed fetch
        print(f"Erreur lors de la récupération de la page : {exc}")
        return None

    if response.status_code != 200:
        print(f"Erreur lors de la récupération de la page. Code de statut : {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Container holding the label/value pairs
    profile_element = soup.find('div', {'id': 'gsc_oci_table'})
    if profile_element is None:
        print("Profil non trouvé.")
        return None

    # Labels are matched literally, so the page must be served with these
    # French labels — presumably determined by the URL's language; verify.
    fields_to_extract = ["Auteurs", "Date de publication", "Revue", "Volume", "Numéro", "Pages", "Nombre total de citations"]

    profile = {}
    for field in fields_to_extract:
        # `string=` is the current name of the deprecated `text=` filter
        field_element = profile_element.find('div', {'class': 'gsc_oci_field'}, string=field)
        if field_element is None:
            continue

        # The value sits in the next element carrying the value class
        value_element = field_element.find_next('div', {'class': 'gsc_oci_value'})
        profile[field] = value_element.text.strip() if value_element is not None else ""

    return profile

In [None]:

def extract_articles_info_from_links(article_links):
    """Fetch the details of each article and reduce its citation text to a number.

    Args:
        article_links: Iterable of article page URLs.

    Returns:
        A list with one dict per article whose details could be extracted.
        In each dict, ``'Nombre total de citations'`` holds the first run of
        digits found in the original text, or ``""`` when the field is absent
        or contains no digits. Articles for which extraction returned nothing
        are left out.
    """
    all_articles_data = []

    for link in article_links:
        article_info = extract_info_from_html(link)
        if not article_info:
            continue

        citations = article_info.get('Nombre total de citations') or ''
        # The text may contain no number at all, so check the match before using it
        number = re.search(r'\d+', citations)
        article_info['Nombre total de citations'] = number.group() if number else ""

        all_articles_data.append(article_info)

    return all_articles_data

In [None]:
import pandas as pd

# Workbook listing the authors to process; the scraping cells read each
# profile URL from its 'bio' column.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
excel_file_path = "C:\\Users\\lenovo\\Downloads\\resultat.xlsx"

# Number of leading rows (authors) handed to the scraping cells
MAX_AUTHORS = 38

# Load the Excel file into a pandas DataFrame
df = pd.read_excel(excel_file_path)

# Keep only the first MAX_AUTHORS rows
df1 = df.head(MAX_AUTHORS)

# Display the selected rows
df1


In [None]:
# Export every author in df1 to two CSV files: one row per author in
# author_data.csv, and the author's co-authors in coauthors.csv.
csv_file_path_author = 'author_data.csv'
csv_file_path_coauthors = 'coauthors.csv'

fieldnames_author = ['Name', 'Position', 'Email', 'Departments', 'H-Index (All)', 'H-Index (Since 2018)', 'i-Index (All)', 'i-Index (Since 2018)', 'Citations (All)', 'Citations (Since 2018)']
fieldnames_coauthors = ['Main Author', 'Name', 'Title', 'Link', 'Email', 'Thumbnail']

# Open both files once, before the loop: reopening them in 'w' mode for each
# author truncates the file every time and leaves only the last author.
with open(csv_file_path_author, 'w', newline='', encoding='utf-8') as csvfile, \
        open(csv_file_path_coauthors, 'w', newline='', encoding='utf-8') as csvfile_coauthors:
    writer_author = csv.DictWriter(csvfile, fieldnames=fieldnames_author)
    writer_coauthors = csv.DictWriter(csvfile_coauthors, fieldnames=fieldnames_coauthors)

    # Write the headers
    writer_author.writeheader()
    writer_coauthors.writeheader()

    for index, row in df1.iterrows():
        author_url = row['bio']

        # Empty spreadsheet cells do not come back as usable strings
        if not isinstance(author_url, str) or not author_url.strip():
            print(f"Row {index}: no profile URL in 'bio', skipped")
            continue

        try:
            # Scrape the profile and write the author row plus its co-author rows
            scrape_and_save_author_info(author_url, writer_author, writer_coauthors)
        except Exception as exc:
            # One failing profile must not abort the export of the others
            print(f"Row {index}: failed to process {author_url}: {exc}")

print(f"Author profile saved to {csv_file_path_author}")
print(f"Co-authors information saved to {csv_file_path_coauthors}")


In [None]:
# Append the article details of every author in df1 to articles_info.csv.
csv_file_path_articles = 'articles_info.csv'

# Fixed column order. Deriving the columns from a set for each author gives an
# unpredictable order that can disagree with the header already in the file.
# The labels mirror the fields collected by extract_info_from_html.
fieldnames_articles = ["Main Author", "Auteurs", "Date de publication", "Revue", "Volume", "Numéro", "Pages", "Nombre total de citations"]

with open(csv_file_path_articles, 'a', newline='', encoding='utf-8') as csvfile_articles:
    writer_articles = csv.DictWriter(csvfile_articles, fieldnames=fieldnames_articles)

    # If the file is empty, write the header
    if csvfile_articles.tell() == 0:
        writer_articles.writeheader()

    for index, row in df1.iterrows():
        author_url = row['bio']

        # Empty spreadsheet cells do not come back as usable strings
        if not isinstance(author_url, str) or not author_url.strip():
            print(f"Row {index}: no profile URL in 'bio', skipped")
            continue

        try:
            # Scrape HTML content of the author's profile
            html_content_author = scrape_google_scholar_author_selenium(author_url)
            soup_author = BeautifulSoup(html_content_author, 'html.parser')

            # Extract the name of the main author (None when the element is missing)
            name_element = soup_author.select_one("#gsc_prf_in")
            main_author_name = name_element.text.strip() if name_element is not None else None

            # List the articles shown on the profile page
            articles_info, num_articles = extract_articles(soup_author)
            if num_articles == 0:
                continue

            # Follow each article link to collect its details
            article_links = [article['link'] for article in articles_info if article.get('link')]
            all_articles_data = extract_articles_info_from_links(article_links)

            for article_data in all_articles_data:
                # Add the main author's name to each row
                article_data['Main Author'] = main_author_name
                writer_articles.writerow(article_data)
        except Exception as exc:
            # One failing profile must not abort the export of the others
            print(f"Row {index}: failed to process {author_url}: {exc}")
            continue

        print(f"Articles information appended to {csv_file_path_articles}")
