# ScholarshipScout

This notebook implements the ScholarshipScout project, a multi-agent, AI-based web scraper that searches popular academic job sites for fully funded doctoral positions.

## Step 1: Install Dependencies

First, let's install the required packages.


In [None]:
!pip install rich http3 httpx pandas docopt openpyxl brotlipy bs4 urllib3
!pip install --upgrade httpx httpcore
!pip install nest_asyncio
!pip install python-dotenv

## Step 2: Import Libraries and Define Constants

Now, let's import the necessary libraries and define some constants.


In [None]:

import re
import sys
import pandas as pd
import asyncio
import rich
from rich.console import Console
from datetime import date
from dataclasses import dataclass
from bs4 import BeautifulSoup as bs
from pathlib import Path
import json
import httpx

console = Console()

__version__ = "0.4.7"
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


## Step 3: Define Config Class

This class holds configuration details for different repositories.


In [None]:
@dataclass
class Config:
    """Scraping recipe (URL template and element selectors) for one repository.

    ``repo`` selects an entry of the class-level ``config`` table; the
    properties below read that entry lazily, so an unknown ``repo`` raises
    ``KeyError`` on first property access rather than at construction time.

    Args:
        repo: Key into ``config`` (one of the names listed by ``repos()``).
    """

    # Deliberately left unannotated so it stays a plain class attribute shared
    # by all instances instead of becoming a dataclass field.
    # 'sought#' locates the element holding the total number of results;
    # 'query' is a URL template with {page} and {fields} placeholders;
    # the remaining keys locate the per-listing elements.
    config = {
        'scholarshipdb': {
            'sought#': 'h1.title',
            'query': 'https://scholarshipdb.net/scholarships/Program-PhD?page={page}&q={fields}',
            'title': 'h4 a',
            'country': '.list-unstyled a.text-success',
            'date': '.list-unstyled span.text-muted',
            'link': ".list-unstyled h4 a",
        },
        'findaphd': {
            'sought#': 'h4.course-count.d-none.d-md-block.h6.mb-0.mt-1',
            'query': 'https://www.findaphd.com/phds/non-eu-students/?01w0&Keywords={fields}&PG={page}',
            'title': "h4 text-dark mx-0 mb-3",
            'country': "country-flag img-responsive phd-result__dept-inst--country-icon",
            'date': "apply py-2 small",
            'link': "h4 text-dark mx-0 mb-3",
        },
    }

    # The only dataclass field: it drives the generated __init__, __repr__ and
    # __eq__, so two Config objects are equal only when they target the same
    # repository.
    repo: str = 'scholarshipdb'

    @classmethod
    def repos(cls):
        """Return the known repository names as one comma-separated string."""
        return ','.join(cls.config)

    def _setting(self, key):
        """Return the ``key`` entry of this repository's recipe."""
        return Config.config[self.repo][key]

    @property
    def query(self):
        """URL template with ``{page}`` and ``{fields}`` placeholders."""
        return self._setting('query')

    @property
    def sought(self):
        """Selector of the element that holds the total result count."""
        return self._setting('sought#')

    @property
    def title(self):
        """Selector (or class string) of a listing's title element."""
        return self._setting('title')

    @property
    def link(self):
        """Selector (or class string) of a listing's link element."""
        return self._setting('link')

    @property
    def country(self):
        """Selector (or class string) of a listing's country element."""
        return self._setting('country')

    @property
    def date(self):
        """Selector (or class string) of a listing's date element."""
        return self._setting('date')

    @property
    def baseURL(self):
        """Scheme and host of the query URL, e.g. ``https://scholarshipdb.net``."""
        # Shortest prefix that ends right before the first '/' or '?' that is
        # not part of the '://' separator (or at end of string).
        return re.search(r'^.+?[^\/:](?=[?\/]|$)', self._setting('query')).group()


## Step 4: Define PhDSeeker Class

This is the main class that handles the scraping and data processing.


In [None]:
class PhDSeeker:
    """Collect PhD listings from the configured repositories and export them.

    Listings are fetched page by page (``prepare``), optionally filtered by
    country, accumulated in the parallel lists ``titles`` / ``countries`` /
    ``dates`` / ``links`` and exposed as a sorted DataFrame via ``positions``.
    """

    # Minutes per time unit; used to turn a relative age such as "3 days ago"
    # into a number that can be sorted.
    _UNIT_MINUTES = {'minute': 1, 'hour': 60, 'day': 1440,
                     'week': 10080, 'month': 43200, 'year': 525600}
    # "<count> <unit>[s]", where count is digits or the article "a"/"an".
    _AGE_PATTERN = re.compile(
        r'\b(\d+|an?)\s*(minute|hour|day|week|month|year)s?\b', re.IGNORECASE)

    def __init__(self, keywords: str, repos: str = 'scholarshipdb, findaphd', maxpage: int = 10, desired_countries: str = None):
        """Prepare a search.

        Args:
            keywords: Comma-separated search terms.
            repos: Comma-separated repository names (keys of ``Config.config``).
            maxpage: Number of result pages to request per repository.
            desired_countries: Optional comma-separated country names; when
                given, only listings whose country contains one of them
                (case-insensitive) are kept.
        """
        self.repos = list(map(str.strip, repos.split(',')))
        self.keywords = keywords
        # Each keyword is wrapped in double quotes, its spaces become %20, and
        # the quoted keywords are joined with %20.
        self.fields = '%20'.join([f"\"{item.replace(' ', '%20')}\"" for item in map(str.strip, keywords.split(','))])
        # Empty entries (e.g. from a trailing comma) are dropped: an empty
        # string is a substring of every country and would disable the filter.
        self.desired_countries = (
            [name for name in (part.strip().lower() for part in desired_countries.split(",")) if name]
            if desired_countries else []
        )
        self.titles = []
        self.countries = []
        self.dates = []
        self.links = []
        self.maxpage = maxpage
        self.file_name = f"PhD_Positions_{date.today()}[{keywords}]"
        self.df = None
        self.sought_number = 0
        self.found_number = 0

    @classmethod
    def _age_in_minutes(cls, text):
        """Convert a relative age like ``'about 3 hours ago'`` to minutes.

        Returns NaN when no "<count> <unit>" pair is found (absolute dates,
        empty strings, ...), so such rows sort last instead of raising.
        """
        match = cls._AGE_PATTERN.search(text or '')
        if match is None:
            return float('nan')
        amount, unit = match.groups()
        count = int(amount) if amount.isdigit() else 1  # "a day" / "an hour"
        return float(count * cls._UNIT_MINUTES[unit.lower()])

    async def __get_page__(self, repo, page):
        """Fetch one result page of ``repo`` and append its matching listings.

        Returns:
            True when the page was processed, False when any error occurred
            (the error is printed, not raised, so other pages keep going).
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }
        c = Config(repo)
        # scholarshipdb entries are CSS selectors; the other repository's
        # entries are class strings matched with find_all(class_=...).
        uses_css = repo == 'scholarshipdb'
        try:
            query = c.query.format(fields=self.fields, page=page)
            async with httpx.AsyncClient() as client:
                response = await client.get(query, headers=headers, follow_redirects=True)
            soup = bs(response.text, "html.parser")
            if page == 1:
                # Default to 0 so a missing or unparsable counter no longer
                # aborts the whole page with an unbound-variable error.
                foundPositions = 0
                if (n := soup.select_one(c.sought)) is not None:
                    counter = re.search(r'\d[\d,]*', n.text)
                    if counter is not None:
                        foundPositions = int(counter.group().replace(',', ''))
                    self.sought_number += foundPositions
                print(f"\r>> {foundPositions} positions found <<".center(console.width))

            def pick(selector):
                return soup.select(selector) if uses_css else soup.find_all(class_=selector)

            titles = pick(c.title)
            countries = pick(c.country)
            dates = pick(c.date)
            links = pick(c.link)

            # NOTE: zip() stops at the shortest list, so listings are paired
            # purely by position in the four result sets.
            for title, country, posted, link in zip(titles, countries, dates, links):
                found_country = country.text if uses_css else country.get('title', '')
                save_position = not self.desired_countries or any(dc in found_country.lower() for dc in self.desired_countries)
                if save_position:
                    self.countries.append(found_country)
                    self.titles.append(title.text.strip())
                    self.dates.append(posted.text.replace('\n', '').strip())
                    # urljoin handles both site-relative and absolute hrefs.
                    self.links.append(urljoin(c.baseURL, link['href']))
                    self.found_number += 1
            print(f"\rPage {page} has been fetched from {c.baseURL}! Found {len(titles)} positions, saved {self.found_number}.")
            return True
        except Exception as e:
            # Best-effort scraping: report the failure and let other pages run.
            print(f"Error in {repo} page {page}: {e}")
            return False

    async def prepare(self):
        """Fetch pages 1..maxpage of every repository, concurrently per repository."""
        for repo in self.repos:
            print(f"\r{('::[ '+repo+' ]::').center(console.width, '=')}")
            tasks = [asyncio.create_task(self.__get_page__(repo, page)) for page in range(1, self.maxpage+1)]
            await asyncio.gather(*tasks)
        print(f"Total positions found: {self.sought_number}")
        print(f"Positions saved after filtering: {self.found_number}")

    @property
    def positions(self):
        """Return the collected listings as a DataFrame (built once, then cached).

        Columns are Country, Date, Title, Link. Rows are sorted by country,
        then by age (newest first; unparsable dates last), then by title.
        """
        if self.df is None:
            if self.titles:
                frame = pd.DataFrame({
                    "Country": self.countries,
                    "Date": self.dates,
                    "Title": self.titles,
                    "Link": self.links,
                })
                # The age is parsed with a fixed pattern instead of eval():
                # the text comes from scraped pages and must never be
                # executed, and unexpected formats must not raise.
                frame['timedelta'] = frame["Date"].apply(self._age_in_minutes)
                frame = frame.sort_values(by=['Country', 'timedelta', 'Title'])
                self.df = frame.drop('timedelta', axis=1).reset_index(drop=True)
            else:
                print("No data was scraped. Creating an empty DataFrame.")
                self.df = pd.DataFrame(columns=["Country", "Date", "Title", "Link"])
        return self.df

    def save(self, output='both'):
        """Write the listings to ``<file_name>.csv`` and/or ``<file_name>.xlsx``.

        Args:
            output: 'csv', 'xlsx' or 'both'.
        """
        # rich is already a dependency of this file; imported locally so the
        # block stays self-contained.
        from rich.markup import escape

        df = self.positions
        if len(df) > 0:
            s = 's' if output == 'both' else ''
            print(f"\r{console.width*' '}\n>>>> {self.sought_number} positions have been found in total.",
            f"Specifically, {len(df)} records of them have been saved in the following file{s}:" , sep='\n')
            # The file name contains "[keywords]"; escape it so rich prints it
            # literally instead of treating it as a markup tag.
            shown_name = escape(self.file_name)
            if output in ('csv', 'both'):
                df.to_csv(f'{self.file_name}.csv', index=False)
                rich.print(f'[blue]{shown_name}.csv saved![/blue]')
            if output in ('xlsx', 'both'):
                df.to_excel(f'{self.file_name}.xlsx', index=False)
                rich.print(f'[blue]{shown_name}.xlsx saved![/blue]')
        else:
            rich.print('[red blink] >>> No positions found, change your keyword or countries. <<< [/red blink]')


## Step 5: Main Function & Export Links as JSON


In [None]:
import nest_asyncio
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop, and run_until_complete() below would otherwise refuse to nest.
nest_asyncio.apply()

async def main(keywords, maxpage=10, countries=None, output='both'):
    """Run one complete search and return the resulting DataFrame.

    Builds a PhDSeeker for ``keywords``, scrapes up to ``maxpage`` pages per
    repository, reports how many entries each result list holds, saves the
    results in the requested ``output`` format and returns ``positions``.
    """
    print(f"Searching for keywords: {keywords}")
    print(f"Desired countries: {countries}")

    seeker = PhDSeeker(keywords, maxpage=maxpage, desired_countries=countries)
    await seeker.prepare()

    # The four lists are filled in lockstep, so these counts should agree.
    title_count = len(seeker.titles)
    country_count = len(seeker.countries)
    date_count = len(seeker.dates)
    link_count = len(seeker.links)
    print(f"Titles found: {title_count}")
    print(f"Countries found: {country_count}")
    print(f"Dates found: {date_count}")
    print(f"Links found: {link_count}")

    seeker.save(output)
    return seeker.positions

# Example usage: search parameters for a small demo run.
keywords = 'Computer Science'  # comma-separated search terms
maxpage = 2  # result pages requested per repository
countries = 'United States, Belgium, Netherlands, Germany, Switzerland'  # comma-separated country filter
output = 'both'  # write both the CSV and the XLSX file

# Drive the async main() to completion from this synchronous cell.
# `df` is reused by the next cell to export the links.
df = asyncio.get_event_loop().run_until_complete(main(keywords, maxpage, countries, output))
print(f"DataFrame shape: {df.shape}")
print(f"DataFrame columns: {df.columns}")
print(f"First few rows:")
# `display` is not defined in this file; presumably the IPython/Jupyter builtin.
display(df.head())


In [None]:
# Collect every scraped link from the DataFrame produced by the previous cell.
links = df['Link'].tolist()

# Wrap the list under a "links" key; count_links() below reads this key.
links_dict = {"links": links}

# Overwrites PhD_Programs_Links.json in the current working directory.
with open('PhD_Programs_Links.json', 'w') as f:
    json.dump(links_dict, f, indent=2)

print("The links have been saved to PhD_Programs_Links.json")


## Step 6: Set Up the AutoGen Environment and Define Utility Functions

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from typing import List, Dict, Union
import autogen

# Utility functions
def count_links(json_file: str) -> Dict[str, Union[int, List[str]]]:
    """Load ``json_file`` and report the links stored under its "links" key.

    Returns:
        A dict with "count" (number of links) and "links" (the list itself).
    """
    with open(json_file, 'r') as handle:
        stored_links = json.load(handle)['links']
    return {"count": len(stored_links), "links": stored_links}


def fetch_job_description(job_link: str) -> str:
    """Download ``job_link`` and return the page's visible text.

    Args:
        job_link: URL of the listing page.

    Returns:
        All text of the parsed HTML document (markup stripped).

    Raises:
        ValueError: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection problems or when
            the server does not respond within the timeout.
    """
    # requests waits indefinitely unless a timeout is given; without one a
    # single stalled server would hang the whole agent run.
    response = requests.get(job_link, timeout=30)
    if response.status_code != 200:
        raise ValueError(f"Failed to fetch the job description. HTTP status code: {response.status_code}")
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()

def extract_info(job_description: str) -> Dict[str, str]:
    """Return the record template for one PhD listing.

    Placeholder: ``job_description`` is not inspected yet. The intended
    implementation would use GPT to fill the fields; for now every field is
    returned as an empty string.
    """
    field_names = (
        "University Name",
        "Program Name",
        "International Students' Deadline for Applying",
        "Start Date",
        "Country",
        "Name of Supervisor",
        "Funding",
        "Link to Program's Page",
        "Short Description of Project",
    )
    return dict.fromkeys(field_names, "")

def write_to_csv(data: List[Dict[str, str]], filename: str) -> None:
    """Write ``data`` (one dict per row) to ``filename`` as CSV without an index column."""
    pd.DataFrame(data).to_csv(filename, index=False)


## Step 7: Define Agents:

In [None]:
from dotenv import load_dotenv
import os
# Load variables from a .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv('API_KEY')
import autogen

# Access the API key
# NOTE(review): duplicates the lookup a few lines above; os.getenv returns
# None when API_KEY is unset, and nothing here checks for that.
api_key = os.getenv('API_KEY')

# Model endpoints available to the agents (a single entry here).
config_list = [
    {
        "model": "gpt-4",
        "api_key": api_key
    }
]
# LLM settings shared by every agent and by the group chat manager below.
llm_config = {
    "cache_seed": 42,
    "temperature": 0,
    "config_list": config_list,
    "timeout": 120,
}

def termination_msg(x):
    """Return True when message ``x`` ends with the word TERMINATE.

    ``x`` must be a dict; its "content" value is converted to text, trailing
    whitespace is ignored and the comparison is case-insensitive. Anything
    else (non-dict, missing or ``None`` content) yields False.
    """
    if not isinstance(x, dict):
        return False
    # rstrip(): model replies often end with a newline or spaces, which would
    # otherwise hide the keyword from a plain "last nine characters" check.
    content = str(x.get("content", "")).rstrip()
    return content.upper().endswith("TERMINATE")

# User Proxy Agent
# Never asks a human for input; stops after 10 consecutive automatic replies or
# when termination_msg() matches. It is also the executor for every function
# registered below.
user_proxy = autogen.UserProxyAgent(
    name="UserProxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    is_termination_msg=termination_msg,
    code_execution_config={"work_dir": "phd_scraper"},
    llm_config=llm_config,
)

# Link Counter Agent
# Caller of count_links (see the registrations below).
link_counter = autogen.AssistantAgent(
    name="LinkCounter",
    llm_config=llm_config,
    system_message="You are responsible for counting and retrieving the links to be processed. After counting, you should pass the links to the WebScraper for processing."
)

# Web Scraper Agent
# Caller of fetch_job_description and extract_info.
web_scraper = autogen.AssistantAgent(
    name="WebScraper",
    llm_config=llm_config,
    system_message="You are responsible for visiting each link, extracting the required information, and passing the compiled information to the CSVWriter."
)


# CSV Writer Agent
# Caller of write_to_csv.
csv_writer = autogen.AssistantAgent(
    name="CSVWriter",
    llm_config=llm_config,
    system_message="You are responsible for writing the extracted information to a CSV file."
)

# Register functions
# Each utility is registered with the assistant agent that may call it
# (`caller`) and with user_proxy as the agent that runs it (`executor`).

# count_links -> LinkCounter
autogen.register_function(
    count_links,
    caller=link_counter,
    executor=user_proxy,
    name="count_links",
    description="Count the number of links in a JSON file.",
)

# fetch_job_description -> WebScraper
autogen.register_function(
    fetch_job_description,
    caller=web_scraper,
    executor=user_proxy,
    name="fetch_job_description",
    description="Fetch job description from a given URL.",
)

# extract_info -> WebScraper
autogen.register_function(
    extract_info,
    caller=web_scraper,
    executor=user_proxy,
    name="extract_info",
    description="Extract PhD program information from job description.",
)

# write_to_csv -> CSVWriter
autogen.register_function(
    write_to_csv,
    caller=csv_writer,
    executor=user_proxy,
    name="write_to_csv",
    description="Write extracted information to a CSV file.",
)


## Step 8: Initiate group chat:

In [None]:
# One shared conversation between the four agents, starting with an empty
# history and capped at 15 rounds; the next speaker is chosen with the "auto"
# selection method.
group_chat = autogen.GroupChat(
    agents=[user_proxy, link_counter, web_scraper, csv_writer],
    messages=[],
    speaker_selection_method="auto",
    max_round=15
)

# The manager runs the group chat and uses the same LLM settings as the agents.
manager = autogen.GroupChatManager(groupchat=group_chat, llm_config=llm_config)


## Step 9: Initiate the process

In [None]:
def process_phd_links(json_file: str) -> None:
    """Start the group chat that processes every link stored in ``json_file``.

    The opening message tells the agents to count the links, fetch and extract
    each listing, write the results to CSV, and finish with 'TERMINATE'.
    """
    kickoff_prompt = f"""
    Please process the PhD program links in the file: {json_file}
    1. Use the count_links function to get the total number of links and the actual links
    2. For each link returned by count_links:
       a. Use the fetch_job_description function to get the job description
       b. Use the extract_info function to extract the required information
    3. After processing all links, use the write_to_csv function to save the information
    Once all links are processed, reply with 'TERMINATE'.
    """
    # user_proxy opens the conversation; the manager routes it between agents.
    user_proxy.initiate_chat(manager, message=kickoff_prompt)

# Run the process on the links file written by the export cell in Step 5.
process_phd_links("PhD_Programs_Links.json")
