In [1]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed
import re
from tqdm import tqdm

## Get the list of all teachers

In [2]:
url = "https://www.sjsu.edu/people/"
# Fetch the directory page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the notebook here instead of
# silently parsing an HTTP error page into an empty link list.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Regular expression pattern to match href="/people/..."
pattern = re.compile(r'^/people/.*')

# Collect the href of every <a> tag whose href matches the pattern, in
# document order (duplicates are kept).
people_hrefs = [a_tag['href'] for a_tag in soup.find_all('a', href=pattern)]

In [3]:
# Number of matching links collected from the directory page.
len(people_hrefs)

1435

In [4]:
# Show hrefs with fewer than three slashes, i.e. those that split into fewer
# than four '/'-separated parts (no trailing '/index.html' segment).
[href for href in people_hrefs if href.count('/') < 3]

['/people/',
 '/people/Fatemeh.Davoudi',
 '/people/denise.dawkins',
 '/people/aharon.degrassi',
 '/people/jane.dodge',
 '/people/david.dodick',
 '/people/isarin.durongkadej',
 '/people/xiaojia.hou',
 '/people/gaojian.huang',
 '/people/yinghua.huang',
 '/people/adam.svec',
 '/people/gabrielajohn.swamy',
 '/people/neil.switz',
 '/people/wendy.thompsontaiwo',
 '/people/chingching.tan',
 '/people/ravindra.thadani',
 '/people/megan.thiele',
 '/people/babu.thomas',
 '/people/glenn.thomas']

In [5]:
# Drop the bare directory link, which does not point at a person page.
# Filtering (rather than list.remove) keeps this cell safe to re-run: remove()
# raises ValueError once the entry is gone and only removes the first match.
people_hrefs = [href for href in people_hrefs if href != '/people/']

In [6]:
@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def get_first_teacher_result(text, school_id="U2Nob29sLTg4MQ==", timeout=30):
    """Return the first RateMyProfessors teacher-search hit for ``text``.

    Sends a single GraphQL ``newSearch.teachers`` query restricted to one
    school and asks for only the first edge.

    Args:
        text: Free-text search string (typically "first last").
        school_id: School identifier sent as ``schoolID``. The default is the
            value this notebook has always used (base64 of "School-881";
            presumably San Jose State University -- confirm against the
            ``schoolName`` returned).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A dict with keys ``legacyId``, ``firstName``, ``lastName``,
        ``avgRating``, ``numRatings`` and ``schoolName``, or ``None`` when the
        search returned no teachers (including when the response carries null
        ``data``). ``schoolName`` is ``None`` if the hit has no school.

    Raises:
        Any exception from the request (including ``requests.HTTPError`` for
        4xx/5xx responses) causes another attempt: the decorator retries up
        to 5 attempts, 2 seconds apart, after which tenacity raises.
    """
    url = 'https://www.ratemyprofessors.com/graphql'
    headers = {
        'Content-Type': 'application/json',
        # Base64 of "test:test"; a fixed value, not a personal credential.
        'Authorization': 'Basic dGVzdDp0ZXN0',
    }

    query = """
    query NewSearchTeachersQuery($query: TeacherSearchQuery!, $count: Int) {
      newSearch {
        teachers(query: $query, first: $count) {
          edges {
            node {
              legacyId
              firstName
              lastName
              avgRating
              numRatings
              school {
                legacyId
                name
              }
            }
          }
        }
      }
    }
    """

    variables = {
        "query": {
            "text": text,
            "schoolID": school_id
        },
        "count": 1  # We're only interested in the first result
    }

    response = requests.post(
        url,
        headers=headers,
        json={"query": query, "variables": variables},
        timeout=timeout,
    )

    # 4xx/5xx responses raise here, which triggers the retry decorator.
    response.raise_for_status()
    if response.status_code != 200:
        # Non-error, non-200 status: nothing usable to parse.
        return None

    # Walk data -> newSearch -> teachers defensively: GraphQL may return an
    # explicit null at any level, which dict.get(key, {}) does not guard
    # against (the default only applies when the key is absent).
    node = response.json()
    for key in ('data', 'newSearch', 'teachers'):
        node = (node or {}).get(key)
    edges = (node or {}).get('edges') or []
    if not edges:
        return None

    teacher = edges[0]['node']
    school = teacher.get('school') or {}
    return {
        'legacyId': teacher['legacyId'],
        'firstName': teacher['firstName'],
        'lastName': teacher['lastName'],
        'avgRating': teacher['avgRating'],
        'numRatings': teacher['numRatings'],
        'schoolName': school.get('name')
    }

In [7]:
# Peek at one collected href to see its shape before parsing names out of it.
people_hrefs[0]

'/people/deborah.abbott/index.html'

In [25]:
# Look up every directory entry on RateMyProfessors and split the names into
# accepted matches (ok_teachers: name -> result dict) and everything else
# (err_teachers: no result, wrong school, or name mismatch).
ok_teachers = {}
err_teachers = []
for href in tqdm(people_hrefs):
    # '/people/first.last/index.html' -> 'first last'
    name = href.split('/')[2].replace('.', ' ')
    # Directory slugs are not always lowercase (e.g. '/people/Fatemeh.Davoudi'),
    # so compare against a lowercased copy; otherwise a correct hit is rejected
    # purely because of letter case.
    name_lower = name.lower()
    response = get_first_teacher_result(name)
    if (response
        and response['schoolName'] == 'San Jose State University'
        # Require non-empty names: a null value would crash on .lower() and an
        # empty string is a substring of everything.
        and response['firstName']
        and response['lastName']
        and response['firstName'].lower() in name_lower
        and response['lastName'].lower() in name_lower
    ):
        ok_teachers[name] = response
    else:
        err_teachers.append(name)

100%|██████████| 1434/1434 [08:03<00:00,  2.96it/s]


In [29]:
# Names that had no search result or failed the school/name match check.
err_teachers

['katherine abriam-yago',
 'juana acrivos',
 'richard adams',
 'shashi agarwal',
 'craig alimo',
 'david anastasiu',
 'david andersen',
 'david anderson',
 'mary anderson',
 'gale antokal',
 'yolanda anyon',
 'john avila',
 'hee bae',
 'cecelia baldwin',
 'thomas balgooyen',
 'anu basu',
 'karen bawel',
 'thomas beggs',
 'kassahun betre',
 'sheila bienenfeld',
 'martin billik',
 'kathryn blackmerreyes',
 'jacky bloom',
 'erica boas',
 'carolus boekema',
 'ruxandra boul',
 'minan boyd',
 'nakiye boyacigiller',
 'noelle brada-williams',
 'mary brannen',
 'stephen branz',
 'matthew breaux',
 'alison bridger',
 'david bruck',
 'luann budd',
 'cathy buell',
 'anji buckner',
 'theodore butryn',
 'alice butzlaff',
 'luis camilli',
 'akilah carter-francique',
 'jessica castillo-vardaro',
 'bem cayco',
 'diana centeno',
 'david chai',
 'emily chan',
 'helene chan',
 'prabha chandrasekar',
 'natalie chia',
 'sen chiao',
 'jang-hyung cho',
 'soo choi',
 'richard chung',
 'edward cohen',
 'fred co

In [30]:
# Accepted matches, keyed by the name derived from the directory URL.
ok_teachers

{'deborah abbott': {'legacyId': 879795,
  'firstName': 'Deborah',
  'lastName': 'Abbott',
  'avgRating': 4.8,
  'numRatings': 78,
  'schoolName': 'San Jose State University'},
 'jeremy abrams': {'legacyId': 2196416,
  'firstName': 'Jeremy',
  'lastName': 'Abrams',
  'avgRating': 4.3,
  'numRatings': 35,
  'schoolName': 'San Jose State University'},
 'joel abrams': {'legacyId': 1728996,
  'firstName': 'Joel',
  'lastName': 'Abrams',
  'avgRating': 4.8,
  'numRatings': 13,
  'schoolName': 'San Jose State University'},
 'tzvia abramson': {'legacyId': 2157796,
  'firstName': 'Tzvia',
  'lastName': 'Abramson',
  'avgRating': 2.4,
  'numRatings': 10,
  'schoolName': 'San Jose State University'},
 'sina aboutorabi': {'legacyId': 2861622,
  'firstName': 'Sina',
  'lastName': 'Aboutorabi',
  'avgRating': 1.9,
  'numRatings': 6,
  'schoolName': 'San Jose State University'},
 'georgia acker': {'legacyId': 954102,
  'firstName': 'Georgia',
  'lastName': 'Acker',
  'avgRating': 4.9,
  'numRatings':