In [52]:
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed
import re
from tqdm import tqdm
import json
import base64
import csv

## Get the list of all teachers

In [2]:
url = "https://www.sjsu.edu/people/"
# Fetch the directory page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops us from silently parsing
# an HTTP error page as if it were the directory listing.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Regular expression pattern to match href="/people/..."
pattern = re.compile(r'^/people/.*')

# Collect the href of every 'a' tag whose href attribute matches the pattern
people_hrefs = [a_tag['href'] for a_tag in soup.find_all('a', href=pattern)]

In [3]:
# Number of '/people/...' links collected from the directory page
len(people_hrefs)

1435

In [4]:
# Show the links with fewer than three '/' separators, i.e. those that have
# nothing after the '/people/<name>' segment.
[href for href in people_hrefs if href.count('/') < 3]

['/people/',
 '/people/Fatemeh.Davoudi',
 '/people/denise.dawkins',
 '/people/aharon.degrassi',
 '/people/jane.dodge',
 '/people/david.dodick',
 '/people/isarin.durongkadej',
 '/people/xiaojia.hou',
 '/people/gaojian.huang',
 '/people/yinghua.huang',
 '/people/adam.svec',
 '/people/gabrielajohn.swamy',
 '/people/neil.switz',
 '/people/wendy.thompsontaiwo',
 '/people/chingching.tan',
 '/people/ravindra.thadani',
 '/people/megan.thiele',
 '/people/babu.thomas',
 '/people/glenn.thomas']

In [5]:
# Drop the link to the directory root itself. Filtering (instead of
# list.remove) keeps this cell idempotent: re-running it no longer raises
# ValueError once the entry is already gone.
people_hrefs = [href for href in people_hrefs if href != '/people/']

## Get their RateMyProfessors (RMP) profile

In [10]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def get_first_teacher_result(text, school_id="U2Nob29sLTg4MQ=="):
    """Return the first RateMyProfessors search hit for ``text``, or ``None``.

    Args:
        text: Free-text search string (here: the teacher's name).
        school_id: Encoded school id sent as ``schoolID`` in the search query.
            The default is the Base64 encoding of ``School-881``.

    Returns:
        A dict with the keys ``legacyId``, ``firstName``, ``lastName``,
        ``avgRating``, ``numRatings`` and ``schoolName`` for the first search
        result, or ``None`` when the search returned no teacher.
        ``schoolName`` is ``None`` if the result carries no school.

    Raises:
        Any exception raised inside the call (HTTP error status, timeout,
        invalid JSON, ...) makes the ``retry`` decorator try again: up to
        3 attempts in total, 5 seconds apart.
    """
    url = 'https://www.ratemyprofessors.com/graphql'
    headers = {
        'Content-Type': 'application/json',
        # Base64 of "test:test"
        'Authorization': 'Basic dGVzdDp0ZXN0',
    }

    query = """
    query NewSearchTeachersQuery($query: TeacherSearchQuery!, $count: Int) {
      newSearch {
        teachers(query: $query, first: $count) {
          edges {
            node {
              legacyId
              firstName
              lastName
              avgRating
              numRatings
              school {
                legacyId
                name
              }
            }
          }
        }
      }
    }
    """

    variables = {
        "query": {
            "text": text,
            "schoolID": school_id
        },
        "count": 1
    }

    response = requests.post(url, headers=headers, json={"query": query, "variables": variables}, timeout=10)

    if response.status_code != 200:
        # Raises for 4xx/5xx; any other non-200 status is treated as "no result".
        response.raise_for_status()
        return None

    payload = response.json()
    # A JSON response can carry an explicit null for any of these levels, and
    # dict.get(key, {}) returns that None instead of the default. Use `or {}`
    # so a null is handled like a missing key rather than raising.
    search = (payload.get('data') or {}).get('newSearch') or {}
    edges = (search.get('teachers') or {}).get('edges') or []
    if not edges:
        return None

    teacher = edges[0]['node']
    school = teacher.get('school') or {}
    return {
        'legacyId': teacher['legacyId'],
        'firstName': teacher['firstName'],
        'lastName': teacher['lastName'],
        'avgRating': teacher['avgRating'],
        'numRatings': teacher['numRatings'],
        'schoolName': school.get('name')
    }

In [11]:
ok_teachers = {}
err_teachers = []
for href in tqdm(people_hrefs):
    # '/people/first.last[/...]' -> 'first last'
    name = href.split('/')[2].replace('.', ' ')
    # Directory slugs are not consistently lower-case (e.g. 'Fatemeh.Davoudi'),
    # so lower-case both sides; otherwise such entries can never match.
    name_lower = name.lower()
    match = get_first_teacher_result(name)
    if (match
        and match['schoolName'] == 'San Jose State University'
        and match['firstName'].lower() in name_lower
        and match['lastName'].lower() in name_lower
    ):
        # Store a copy without the school name instead of mutating the result.
        ok_teachers[name] = {key: value for key, value in match.items() if key != 'schoolName'}
    else:
        # No hit, wrong school, or name mismatch: leave for manual mapping.
        err_teachers.append(name)
    

100%|██████████| 1434/1434 [08:23<00:00,  2.85it/s]


In [32]:
# Persist the automatically matched teachers as JSON
with open('teacher_data/20240827_auto_match.json', 'w') as out_file:
    out_file.write(json.dumps(ok_teachers))

In [19]:
# Save the scraped directory links, one href per line
with open('teacher_data/20240827_sjsu_people_hrefs.txt', 'w') as out_file:
    out_file.writelines(f'{href}\n' for href in people_hrefs)

## Get information for the teachers that failed to match automatically

Some teachers could not be matched automatically; they are collected in `err_teachers`. I mapped them by hand, and the result is stored in `teacher_data/20240827_manual_match.csv`.

In [51]:
name_id_list = []

# newline='' is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are interpreted correctly.
with open('teacher_data/20240827_manual_match.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # Skip the header row (and tolerate an empty file)

    for row in reader:
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        name, rmp_id = row
        rmp_id = rmp_id.strip()
        if rmp_id:  # Only add to the list if rmp_id is not empty
            name_id_list.append((name, int(rmp_id)))

In [56]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def get_teacher_info(legacy_id):
    """Fetch a RateMyProfessors teacher record by its numeric legacy id.

    Args:
        legacy_id: The teacher's legacy id. It is sent to the API as the
            Base64 encoding of ``Teacher-<legacy_id>``.

    Returns:
        A dict with the keys ``legacyId``, ``firstName``, ``lastName``,
        ``avgRating`` and ``numRatings`` (the last two fall back to ``'N/A'``
        when absent from the response), or ``None`` when the API returns no
        teacher node for the id.

    Raises:
        Any exception raised inside the call (HTTP error status, timeout,
        invalid JSON, ...) makes the ``retry`` decorator try again: up to
        3 attempts in total, 5 seconds apart.
    """
    url = 'https://www.ratemyprofessors.com/graphql'
    headers = {
        'Content-Type': 'application/json',
        # Base64 of "test:test"
        'Authorization': 'Basic dGVzdDp0ZXN0',
    }

    # Encode the legacyId as Base64
    encoded_id = base64.b64encode(f'Teacher-{legacy_id}'.encode()).decode()

    query = """
    query TeacherRatingsPageQuery($id: ID!) {
      node(id: $id) {
        ... on Teacher {
          legacyId
          firstName
          lastName
          avgRating
          numRatings
        }
      }
    }
    """

    variables = {
        "id": encoded_id
    }

    # Same 10 s timeout as the search request, so a stalled connection fails
    # (and is retried) instead of blocking forever.
    response = requests.post(url, headers=headers, json={"query": query, "variables": variables}, timeout=10)

    if response.status_code != 200:
        # Raises for 4xx/5xx; any other non-200 status is treated as "no result".
        response.raise_for_status()
        return None

    payload = response.json()
    # `data` may be an explicit null; dict.get('data', {}) would return that
    # None and the following .get() would raise, so fall back with `or {}`.
    teacher_info = (payload.get('data') or {}).get('node')
    if not teacher_info:
        return None

    return {
        'legacyId': teacher_info['legacyId'],
        'firstName': teacher_info['firstName'],
        'lastName': teacher_info['lastName'],
        'avgRating': teacher_info.get('avgRating', 'N/A'),
        'numRatings': teacher_info.get('numRatings', 'N/A')
    }


In [59]:
# Look up every manually mapped id and key the result by directory name.
# A value is None when get_teacher_info found no teacher for that id.
manual_teachers = {}
for teacher_name, legacy_id in tqdm(name_id_list):
    manual_teachers[teacher_name] = get_teacher_info(legacy_id)

100%|██████████| 144/144 [00:44<00:00,  3.20it/s]


In [62]:
# Combine both result sets; on a duplicate name the manual entry wins.
all_teachers = dict(ok_teachers)
all_teachers.update(manual_teachers)

In [None]:
# Total number of teachers after merging automatic and manual matches
len(all_teachers)

1226

In [None]:
# Sanity check: this equals len(all_teachers) only if no name appears in both
# dicts, because the merge keeps a single entry per key.
len(ok_teachers) + len(manual_teachers)

1226

In [64]:
# Persist the merged teacher records as JSON
with open('teacher_data/20240827_all_teachers_current.json', 'w') as out_file:
    out_file.write(json.dumps(all_teachers))