In [98]:
import requests
from bs4 import BeautifulSoup
import json
import sys
import time
from tqdm import tqdm

Example of a person with multiple advisors

In [2]:
mult_advisors = "https://www.genealogy.math.ndsu.nodak.edu/id.php?id=102043"

In [3]:
first_pull_addr = "https://www.genealogy.math.ndsu.nodak.edu/id.php?id=230516"

In [4]:
ex = "https://www.genealogy.math.ndsu.nodak.edu/id.php?id=42016"

In [5]:
# Fetch the first example page; the timeout keeps a stalled connection from
# hanging the kernel indefinitely.
first_pull = requests.get(first_pull_addr, timeout=30)

In [6]:
# Fetch the multiple-advisors example page; the timeout keeps a stalled
# connection from hanging the kernel indefinitely.
mult_adv = requests.get(mult_advisors, timeout=30)

In [7]:
# Fetch the third example page; the timeout keeps a stalled connection from
# hanging the kernel indefinitely.
ex_pull = requests.get(ex, timeout=30)

In [8]:
# Parse the HTML of the first example page so it can be searched by tag and text.
# NOTE(review): no parser argument is passed; per the bs4 docs the library then
# chooses one itself, which can differ between machines -- confirm which parser
# produced the outputs below and consider naming it explicitly.
bs = BeautifulSoup(first_pull.text)

Fields to pull for each person:
- id
- name
- degree
- uni
- country
- year
- link to advisors
- links to advisees

In [9]:
bs.find

<bound method Tag.find of <!DOCTYPE html>
<html><head><meta charset="utf-8"/>
<meta content="IE=EmulateIE7" http-equiv="X-UA-Compatible"/>
<title>Gautam Sisodia - The Mathematics Genealogy Project</title>
<style type="text/css"> 
body  {
	margin: 0; /* it's good practice to zero the margin and padding of the body element to account for differing browser defaults */
	padding: 0;
	text-align: center; /* this centers the container in IE 5* browsers. The text is then set to the left aligned default in the #container selector */
	color: #000000;
	font-family: Arial, Helvetica, sans-serif;
	font-size: 100%;
	background-color: #5E8059;
}
.twoColFixLtHdr #container {
	width: 780px;  /* using 20px less than a full 800px width allows for browser chrome and avoids a horizontal scroll bar */
	background: #FFFFFF; /* the auto margins (in conjunction with a width) center the page */
	border: 1px solid #000000;
	text-align: left; /* this overrides the text-align: center on the body element. */
	margi

In [16]:
# The person's id is read from the href of the "update form" link: the text
# after the first "=" and before the following "&".
# `string=` is the current name of the text filter (`text=` is deprecated).
person_id = bs.find("a", string="update form")["href"].split("=")[1].split("&")[0]
person_id

'230516'

In [17]:
# The text of the first <h2>, stripped of surrounding whitespace, is used as the name.
name = bs.find("h2").text.strip()
name

'Gautam  Sisodia'

In [18]:
# The first <span> holds degree, university and year as one unsplit string
# (see the output below); it is stored as-is.
deg_uni_year = bs.find("span").text
deg_uni_year

'Ph.D. University of Washington 2014'

In [19]:
# Join the alt text of every <img> on the page with "|".
# NOTE(review): judging by the output, the entries after "Tree" look like
# country names (flag images, presumably) -- confirm.
img_alt = "|".join([i["alt"] for i in bs.find_all("img")])
img_alt

'Tree|UnitedStates'

In [20]:
# Advisor links: anchors inside any <p> whose text mentions "Advisor" and whose
# href starts with "id". `href=True` skips anchors that have no href attribute
# instead of raising KeyError on them.
advs = [
    {"name": a.text, "link": a["href"]}
    for p in bs.find_all("p")
    if "Advisor" in p.text
    for a in p.find_all("a", href=True)
    if a["href"].startswith("id")
]
advs

[{'link': 'id.php?id=28320', 'name': 'Sholto Paul Smith'}]

In [21]:
# Student links: every anchor found inside any <table> on the page.
# `href=True` skips anchors that have no href attribute instead of raising
# KeyError on them.
students = [
    {"name": a.text, "link": a["href"]}
    for t in bs.find_all("table")
    for a in t.find_all("a", href=True)
]
students

[]

In [60]:
# Collect the values extracted in the cells above into one record for this person.
all_info = {
    "id": person_id,
    "name": name,
    "deg_uni_year": deg_uni_year,
    "img_alt": img_alt,
    "advisors": advs,
    "students": students
}

In [82]:
all_info

{'advisors': [{'link': 'id.php?id=28320', 'name': 'Sholto Paul Smith'}],
 'deg_uni_year': 'Ph.D. University of Washington 2014',
 'id': '230516',
 'img_alt': 'Tree|UnitedStates',
 'name': 'Gautam  Sisodia',
 'students': []}

In [83]:
# Seed the data file with a one-element list holding the first record.
# The context manager closes (and flushes) the file deterministically.
with open("./math_gen.json", "w") as math_gen_file:
    json.dump([all_info], math_gen_file)

In [84]:
# Read the stored records back; the context manager closes the file handle.
with open("./math_gen.json", "r") as math_gen_file:
    mg = json.load(math_gen_file)

In [85]:
mg

[{'advisors': [{'link': 'id.php?id=28320', 'name': 'Sholto Paul Smith'}],
  'deg_uni_year': 'Ph.D. University of Washington 2014',
  'id': '230516',
  'img_alt': 'Tree|UnitedStates',
  'name': 'Gautam  Sisodia',
  'students': []}]

In [86]:
def get_info(link, base_url="https://www.genealogy.math.ndsu.nodak.edu/", timeout=30):
    """Fetch one person's page and extract their record.

    Parameters
    ----------
    link : str
        Page link relative to ``base_url``, e.g. ``"id.php?id=42016"``.
    base_url : str, optional
        Prefix the link is appended to. Defaults to the site root used
        throughout this notebook.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    dict
        ``id``, ``name``, ``deg_uni_year`` and ``img_alt`` as strings, plus
        ``advisors`` and ``students`` as lists of ``{"name", "link"}`` dicts.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    TypeError, AttributeError
        If the page lacks the "update form" link, an ``<h2>`` or a ``<span>``
        (the lookups then return ``None``).
    """
    addr = base_url + link

    # A timeout keeps a stalled connection from hanging the crawl forever.
    pull = requests.get(addr, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to scrape an error page.
    pull.raise_for_status()

    # NOTE(review): no parser is named, so bs4 chooses one itself; pin one once
    # it is confirmed which parser produced the data already on disk.
    bs = BeautifulSoup(pull.text)

    # The id is the text after the first "=" and before the following "&" in
    # the href of the "update form" link.
    update_form_link = bs.find("a", string="update form")
    person_id = update_form_link["href"].split("=")[1].split("&")[0]

    name = bs.find("h2").text.strip()

    # Degree, university and year are kept as one unsplit string.
    deg_uni_year = bs.find("span").text

    # alt=True skips images without alt text rather than raising KeyError.
    img_alt = "|".join(img["alt"] for img in bs.find_all("img", alt=True))

    # Advisors: anchors with an href starting with "id" inside any <p> whose
    # text mentions "Advisor". href=True skips anchors without an href.
    advs = [
        {"name": a.text, "link": a["href"]}
        for p in bs.find_all("p")
        if "Advisor" in p.text
        for a in p.find_all("a", href=True)
        if a["href"].startswith("id")
    ]

    # Students: every anchor with an href inside any <table>.
    students = [
        {"name": a.text, "link": a["href"]}
        for t in bs.find_all("table")
        for a in t.find_all("a", href=True)
    ]

    return {
        "id": person_id,
        "name": name,
        "deg_uni_year": deg_uni_year,
        "img_alt": img_alt,
        "advisors": advs,
        "students": students}

In [89]:
def find_next(mg, ind, checked_ind=None, nr=1):
    """Return the link of the first linked person who is not yet stored in ``mg``.

    Starting at ``mg[ind]``, the stored records are walked depth-first. For
    each record its students are examined before its advisors, in list order.
    The first link whose id has no record in ``mg`` is returned.

    The walk is iterative and remembers every record it has entered, so long
    advisor chains cannot exhaust the recursion limit and records reachable by
    several routes are expanded only once. The link returned is the same one a
    path-only recursive walk in the same order would find.

    Parameters
    ----------
    mg : list of dict
        Stored records; each needs ``"id"``, ``"students"`` and ``"advisors"``,
        the latter two being lists of dicts with a ``"link"`` such as
        ``"id.php?id=42016"``.
    ind : int
        Index in ``mg`` of the record to start from.
    checked_ind : iterable of int, optional
        Indices that must not be entered (treated as already visited).
    nr : int, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    str or None
        The ``"link"`` of the first missing person, or ``None`` when every
        person reachable from ``mg[ind]`` is already stored.
    """
    # Map each id to the first index holding it ("first match wins"), built
    # once instead of rescanning the whole list for every link.
    first_index = {}
    for position, entry in enumerate(mg):
        first_index.setdefault(entry["id"], position)

    def linked_people(entry):
        # Students are examined before advisors.
        yield from entry["students"]
        yield from entry["advisors"]

    seen = set(checked_ind) if checked_ind else set()
    seen.add(ind)

    # Explicit stack of partially consumed link iterators: the top iterator
    # belongs to the record currently being expanded.
    pending = [linked_people(mg[ind])]
    while pending:
        for person in pending[-1]:
            # The id is the text after the first "=" in the link.
            target = first_index.get(person["link"].split("=")[1])
            if target is None:
                return person["link"]
            if target not in seen:
                seen.add(target)
                pending.append(linked_people(mg[target]))
                # Descend now; the parent's iterator resumes where it stopped.
                break
        else:
            # Every link of this record has been examined; back up one level.
            pending.pop()

    return None

In [58]:
find_next(mg, 0)

'id.php?id=28320'

In [45]:
a = get_info('id.php?id=42016')

In [46]:
a

{'advisors': [{'link': 'id.php?id=102043', 'name': 'Adam  Sedgwick'}],
 'deg_uni_year': 'M.A. University of Cambridge 1830',
 'id': '42016',
 'img_alt': 'Tree|UnitedKingdom',
 'name': 'William  Hopkins',
 'students': [{'link': 'id.php?id=7824', 'name': 'Cayley, Arthur'},
  {'link': 'id.php?id=30175', 'name': 'Galton, Sir Francis'},
  {'link': 'id.php?id=105806', 'name': 'Maxwell, James'},
  {'link': 'id.php?id=101929', 'name': 'Routh, Edward'},
  {'link': 'id.php?id=102483', 'name': 'Stokes, George'},
  {'link': 'id.php?id=234972', 'name': 'Thomson, William'},
  {'link': 'id.php?id=129420', 'name': 'Todhunter, Isaac'}]}

In [47]:
mg + [a]

[{'advisors': [{'link': 'id.php?id=28320', 'name': 'Sholto Paul Smith'}],
  'deg_uni_year': 'Ph.D. University of Washington 2014',
  'id': '230516',
  'img_alt': 'Tree|UnitedStates',
  'name': 'Gautam  Sisodia',
  'students': []},
 {'advisors': [{'link': 'id.php?id=102043', 'name': 'Adam  Sedgwick'}],
  'deg_uni_year': 'M.A. University of Cambridge 1830',
  'id': '42016',
  'img_alt': 'Tree|UnitedKingdom',
  'name': 'William  Hopkins',
  'students': [{'link': 'id.php?id=7824', 'name': 'Cayley, Arthur'},
   {'link': 'id.php?id=30175', 'name': 'Galton, Sir Francis'},
   {'link': 'id.php?id=105806', 'name': 'Maxwell, James'},
   {'link': 'id.php?id=101929', 'name': 'Routh, Edward'},
   {'link': 'id.php?id=102483', 'name': 'Stokes, George'},
   {'link': 'id.php?id=234972', 'name': 'Thomson, William'},
   {'link': 'id.php?id=129420', 'name': 'Todhunter, Isaac'}]}]

In [49]:
find_next(mg + [a], 0)

'id.php?id=28320'

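Define the full process: load the saved records, find the next missing person, fetch their page, and save the extended list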

In [96]:
def add_to_math_gen(path):
    """Extend the JSON file at ``path`` by one newly fetched person.

    The stored list is loaded, ``find_next`` is asked (starting from the first
    record) for a link that is not stored yet, that page is fetched with
    ``get_info``, and the list with the new record appended is written back.

    Parameters
    ----------
    path : str
        Path of the JSON file holding the list of person records.

    Returns
    -------
    None
        The file is left untouched when ``find_next`` reports that nothing is
        missing.
    """
    # get current math gen data (the context manager closes the handle)
    with open(path, "r") as source:
        mg = json.load(source)

    # find the next link to add
    next_link = find_next(mg, 0)
    if next_link is None:
        # Nothing reachable is missing; fetching would fail on a None link.
        return

    # get info from that link
    next_link_info = get_info(next_link)

    # add info to math gen data
    next_mg = mg + [next_link_info]

    # Serialize before opening the file for writing, so a serialization error
    # cannot leave a truncated data file behind.
    payload = json.dumps(next_mg)
    with open(path, "w") as target:
        target.write(payload)

In [99]:
# Add ten more people to the data file, one page fetch per iteration.
# NOTE(review): the 5-second pause is presumably there to avoid hammering the
# site -- confirm, and consider naming 10 and 5 as constants.
for i in tqdm(range(10)):
    add_to_math_gen("./math_gen.json")
    time.sleep(5)

100%|██████████| 10/10 [00:53<00:00,  5.31s/it]


In [100]:
# Reload the data file after the crawl; the context manager closes the handle.
with open("./math_gen.json", "r") as math_gen_file:
    mg = json.load(math_gen_file)

In [101]:
len(mg)

36

In [102]:
[i["name"] for i in mg]

['Gautam  Sisodia',
 'Sholto Paul Smith',
 'Izuru  Mori',
 'Adam  Nyman',
 'Patrick Thomas Perkins',
 'Joanna  Staniszkis',
 'Michaela  Vancliff',
 'Richard  Chandler',
 'Manizheh  Nafari',
 'Padmini  Veerapan',
 'John Coulter McConnell',
 'Martin Philip Gilchrist',
 'J. Tobias Stafford',
 'Paul Leslie Check',
 'Severino Collier Coutinho',
 'Cecilia Fernanda  Saraiva de Oliveira',
 'Andrew Phillip Davies',
 'Siân  Fryer',
 'Tim  Jarrold',
 'David T. Kausch',
 'Dennis Shawn Keeler',
 'Kimberly A. Retert',
 'Daniel  Rogalski',
 'James  Berglund',
 'Susan  Elle',
 'Amy  Irwin Stout',
 'Robert  Won',
 'Susan J. Sierra',
 'Christopher  Campbell',
 'Simon  Crawford',
 'Darin R. Stephenson',
 'Kok-Ming  Teo',
 'Chelsea  Walton',
 'Karen Ellen Smith',
 'Manuel  Blickle',
 'Susanne Andrea Müller']

In [47]:
# NOTE(review): this cell's execution count (47) is lower than that of the
# reload cell (100), and its output (William Hopkins) disagrees with the name
# listed at index 1 after the reload (Sholto Paul Smith). The output of this
# and the following mg[...] cells appears to be stale; re-run the notebook
# top to bottom before relying on it.
mg[1]

{'advisors': [{'link': 'id.php?id=102043', 'name': 'Adam  Sedgwick'}],
 'deg_uni_year': 'M.A. University of Cambridge 1830',
 'id': '42016',
 'img_alt': 'Tree|UnitedKingdom',
 'name': 'William  Hopkins',
 'students': [{'link': 'id.php?id=7824', 'name': 'Cayley, Arthur'},
  {'link': 'id.php?id=30175', 'name': 'Galton, Sir Francis'},
  {'link': 'id.php?id=105806', 'name': 'Maxwell, James'},
  {'link': 'id.php?id=101929', 'name': 'Routh, Edward'},
  {'link': 'id.php?id=102483', 'name': 'Stokes, George'},
  {'link': 'id.php?id=234972', 'name': 'Thomson, William'},
  {'link': 'id.php?id=129420', 'name': 'Todhunter, Isaac'}]}

In [48]:
mg[2]

{'advisors': [{'link': 'id.php?id=42016', 'name': 'William  Hopkins'}],
 'deg_uni_year': 'Ph.D. / Ph.D. / Dr Sc. Universiteit Leiden and University College Dublin and University of Oxford 1864/1865/1875',
 'id': '7824',
 'img_alt': 'Tree|Netherlands|Ireland|UnitedKingdom',
 'name': 'Arthur  Cayley',
 'students': [{'link': 'id.php?id=13135', 'name': 'Baker, Henry'},
  {'link': 'id.php?id=17829', 'name': 'Forsyth, Andrew'},
  {'link': 'id.php?id=6965', 'name': 'Scott, Charlotte'}]}

In [49]:
mg[3]

{'advisors': [{'link': 'id.php?id=7824', 'name': 'Arthur  Cayley'}],
 'deg_uni_year': 'Ph.D. University of Cambridge ',
 'id': '13135',
 'img_alt': 'Tree|UnitedKingdom',
 'name': 'Henry Frederick Baker',
 'students': [{'link': 'id.php?id=18500', 'name': 'Babbage, D.'},
  {'link': 'id.php?id=18503', 'name': 'Bronowski, Jacob'},
  {'link': 'id.php?id=18508', 'name': 'Cherry, Thomas'},
  {'link': 'id.php?id=12555', 'name': 'Coxeter, H. S. M.'},
  {'link': 'id.php?id=4933', 'name': 'Du Val, Patrick'},
  {'link': 'id.php?id=98482', 'name': 'Edge, William'},
  {'link': 'id.php?id=18504', 'name': 'Frith, R.'},
  {'link': 'id.php?id=234770', 'name': 'Jeffreys, Harold'},
  {'link': 'id.php?id=18584', 'name': 'Jeffreys, Sir Harold'},
  {'link': 'id.php?id=18507', 'name': 'Maxwell, Edwin'},
  {'link': 'id.php?id=44940', 'name': 'Mordell, Louis'},
  {'link': 'id.php?id=18499', 'name': 'Pedoe, Daniel'},
  {'link': 'id.php?id=18506', 'name': 'Room, Thomas'},
  {'link': 'id.php?id=18501', 'name': 'Se

In [50]:
mg[4]

{'advisors': [{'link': 'id.php?id=13135', 'name': 'Henry Frederick Baker'}],
 'deg_uni_year': 'Ph.D. University of Cambridge 1933',
 'id': '18500',
 'img_alt': 'Tree|UnitedKingdom',
 'name': 'D. W. Babbage',
 'students': []}