In [1]:
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd

## Step 1: Practice getting features from a single private university/college: Abilene Christian University
- Note: commented out code was left to show the initial way(s) I extracted features from the case study, Abilene Christian University. I ultimately didn't use that code because it didn't work on all schools.

In [137]:
url = 'https://nces.ed.gov/collegenavigator/?q=abilene&s=all&l=93&ct=2&id=222178' 

response = requests.get(url)

In [138]:
response.status_code  #200 = success!

200

In [139]:
page = response.text

In [140]:
soup = BeautifulSoup(page, "lxml")


#### Features from initial table at top of institution's page
- Institution's name
- Student-to-faculty ratio

In [141]:
private_university_string = soup.find('span',class_='headerlg').text
private_university_string

'Abilene Christian University'

In [142]:
#stu_fac_ratio = soup.find('table', class_='layouttab').find_all('td')[-1].text
#print(stu_fac_ratio)

In [143]:
# create function for extracting stu-to-fac ratio

def get_stu_to_fac_ratio(soup):

    obj = soup.find('table', class_='layouttab').find_all('td')
    
    if not obj: 
        return None
    
    ls = list(obj)
    
    row_ls = [row.text for row in ls]
    
    if row_ls.count('Student-to-faculty ratio:\xa0\xa0') == 0:
        return None
    
    idx = row_ls.index('Student-to-faculty ratio:\xa0\xa0')
                       
    if idx:
        return row_ls[idx+1]
    else:
        return None

In [144]:
get_stu_to_fac_ratio(soup)

'13 to 1'

#### Features from 'TUITION, FEES AND ESTIMATED STUDENT EXPENSES' collapsable table
- tuition and fees in 2019-20
- books and supplies in 2019-20
- tuition and fees in 2020-21
- books and supplies in 2020-21
- tuition in 2020-21
- fees in 2020-21

In [145]:
#tuition_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl00')
#tuition_rows = [row for row in tuition_table.find_all('tr')]
#tuition_rows[1]

In [146]:
tuition_fees_19_20 = soup.find(text='Tuition and fees').findNext().findNext().findNext().text
#tuition_19_20 = int(tuition_rows[1].find_all('td')[3].text.replace('$','').replace(',',''))
print(tuition_fees_19_20)
books_supplies_19_20 = soup.find(text='Books and supplies').findNext().findNext().findNext().text
#books_19_20 = int(tuition_rows[2].find_all('td')[3].text.replace('$','').replace(',',''))
print(books_supplies_19_20)
tuition_fees_20_21 = soup.find(text='Tuition and fees').findNext().findNext().findNext().findNext().text
#tuition = int(tuition_rows[1].find_all('td')[4].text.replace('$','').replace(',',''))
print(tuition_fees_20_21)
books_supplies_20_21 = soup.find(text='Books and supplies').findNext().findNext().findNext().findNext().text
#books = int(tuition_rows[2].find_all('td')[4].text.replace('$','').replace(',',''))
print(books_supplies_20_21)

$36,300
$1,250
$37,800
$1,250


#### The functions below are to extract price of tuition and fees and price of books and supplies in 2019-20 and 2020-21. The functions include code for dealing with cases where these values don't exist on a given school's page.

In [147]:
# create function for extracting price of tuition and fees in 2019-20 
    # and books and supplies in 2019-20

def get_tuition_books_19_20_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [148]:
get_tuition_books_19_20_value(soup, 'Tuition and fees')

'$36,300'

In [149]:
get_tuition_books_19_20_value(soup, 'Books and supplies')

'$1,250'

In [150]:
# create function for extracting price of tuition and fees in 2020-21 
    # and books and supplies in 2020-21
    
def get_tuition_books_20_21_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [151]:
get_tuition_books_20_21_value(soup, 'Tuition and fees')

'$37,800'

In [152]:
get_tuition_books_20_21_value(soup, 'Books and supplies')

'$1,250'

In [153]:
# create function for extracting price of tuition in 2020-21 
    
def get_tuition_20_21_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [154]:
get_tuition_20_21_value(soup, 'Undergraduate student tuition and fees')

In [155]:
# create function for extracting price of fees in 2020-21 
    
def get_fees_20_21_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [156]:
get_fees_20_21_value(soup, 'Undergraduate student tuition and fees')

In [157]:
# create function for extracting price of tuition in 2020-21 IF SCHOOL ONLY HAS UNDERGRAD
    
def get_tuition_under_20_21_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [158]:
get_tuition_under_20_21_value(soup, 'Average undergraduate student tuition and fees for academic year')

In [159]:
# create function for extracting price of fees in 2020-21 IF SCHOOL ONLY HAS UNDERGRAD 
    
def get_fees_under_20_21_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext().findNext().findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [160]:
get_fees_under_20_21_value(soup, 'Average undergraduate student tuition and fees for academic year')

#### Features from 'GENERAL INFORMATION' collapsable table
- number of full-time faculty
- number part-time faculty

In [161]:
#faculty_table = soup.find(id='ctl00_cphCollegeNavBody_ucInstitutionMain_divFaculty')
#faculty_table = [row for row in faculty_table.find_all('tr')]
#faculty_table[1]

In [162]:
ft_faculty = soup.find(text='Total faculty').findNext().text
#ft_faculty = int(faculty_table[1].find_all('td')[1].text)
pt_faculty = soup.find(text='Total faculty').findNext().findNext().text
#pt_faculty = int(faculty_table[1].find_all('td')[2].text)

#### The functions below are to extract various features that are located in the same column on a table (if the table exists). The functions include code for dealing with cases where these values don't exist on a given school's page.

In [163]:
# create function to grab # of full-time faculty; total enrollment; 
    # total enrollment (all undergraduate); undergrad enrollment; 
    # undergrad transfer-in enrollment; grad enrollment; number of undergrad applicants;
    # percent admitted; # of first-time students who submitted their SAT score;
    # # of first-time students who submitted their ACT score; 
    # scores for the SAT Evidence-Based Reading and Writing, SAT Math, ACT Composite, 
    # ACT English and ACT Math below which 25% of students scored

def get_school_value_one_next(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext()
    
    if element:
        return element.text 
    else:
        return None

In [164]:
get_school_value_one_next(soup, 'Total faculty')

'277'

In [165]:
# create function to grab # of part-time faculty; $ of full-time beginning under grad
    # students receiving grant or scholarship aid in 2019-20;
    # scores for the SAT Evidence-Based Reading and Writing, SAT Math, ACT Composite, 
    # ACT English and ACT Math below which 75% of students scores

def get_school_value_two_nexts(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [166]:
get_school_value_two_nexts(soup, 'Total faculty')

'137'

#### Features from 'FINANCIAL AID' collapsable table - ONLY 2019-20 DATA
- number of all undergraduate students receiving grant or scholarship aid
- average amount of grant or scholarship aid among all undergraduate students

In [167]:
#aid_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl01')
#aid_rows = [row for row in aid_table.find_all('tr')]
#aid_rows[13].find_all('td')

In [168]:
#grant_schol_aid_19_20 = int(aid_rows[12].find_all('td')[1].text.replace(',',''))
#print(grant_scholarship_aid_19_20)
#avg_amt_grant_schol_aid_19_20 = int(aid_rows[12].find_all('td')[3].text.replace('$','').replace(',',''))
#print(avg_amt_grant_scholarship_aid_19_20)

#### The functions below are to extract the average amount of grant or scholarship aid received by a) full-time beginning undergrad students and b) all undergrad students. The functions include code for dealing with cases where these values don't exist on a given school's page.

In [169]:
def get_avg_amt_grant_schol_aid_beg_under_value(soup, field_name):
    
    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().findNext().findNext().findNext()
    
    if element:
        return element.text 
    else:
        return None

In [170]:
get_avg_amt_grant_schol_aid_beg_under_value(soup,'Grant or scholarship aid')

'$22,887'

In [171]:
def get_avg_amt_grant_schol_aid_all_under_value(soup, field_name):

    obj = soup.find(text=field_name)
    
    if not obj: 
        return None
    
    element = obj.findNext().find_all('td')[3]
    
    if element:
        return element.text 
    else:
        return None

In [172]:
get_avg_amt_grant_schol_aid_all_under_value(soup, 'All Undergraduate Students')

'$21,092'

#### Features from 'ENROLLMENT' collapsable table
- total enrollment
- undergraduate enrollment
- undergraduate transfer-in enrollment
- graduate enrollment

In [173]:
#enroll_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')
#enroll_rows = [row for row in enroll_table.find_all('tr')]
#enroll_rows[0]

In [174]:
#total_enroll = int(enroll_rows[0].find_all('th')[1].text.replace(',',''))
#print(total_enroll)
#under_enroll = int(enroll_rows[1].find_all('td')[1].text.replace(',',''))
#print(under_enroll)
#under_trans_enroll = int(enroll_rows[2].find_all('td')[1].text.replace(',',''))
#print(under_trans_enroll)
#grad_enroll = int(enroll_rows[3].find_all('td')[1].text.replace(',',''))
#print(grad_enroll)

In [175]:
#def get_total_enroll_value(soup, element_id):
    
 #   '''Grab the total enrollment'''
    
 #  table = soup.find(id=element_id)
 
 #   if not table: 
 #       return None
    
    # this works for most of the values
 #   rows = [row for row in table.find_all('tr')]
    
 #   if rows:
 #       return rows[0].find_all('th')[1].text
 #   else:
 #       return None

In [176]:
#get_total_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

In [177]:
#def get_under_enroll_value(soup, element_id):
    
#    '''Grab the undergraduate enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[1].find_all('td')[1].text
#    else:
#        return None

In [178]:
#get_under_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

In [179]:
#def get_under_trans_enroll_value(soup, element_id):
    
#    '''Grab the undergraduate transfer-in enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[2].find_all('td')[1].text
#    else:
#        return None

In [180]:
#get_under_trans_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

In [181]:
#def get_grad_enroll_value(soup, element_id):
    
#    '''Grab the graduate enrollment'''
    
#    table = soup.find(id=element_id)
    
#    if not table: 
#        return None
    
    # this works for most of the values
#    rows = [row for row in table.find_all('tr')]
    
#    if rows:
#        return enroll_rows[3].find_all('td')[1].text
#    else:
#        return None

In [39]:
#get_grad_enroll_value(soup, 'divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')

#### Features from 'ADMISSIONS' collapsable table 
- number of undergraduate applicants
- % of undergraduate applicants admitted
- number of first-time students who submitted their SAT score
- % of first-time students who submitted their SAT score
- number of first-time students who submitted their ACT score
- % of first-time students who submitted their ACT score
- SAT Evidence-Based Reading and Writing 25th percentile score
- SAT Evidence-Based Reading and Writing 75th percentile score
- SAT Math 25th percentile score
- SAT Math 75th percentile score
- ACT Composite 25th percentile score
- ACT Composite 75th percentile score
- ACT English 25th percentile score
- ACT English 75th percentile score
- ACT Math 25th percentile score
- ACT Math 75th percentile score

In [182]:
#admissions_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl04')
#admissions_rows = [row for row in admissions_table.find_all('tr')]

In [183]:
#under_apps = int(admissions_rows[3].find_all('td')[1].text.replace(',',''))
#print(applicants)
#perc_admitted = float(admissions_rows[4].find_all('td')[2].text.replace('%',''))
#print(perc_admitted)

In [184]:
sat_scores_submit = int(soup.find(text='SAT').findNext().text)
print(sat_scores_submit)
perc_sat_scores_submit = float(soup.find(text='SAT').findNext().findNext().text.replace('%',''))
print(perc_sat_scores_submit)
act_scores_submit = int(soup.find(text='ACT').findNext().text)
print(act_scores_submit)
perc_act_scores_submit = float(soup.find(text='ACT').findNext().findNext().text.replace('%',''))
print(perc_act_scores_submit)
sat_read_write_25_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().text)
print(sat_read_write_25_perc)
sat_read_write_75_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().findNext().text)
print(sat_read_write_75_perc)
sat_math_25_perc = int(soup.find(text='SAT Math').findNext().text)
print(sat_math_25_perc)
sat_math_75_perc = int(soup.find(text='SAT Math').findNext().findNext().text)
print(sat_math_75_perc)
act_comp_25_perc = int(soup.find(text='ACT Composite').findNext().text)
print(act_comp_25_perc)
act_comp_75_perc = int(soup.find(text='ACT Composite').findNext().findNext().text)
print(act_comp_75_perc)
act_eng_25_perc = int(soup.find(text='ACT English').findNext().text)
print(act_eng_25_perc)
act_eng_75_perc = int(soup.find(text='ACT English').findNext().findNext().text)
print(act_eng_75_perc)
act_math_25_perc = int(soup.find(text='ACT Math').findNext().text)
print(act_math_25_perc)
act_math_75_perc = int(soup.find(text='ACT Math').findNext().findNext().text)
print(act_math_75_perc)

432
50.0
369
43.0
510
620
500
600
21
27
20
28
18
26


## Step 2: Create helper functions to parse strings into numerical values

In [185]:
def money_to_int(moneystring):
    try:
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
    except:
        return None

def percent_to_float(percentstring):
    try:
        percentstring = percentstring.replace('%','')
        return float(percentstring)
    except:
        return None

def number_to_int(numberstring):
    try:
        numberstring = numberstring.replace(',', '')
        return int(numberstring)
    except:
        return None

### Step 3: Scrape individual school names and their corresponding NCES link stubs

- NOTE: Through trial and error, I discovered that you can only view up to 15 schools on a page and up to 34 pages in a particular search for schools. To get all the private non-profit universities and colleges offering Bachelor's degrees in the US, I ended up using the map in the search section to select groups of states by region.

#### Step 1: Create a list of urls for the first page of private schools in each region (NCES divides the country into 8 regions)

In [44]:
# create a list of the urls for each region (Far West, Rocky Mountains, Southwest, Plains, Great Lakes, Southeast
                # Mideast (including DC) and New England)

regions_urls = ['https://nces.ed.gov/collegenavigator/?s=AK+CA+HI+NV+OR+WA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CO+ID+MT+UT+WY&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AZ+NM+OK+TX&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IA+KS+MN+MO+NE+ND+SD&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IL+IN+MI+OH+WI&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AL+AR+FL+GA+KY+LA+MS+NC+SC+TN+VA+WV&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=DE+DC+MD+NJ+NY+PA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CT+ME+MA+NH+RI+VT&l=93&ct=2']                

#### Step 2: Loop through the urls for each region to extract the number of pages worth of results (results = school name and link)

In [45]:
# loop through each region's url to get the count of the # of pages' worth of schools 
    #each region has - save these 8 numbers (one for each of the 8 regions) in a list

pages_regions = []

for url_reg in regions_urls:
    response_reg = requests.get(url_reg)
    time.sleep(1)
    soup_reg = BeautifulSoup(response_reg.text, 'html5lib')
    pages_str = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_divPagingControls').find_all('a')[-1]['href']
    pages_idx = pages_str.find('&pg=')
    pages = pages_str[pages_idx+4:]
    pages_regions.append(int(pages))

#### Step 3: Loop through the list of region urls and for each url, create a new url for each additional page worth of results (e.g., if the first region in the region_urls list - Far West - has 8 pages' worth of results, create 7 new urls that append '&pg=' and the number for the page) and append the newly created urls to the original list of region urls.

In [46]:
# loop through the list of each region's url and each region's corresponding number of pages' 
    # worth of schools to create a new url for each region's page(s) 
    # and add the new url to the original list of initial region urls

for idx in range(0, len(regions_urls)):
    for i in range(2, pages_regions[idx]+1):
        new_url = (''.join(regions_urls[idx] + '&pg=' + str(i)))
        regions_urls.append(new_url)

#### Step 4: Loop through the updated list of region urls to extract the name of each school and its link stub and save these extractions in a new dictionary.

In [47]:
# loop through each region's url to create a dictionary of all school names 
    # and their corresponding link stub

schools = {}
names = []
urls = []

for reg_url in regions_urls:
    response_reg = requests.get(reg_url)
    time.sleep(1)
    soup_reg = BeautifulSoup(response_reg.text, "lxml")
    table_reg = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_tblResults')
    rows_reg = [row for row in table_reg.find_all('tr')]

    for row in rows_reg:
        link = row.find('a')
        if link != None:
            name, url = link.text, link['href']
            names.append(name)
            urls.append(url)
            for idx in range(1, len(names)):
                if names[idx] == names[idx-1]:
                    names[idx] = names[idx] + '_b'

for a, b in zip(names, urls):
    schools[a] = [b]

In [48]:
# check first five key, value pairs in new schools dict to make sure it's correct

{k: schools[k] for k in list(schools)[:5]}

{'Alaska Bible College': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580'],
 'Alaska Pacific University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669'],
 'America Evangelical University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081'],
 'American Jewish University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846'],
 'Antioch University-Los Angeles': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838']}

#### Step 5. Convert the dictionary of schools and their corresponding link stubs into a dataframe, which will be used in the function to extract features from each school's page.

In [49]:
# create a dataframe out of the dictionary containing each school and their link stub

schools_df = pd.DataFrame(schools).T  #transpose
schools_df.columns = ['link_stub']

schools_df.head()

Unnamed: 0,link_stub
Alaska Bible College,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580
Alaska Pacific University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669
America Evangelical University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081
American Jewish University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846
Antioch University-Los Angeles,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838


### Step 4. Scrape features from individual school pages

In [191]:
def get_school_dict(link):
    '''
    From each school's NCES link stub, request school html, parse with BeautifulSoup, and
    collect the following features:
        - school name 
        - student-to-faculty ratio as string
        - tuition and fees in 2019-20
        - books and supplies in 2019-20
        - tuition and fees in 2020-21
        - books and supplies in 2020-21
        - number of full-time faculty
        - number part-time faculty
        - average amount of grant or scholarship aid among first-time beginning undergraduate students
        - % of first-time beginning undergraduate students receiving grant or scholarship aid
        - average amount of grant or scholarship aid among all undergraduate students
        - total enrollment
        - undergraduate enrollment
        - undergraduate transfer-in enrollment
        - graduate enrollment
        - number of undergraduate applicants
        - percent of undergraduate applicants admitted
        - number of first-time students who submitted their SAT score
        - % of first-time students who submitted their SAT score
        - number of first-time students who submitted their ACT score
        - % of first-time students who submitted their ACT score
        - SAT Evidence-Based Reading and Writing 25th percentile score
        - SAT Evidence-Based Reading and Writing 75th percentile score
        - SAT Math 25th percentile score
        - SAT Math 75th percentile score
        - ACT Composite 25th percentile score
        - ACT Composite 75th percentile score
        - ACT English 25th percentile score
        - ACT English 75th percentile score
        - ACT Math 25th percentile score
        - ACT Math 75th percentile score
    Return information as a dictionary.
    '''
    
    base_url = 'https://nces.ed.gov/collegenavigator/'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url, timeout=5)
    #time.sleep(4)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['school', 'raw_stu_to_fac_ratio', 'tuition_fees_19_20', 'books_supplies_19_20', 
               'tuition_fees', 'books_supplies', 'tuition', 'fees', 'tuition_under','fees_under',
               'ft_faculty', 'pt_faculty','avg_amt_grant_schol_aid_beg_under_19_20',
               'perc_grant_schol_aid_beg_under_19_20', 'avg_amt_grant_schol_aid_all_under_19_20', 'total_enroll',
               'total_enroll_under', 'under_enroll', 'under_trans_enroll','grad_enroll', 'under_apps', 'perc_admitted',
               'sat_scores_submit', 'perc_sat_scores_submit','act_scores_submit','perc_act_scores_submit',
               'sat_read_write_25_perc', 'sat_read_write_75_perc','sat_math_25_perc','sat_math_75_perc', 
               'act_comp_25_perc','act_comp_75_perc', 'act_eng_25_perc','act_eng_75_perc', 'act_math_25_perc', 
               'act_math_75_perc']
    
    #Get school name
    school = soup.find('span',class_='headerlg').text

    #Get student to faculty ratio
    raw_stu_fac_ratio = get_stu_to_fac_ratio(soup)
    
    #Get price of tuition and fees or just tuition or just fees
    raw_tuition_fees_19_20 = get_tuition_books_19_20_value(soup, 'Tuition and fees')
    tuition_fees_19_20 = money_to_int(raw_tuition_fees_19_20)
    
    raw_tuition_fees = get_tuition_books_20_21_value(soup, 'Tuition and fees')
    tuition_fees = money_to_int(raw_tuition_fees)
    
    raw_tuition = get_tuition_20_21_value(soup, 'Undergraduate student tuition and fees')
    tuition = money_to_int(raw_tuition)
    
    raw_fees = get_fees_20_21_value(soup, 'Undergraduate student tuition and fees')
    fees = money_to_int(raw_fees)
    
    raw_tuition_under = get_tuition_under_20_21_value(soup, 'Average undergraduate student tuition and fees for academic year')
    tuition_under = money_to_int(raw_tuition_under)
    
    raw_fees_under = get_fees_under_20_21_value(soup, 'Average undergraduate student tuition and fees for academic year')
    fees_under = money_to_int(raw_fees_under)
    
    #Get price of books and supplies
    raw_books_supplies_19_20 = get_tuition_books_19_20_value(soup, 'Books and supplies')
    books_supplies_19_20 = money_to_int(raw_books_supplies_19_20)

    raw_books_supplies = get_tuition_books_20_21_value(soup, 'Books and supplies')
    books_supplies = money_to_int(raw_books_supplies)

    #Get number of full-time and part-time faculty
    raw_ft_faculty = get_school_value_one_next(soup, 'Total faculty')
    ft_faculty = number_to_int(raw_ft_faculty)
    
    raw_pt_faculty = get_school_value_two_nexts(soup, 'Total faculty')
    pt_faculty = number_to_int(raw_pt_faculty)

    #Get financial aid metrics
    raw_avg_amt_grant_schol_aid_beg_under_19_20 = get_avg_amt_grant_schol_aid_beg_under_value(soup,'Grant or scholarship aid')
    avg_amt_grant_schol_aid_beg_under_19_20 = money_to_int(raw_avg_amt_grant_schol_aid_beg_under_19_20)
        
    raw_perc_grant_schol_aid_beg_under_19_20 = get_school_value_two_nexts(soup,'Grant or scholarship aid')
    perc_grant_schol_aid_beg_under_19_20 = percent_to_float(raw_perc_grant_schol_aid_beg_under_19_20)
    
    raw_avg_amt_grant_schol_aid_all_under_19_20 = get_avg_amt_grant_schol_aid_all_under_value(soup, 'All Undergraduate Students')
    avg_amt_grant_schol_aid_all_under_19_20 = money_to_int(raw_avg_amt_grant_schol_aid_all_under_19_20)
    
    #Get enrollment metrics
    raw_total_enroll = get_school_value_one_next(soup, 'Total enrollment')
    total_enroll = number_to_int(raw_total_enroll)
    
    raw_total_enroll_under = get_school_value_one_next(soup, 'Total enrollment (all undergraduate)') 
    total_enroll_under = number_to_int(raw_total_enroll_under) # combine this column with total_enroll
    
    raw_under_enroll = get_school_value_one_next(soup, 'Undergraduate enrollment')
    under_enroll = number_to_int(raw_under_enroll)
    
    raw_under_trans_enroll = get_school_value_one_next(soup, 'Undergraduate transfer-in enrollment')
    under_trans_enroll = number_to_int(raw_under_trans_enroll)
    
    raw_grad_enroll = get_school_value_one_next(soup, 'Graduate enrollment')
    grad_enroll = number_to_int(raw_grad_enroll)
    
    #Get admissions metrics - # of applicants and % admitted
    raw_under_apps = get_school_value_one_next(soup, 'Number of applicants')
    under_apps = number_to_int(raw_under_apps)
    
    raw_perc_admitted = get_school_value_one_next(soup, 'Percent admitted')
    perc_admitted = percent_to_float(raw_perc_admitted)
    
    #Get admissions metrics - SAT and ACT
    raw_sat_scores_submit = get_school_value_one_next(soup, 'SAT')
    sat_scores_submit = number_to_int(raw_sat_scores_submit)
    
    raw_perc_sat_scores_submit = get_school_value_two_nexts(soup, 'SAT')
    perc_sat_scores_submit = percent_to_float(raw_perc_sat_scores_submit)
    
    raw_act_scores_submit = get_school_value_one_next(soup, 'ACT')
    act_scores_submit = number_to_int(raw_act_scores_submit)
    
    raw_perc_act_scores_submit = get_school_value_two_nexts(soup, 'ACT')
    perc_act_scores_submit = percent_to_float(raw_perc_act_scores_submit)
    
    raw_sat_read_write_25_perc = get_school_value_one_next(soup, 'SAT Evidence-Based Reading and Writing')
    sat_read_write_25_perc = number_to_int(raw_sat_read_write_25_perc)
    
    raw_sat_read_write_75_perc = get_school_value_two_nexts(soup, 'SAT Evidence-Based Reading and Writing')
    sat_read_write_75_perc = number_to_int(raw_sat_read_write_75_perc)
    
    raw_sat_math_25_perc = get_school_value_one_next(soup, 'SAT Math')
    sat_math_25_perc = number_to_int(raw_sat_math_25_perc)
    
    raw_sat_math_75_perc = get_school_value_two_nexts(soup, 'SAT Math')
    sat_math_75_perc = number_to_int(raw_sat_math_75_perc)
    
    raw_act_comp_25_perc = get_school_value_one_next(soup, 'ACT Composite')
    act_comp_25_perc = number_to_int(raw_act_comp_25_perc)
    
    raw_act_comp_75_perc = get_school_value_two_nexts(soup, 'ACT Composite')
    act_comp_75_perc = number_to_int(raw_act_comp_75_perc)
    
    raw_act_eng_25_perc = get_school_value_one_next(soup, 'ACT English')
    act_eng_25_perc = number_to_int(raw_act_eng_25_perc)
    
    raw_act_eng_75_perc = get_school_value_two_nexts(soup, 'ACT English')
    act_eng_75_perc = number_to_int(raw_act_eng_75_perc)
    
    raw_act_math_25_perc = get_school_value_one_next(soup, 'ACT Math')
    act_math_25_perc = number_to_int(raw_act_math_25_perc)
    
    raw_act_math_75_perc = get_school_value_two_nexts(soup, 'ACT Math')
    act_math_75_perc = number_to_int(raw_act_math_75_perc)
    
    #Create school dictionary and return
    school_dict = dict(zip(headers, [school,
                                    raw_stu_fac_ratio,
                                    tuition_fees_19_20,
                                    books_supplies_19_20, 
                                    tuition_fees,
                                    books_supplies,
                                    tuition,
                                    fees,
                                    tuition_under,
                                    fees_under,
                                    ft_faculty,
                                    pt_faculty,
                                    avg_amt_grant_schol_aid_beg_under_19_20,
                                    perc_grant_schol_aid_beg_under_19_20,
                                    avg_amt_grant_schol_aid_all_under_19_20,
                                    total_enroll,
                                    total_enroll_under,
                                    under_enroll, 
                                    under_trans_enroll, 
                                    grad_enroll,
                                    under_apps,
                                    perc_admitted,
                                    sat_scores_submit,
                                    perc_sat_scores_submit,
                                    act_scores_submit,
                                    perc_act_scores_submit,
                                    sat_read_write_25_perc,
                                    sat_read_write_75_perc,
                                    sat_math_25_perc,
                                    sat_math_75_perc,
                                    act_comp_25_perc,
                                    act_comp_75_perc,
                                    act_eng_25_perc,
                                    act_eng_75_perc,
                                    act_math_25_perc,
                                    act_math_75_perc]))

    return school_dict

In [201]:
# apply the above created function to extract data for every link stub in schools_df
    # (the df containing school name and link stub) and append the extracted data to a newly
    # created list, which will ultimately become a list of dicts, one dict for each school

private_schools_page_info_list = []

for link in schools_df.link_stub[400:800]:
    private_schools_page_info_list.append(get_school_dict(link))

In [202]:
print(len(schools_df))
print(len(private_schools_page_info_list))
print(len(schools_df) - len(private_schools_page_info_list))

1332
400
932


In [203]:
#convert list of dictionaries to df to be used for modeling

private_schools_page_info = pd.DataFrame(private_schools_page_info_list)
private_schools_page_info.set_index('school', inplace=True)

In [204]:
private_schools_page_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, Midwestern Baptist Theological Seminary to Livingstone College
Data columns (total 35 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   raw_stu_to_fac_ratio                     397 non-null    object 
 1   tuition_fees_19_20                       380 non-null    float64
 2   books_supplies_19_20                     361 non-null    float64
 3   tuition_fees                             381 non-null    float64
 4   books_supplies                           362 non-null    float64
 5   tuition                                  12 non-null     float64
 6   fees                                     12 non-null     float64
 7   tuition_under                            5 non-null      float64
 8   fees_under                               5 non-null      float64
 9   ft_faculty                               394 non-null    float64
 10  p

In [205]:
private_schools_page_info.to_csv('schools_csv_2.csv')

In [574]:
# loop through each school's page to extract the student-to-faculty ratio and save the value in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [576]:
#stu_fac_ratio_ls[]

1332

In [577]:
#len(schools)

1332

In [None]:
#time.sleep(3)

In [None]:
# loop through each school's page to grab the html from that page and save it in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [None]:
# code for looping through one state's url and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_string = 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2'

#for abrev in state_abrev:
   # ca_list = list(ca_string)
   # idx_1 = ca_list.index('=')
   # idx_2 = ca_list.index('&')
    #print(''.join(ca_list[:idx_1+1]) + abrev + ''.join(ca_list[idx_2:]))

In [523]:
# code for looping through the multiple pages of a state's urls and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_schools_search_tables_urls = [
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=3',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=4',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=5',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=6',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=7',
  #  'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=8']

#for abrev in state_abrev:
  #  for url in ca_schools_search_tables_urls:
   #     idx_1 = url.index('=')
   #     idx_2 = url.index('&')
  #      print(''.join(url[:idx_1+1]) + abrev + ''.join(url[idx_2:]))