In [715]:
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd

#### Abilene Christian University as an example school from which to extract features for the final dataset

In [579]:
url = 'https://nces.ed.gov/collegenavigator/?s=all&l=93&ct=2&ic=1&id=222178' 

response = requests.get(url)

In [580]:
response.status_code  #200 = success!

200

In [581]:
response.text[:1000]  #First 1000 characters of the HTML

'\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" ><head id="ctl00_hd"><meta http-equiv="Content-type" content="text/html;charset=UTF-8" /><title>\r\n\tCollege Navigator - Abilene Christian University\r\n</title><link href="css/md0.css" type="text/css" rel="stylesheet" /><meta name="robots" content="noindex,nofollow"></meta><link href="css/collegenav.css?v=2.4" type="text/css" rel="stylesheet" /><link href="css/mapstyle.css" type="text/css" rel="stylesheet" /><!--[if lte IE 8]><link href="css/mainIE6.css" text="text/css" rel="stylesheet" /><![endif]--><link rel="search" type="application/opensearchdescription+xml" title="NCES College Navigator" href="http://nces.ed.gov/collegenavigator/opensearch.ashx?md=0" /></head>\r\n<body id="ctl00_bodyMain" onload="if(typeof imgPL==\'function\')imgPL();if(typeof sp==\'function\')sp();"><div class="headC

In [582]:
page = response.text

In [584]:
soup = BeautifulSoup(page, "lxml")


#### Initial table at top of institution's page
- Institution's name
- Student-to-faculty ratio

In [668]:
private_university_string = soup.find('title').text
private_university = private_university_string.split('-')[1].strip()
print(private_university)

Abilene Christian University


In [592]:
stu_fac_ratio_str = soup.find('table', class_='layouttab').find_all('td')[-1].text
print(stu_fac_ratio_str)
stu_fac_ratio_ls = stu_fac_ratio.split()
stu_fac_ratio_ls.remove('to')
stu_fac_ratio_ls = [int(x) for x in stu_fac_ratio_ls]
stu_fac_ratio_ls
stu_fac_ratio_dec = stu_fac_ratio_ls[0] / stu_fac_ratio_ls[1]
print(stu_fac_ratio_dec)

13 to 1
13.0


#### 'TUITION, FEES AND ESTIMATED STUDENT EXPENSES' collapsable table
- tuition and fees in 2019-20
- books and supplies in 2019-20
- tuition and fees in 2020-21
- books and supplies in 2020-21

In [593]:
tuition_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl00')
tuition_rows = [row for row in tuition_table.find_all('tr')]
tuition_rows[1]

<tr><td scope="row">Tuition and fees</td><td>$33,330</td><td>$34,850</td><td>$36,300</td><td>$37,800</td><td>4.1%</td></tr>

In [650]:
tuition_text = tuition_rows[1].find_all('td')[0].text
books_text = tuition_rows[2].find_all('td')[0].text
tuition_19_20 = int(tuition_rows[1].find_all('td')[3].text.replace('$','').replace(',',''))
print(tuition_19_20)
books_19_20 = int(tuition_rows[2].find_all('td')[3].text.replace('$','').replace(',',''))
print(books_19_20)
tuition = int(tuition_rows[1].find_all('td')[4].text.replace('$','').replace(',',''))
print(tuition)
books = int(tuition_rows[2].find_all('td')[4].text.replace('$','').replace(',',''))
print(books)

36300
1250
37800
1250


#### 'GENERAL INFORMATION' collapsable table'
- number of full-time faculty
- number part-time faculty

In [594]:
faculty_table = soup.find(id='ctl00_cphCollegeNavBody_ucInstitutionMain_divFaculty')
faculty_table = [row for row in faculty_table.find_all('tr')]
faculty_table[1]

<tr><td>Total faculty</td><td>277</td><td>137</td></tr>

In [602]:
ft_faculty = int(faculty_table[1].find_all('td')[1].text)
pt_faculty = int(faculty_table[1].find_all('td')[2].text)

#### 'FINANCIAL AID' collapsable table - ONLY 2019-20 DATA
- number of all undergraduate students receiving grant or scholarship aid
- average amount of grant or scholarship aid among all undergraduate students
- number of all undergraduate students receiving Pell grants
- average amount of Pell grants aid received among all undergraduate students

In [647]:
aid_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl01')
aid_rows = [row for row in aid_table.find_all('tr')]
aid_rows[13].find_all('td')

[<td scope="row">Pell grants</td>,
 <td>899</td>,
 <td>$4,226,334</td>,
 <td>$4,701</td>]

In [649]:
grant_scholarship_aid_19_20 = int(aid_rows[12].find_all('td')[1].text.replace(',',''))
print(grant_scholarship_aid_19_20)
avg_amt_grant_scholarship_aid_19_20 = int(aid_rows[12].find_all('td')[3].text.replace('$','').replace(',',''))
print(avg_amt_grant_scholarship_aid_19_20)
pell_aid_19_20 = int(aid_rows[13].find_all('td')[1].text.replace(',',''))
print(pell_aid_19_20)
avg_amt_pell_aid_19_20 = int(aid_rows[13].find_all('td')[3].text.replace('$','').replace(',',''))
print(avg_amt_pell_aid_19_20)

3436
21092
899
4701


#### 'ENROLLMENT' collapsable table
- total enrollment
- undergraduate enrollment
- undergraduate transfer-in enrollment
- graduate enrollment

In [617]:
enroll_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl03')
enroll_rows = [row for row in enroll_table.find_all('tr')]
enroll_rows[0]

<tr><th scope="col">Total enrollment</th><th scope="col">5,291</th></tr>

In [622]:
total_enroll = int(enroll_rows[0].find_all('th')[1].text.replace(',',''))
print(total_enroll)
under_enroll = int(enroll_rows[1].find_all('td')[1].text.replace(',',''))
print(under_enroll)
under_trans_enroll = int(enroll_rows[2].find_all('td')[1].text.replace(',',''))
print(under_trans_enroll)
grad_enroll = int(enroll_rows[3].find_all('td')[1].text.replace(',',''))
print(grad_enroll)

5291
3494
143
1797


#### 'ADMISSIONS' collapsable table - **NEED TO MODIFY THIS CODE FOR SCHOOLS LIKE AMERICA EVANGELICAL UNIVERSITY THAT DON'T HAVE ADMISSIONS DATA BECAUSE THEY HAVE AN OPEN ADMISSION POLICY**
- number of undergraduate applicants
- percent of undergraduate applicants admitted
- number of first-time students who submitted their SAT score
- % of first-time students who submitted their SAT score
- number of first-time students who submitted their ACT score
- % of first-time students who submitted their ACT score
- SAT Evidence-Based Reading and Writing 25th percentile score
- SAT Evidence-Based Reading and Writing 75th percentile score
- SAT Math 25th percentile score
- SAT Math 75th percentile score
- ACT Composite 25th percentile score
- ACT Composite 75th percentile score
- ACT English 25th percentile score
- ACT English 75th percentile score
- ACT Math 25th percentile score
- ACT Math 75th percentile score

In [659]:
admissions_table = soup.find(id='divctl00_cphCollegeNavBody_ucInstitutionMain_ctl04')

In [694]:
admissions_rows = [row for row in admissions_table.find_all('tr')]

In [705]:
soup.find(text='SAT Evidence-Based Reading and Writing').findNext().text

'510'

In [693]:
under_apps = int(admissions_rows[3].find_all('td')[1].text.replace(',',''))
print(applicants)
perc_admitted = float(admissions_rows[4].find_all('td')[2].text.replace('%',''))
print(perc_admitted)

10534
61.0


In [712]:
sat_scores_submit = int(soup.find(text='SAT').findNext().text)
print(sat_scores_submit)
perc_sat_scores_submit = float(soup.find(text='SAT').findNext().findNext().text.replace('%',''))
print(perc_sat_scores_submit)
act_scores_submit = int(soup.find(text='ACT').findNext().text)
print(act_scores_submit)
perc_act_scores_submit = float(soup.find(text='ACT').findNext().findNext().text.replace('%',''))
print(perc_act_scores_submit)
sat_read_write_25_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().text)
print(sat_read_write_25_perc)
sat_read_write_75_perc = int(soup.find(text='SAT Evidence-Based Reading and Writing').findNext().findNext().text)
print(sat_read_write_75_perc)
sat_math_25_perc = int(soup.find(text='SAT Math').findNext().text)
print(sat_math_25_perc)
sat_math_75_perc = int(soup.find(text='SAT Math').findNext().findNext().text)
print(sat_math_75_perc)
act_comp_25_perc = int(soup.find(text='ACT Composite').findNext().text)
print(act_comp_25_perc)
act_comp_75_perc = int(soup.find(text='ACT Composite').findNext().findNext().text)
print(act_comp_75_perc)
act_eng_25_perc = int(soup.find(text='ACT English').findNext().text)
print(act_eng_25_perc)
act_eng_75_perc = int(soup.find(text='ACT English').findNext().findNext().text)
print(act_eng_75_perc)
act_math_25_perc = int(soup.find(text='ACT Math').findNext().text)
print(act_math_25_perc)
act_math_75_perc = int(soup.find(text='ACT Math').findNext().findNext().text)
print(act_math_75_perc)

432
50.0
369
43.0
510
620
500
600
21
27
20
28
18
26


### Private non-profit colleges/university by region

In [379]:
# URLs of lists of schools by region
    # Far West
url_fw = 'https://nces.ed.gov/collegenavigator/?s=AK+CA+HI+NV+OR+WA&l=93&ct=2'

    # Rocky Mountains
url_rm = 'https://nces.ed.gov/collegenavigator/?s=CO+ID+MT+UT+WY&l=93&ct=2'

    # Southwest
url_sw = 'https://nces.ed.gov/collegenavigator/?s=AZ+NM+OK+TX&l=93&ct=2'

    # Plains
url_p = 'https://nces.ed.gov/collegenavigator/?s=IA+KS+MN+MO+NE+ND+SD&l=93&ct=2'

    # Great Lakes
url_gl = 'https://nces.ed.gov/collegenavigator/?s=IL+IN+MI+OH+WI&l=93&ct=2'

    # Southeast
url_se = 'https://nces.ed.gov/collegenavigator/?s=AL+AR+FL+GA+KY+LA+MS+NC+SC+TN+VA+WV&l=93&ct=2'
   
    # Mideast (including DC)
url_me = 'https://nces.ed.gov/collegenavigator/?s=DE+DC+MD+NJ+NY+PA&l=93&ct=2'
    
    # New England
url_ne = 'https://nces.ed.gov/collegenavigator/?s=CT+ME+MA+NH+RI+VT&l=93&ct=2'

In [504]:
# create a list of the url of each region's initial page of private schools

regions_urls = ['https://nces.ed.gov/collegenavigator/?s=AK+CA+HI+NV+OR+WA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CO+ID+MT+UT+WY&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AZ+NM+OK+TX&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IA+KS+MN+MO+NE+ND+SD&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=IL+IN+MI+OH+WI&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=AL+AR+FL+GA+KY+LA+MS+NC+SC+TN+VA+WV&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=DE+DC+MD+NJ+NY+PA&l=93&ct=2',
               'https://nces.ed.gov/collegenavigator/?s=CT+ME+MA+NH+RI+VT&l=93&ct=2']                

In [427]:
# loop through each url of each region's initial page of private schools to get the count of the # of pages' 
    # worth of schools each region has

pages_regions = []

for url_reg in regions_urls:
    response_reg = requests.get(url_reg)
    soup_reg = BeautifulSoup(response_reg.text, 'html5lib')
    pages_str = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_divPagingControls').find_all('a')[-1]['href']
    pages_idx = pages_str.find('&pg=')
    pages = pages_str[pages_idx+4:]
    pages_regions.append(int(pages))

In [505]:
# loop through the list of initial region urls and each region's corresponding number of pages' worth of schools to
    # create a new url for each region's page(s) and add the new url to the original list of initial region urls

for idx in range(0, len(regions_urls)):
    for i in range(2, pages_regions[idx]+1):
        new_url = (''.join(regions_urls[idx] + '&pg=' + str(i)))
        regions_urls.append(new_url)

In [726]:
# loop through each region url to create a dictionary of all school names and their corresponding url

schools = {}
names = []
urls = []

for reg_url in regions_urls:
    response_reg = requests.get(reg_url)
    soup_reg = BeautifulSoup(response_reg.text, "lxml")
    table_reg = soup_reg.find(id='ctl00_cphCollegeNavBody_ucResultsMain_tblResults')
    rows_reg = [row for row in table_reg.find_all('tr')]

    for row in rows_reg:
        link = row.find('a')
        if link != None:
            name, url = link.text, link['href']
            names.append(name)
            urls.append(url)
            for idx in range(1, len(names)):
                if names[idx] == names[idx-1]:
                    names[idx] = names[idx] + '_b'

for a, b in zip(names, urls):
    schools[a] = [b]

In [727]:
# loop through the new dictionary of school names and urls to add beginning part of nces' website to the urls

for key, val in schools.items():
    schools[key] = 'https://nces.ed.gov/collegenavigator/' + val

TypeError: can only concatenate str (not "list") to str

In [728]:
# check first five key, value pairs in schools dict

{k: schools[k] for k in list(schools)[:5]}

{'Alaska Bible College': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580'],
 'Alaska Pacific University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669'],
 'America Evangelical University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081'],
 'American Jewish University': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846'],
 'Antioch University-Los Angeles': ['?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838']}

In [729]:
schools_df = pd.DataFrame(schools).T  #transpose
schools_df.columns = ['link_stub']

schools_df.head()

Unnamed: 0,link_stub
Alaska Bible College,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102580
Alaska Pacific University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=102669
America Evangelical University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=490081
American Jewish University,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=116846
Antioch University-Los Angeles,?s=AK+CA+HI+NV+OR+WA&l=93&ct=2&id=245838


In [None]:
def get_school_dict(link):
    '''
    From the National Center for Educaiton Statistics link stub, request school html, parse with BeautifulSoup, and
    collect 
        - school name 
        - student-to-faculty ratio
        - tuition and fees in 2019-20
        - books and supplies in 2019-20
        - tuition and fees in 2020-21
        - books and supplies in 2020-21
        - number of full-time faculty
        - number part-time faculty
        - 
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    #Get domestic gross
    raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    domestic_total_gross = money_to_int(raw_domestic_total_gross)

    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)
    
    #Get rating
    rating = get_movie_value(soup,'MPAA')

    #Get release date
    raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
    release_date = to_date(raw_release_date)
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date]))

    return movie_dict

In [574]:
# loop through each school's page to extract the student-to-faculty ratio and save the value in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [576]:
#stu_fac_ratio_ls[]

1332

In [577]:
#len(schools)

1332

In [None]:
#time.sleep(3)

In [None]:
# loop through each school's page to grab the html from that page and save it in a list

#stu_fac_ratio_ls = []

#for school_url in schools.values():
   # response_school = requests.get(school_url)
   # soup_school = BeautifulSoup(response_school.text, "lxml")
   # stu_fac_ratio = soup_school.find('table', class_='layouttab').find_all('td')[-1].text
   # stu_fac_ratio_ls.append(stu_fac_ratio)
   # time.sleep(3)

In [None]:
# code for looping through one state's url and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_string = 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2'

#for abrev in state_abrev:
   # ca_list = list(ca_string)
   # idx_1 = ca_list.index('=')
   # idx_2 = ca_list.index('&')
    #print(''.join(ca_list[:idx_1+1]) + abrev + ''.join(ca_list[idx_2:]))

In [523]:
# code for looping through the multiple pages of a state's urls and replacing the state with a new one

#state_abrev = ['TX', 'NY', 'FL']
#ca_schools_search_tables_urls = [
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=2',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=3',
   # 'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=4',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=5',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=6',
    #'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=7',
  #  'https://nces.ed.gov/collegenavigator/?s=CA&l=93&ct=2&pg=8']

#for abrev in state_abrev:
  #  for url in ca_schools_search_tables_urls:
   #     idx_1 = url.index('=')
   #     idx_2 = url.index('&')
  #      print(''.join(url[:idx_1+1]) + abrev + ''.join(url[idx_2:]))