In [16]:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
import re
import spacy

In [17]:
service = Service()
options = webdriver.ChromeOptions()
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
options.add_argument("--window-size=1920,1080")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
driver = webdriver.Chrome(service=service, options=options)

query = 'uiuc'

url = f'https://www.usnews.com/search/education#gsc.tab=0&gsc.q={query}%20academics%20majors&gsc.sort='
driver.get(url)

try:
    elem = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.TAG_NAME, 'li')))
    content = driver.find_elements(By.TAG_NAME, 'li')
    print('Content found')
except:
    print('Website timed out')

Content found


In [18]:
result = ''
for c in content:
    if 'https://www.usnews.com/best-colleges/' in c.text:
        result = c.text
        break
result

'University of Illinois Urbana-Champaign Academics & Majors - US ...\nhttps://www.usnews.com/best-colleges/university-of-illinois-urbanachampaign-1775/academics\nSee the most popular majors at University of Illinois Urbana-Champaign and learn about available academic programs and class sizes.'

In [19]:
url = re.findall('https?://\S+', result)[0]
url

'https://www.usnews.com/best-colleges/university-of-illinois-urbanachampaign-1775/academics'

In [20]:
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'lxml')

In [21]:
a = soup.find('h1', class_='Heading-sc-1w5xk2o-0 cSCNqo')
text = a.find_next_sibling('p').text
text

'The student-faculty ratio at University of Illinois Urbana-Champaign is 21:1, and the school  has 36.3% of its classes with fewer than 20 students. The most popular majors at University of Illinois Urbana-Champaign include: Engineering; Business, Management, Marketing, and Related Support Services; Social Sciences; Psychology; Communication, Journalism, and Related Programs; Mathematics and Statistics; Agricultural/Animal/Plant/Veterinary Science and Related Fields; Biological and Biomedical Sciences; Computer and Information Sciences and Support Services; and Health Professions and Related Programs. The average freshman retention rate, an indicator of student satisfaction, is 94%.'

In [22]:
majors = text.split('include:')[1].split('. The average')[0].strip().split(';')
if majors[-1][:4] == ' and':
    majors[-1] = majors[-1][4:]
majors = [m.strip() for m in majors]
majors

['Engineering',
 'Business, Management, Marketing, and Related Support Services',
 'Social Sciences',
 'Psychology',
 'Communication, Journalism, and Related Programs',
 'Mathematics and Statistics',
 'Agricultural/Animal/Plant/Veterinary Science and Related Fields',
 'Biological and Biomedical Sciences',
 'Computer and Information Sciences and Support Services',
 'Health Professions and Related Programs']

In [23]:
final_majors = []
for major_str in majors:
    lower = major_str.lower()
    if 'and related' in lower:
        i = lower.index('and related')
        major_str = major_str[:i].strip()
            
    if 'and support' in lower:
        i = lower.index('and support')
        major_str = major_str[:i].strip()
        
    if ',' in major_str and 'and' not in lower and 'studies' not in lower:
        lst = major_str.split(',')
        for m in lst:
            if m and 'and related' not in m.lower():
                final_majors.append(m.strip())
    
    else:
        final_majors.append(major_str)
        
final_majors

['Engineering',
 'Business, Management, Marketing,',
 'Social Sciences',
 'Psychology',
 'Communication, Journalism,',
 'Mathematics and Statistics',
 'Agricultural/Animal/Plant/Veterinary Science',
 'Biological and Biomedical Sciences',
 'Computer and Information Sciences',
 'Health Professions']

In [24]:
from rake_nltk import Rake

In [25]:
r = Rake()
major_keywords = []
for fm in final_majors:
    r.extract_keywords_from_text(fm)
    keywords = r.get_ranked_phrases()
    words = ['science', 'studies']
    for w in words:
        if len(keywords) > 1 and w in keywords[0] and w not in keywords[1:]:
            for i in range(1, len(keywords)):
                keywords[i] = keywords[i] + ' ' + w
    major_keywords.append(keywords)#', '.join(keywords))
    

major_keywords

[['engineering'],
 ['marketing', 'management', 'business'],
 ['social sciences'],
 ['psychology'],
 ['journalism', 'communication'],
 ['statistics', 'mathematics'],
 ['veterinary science',
  'plant science',
  'animal science',
  'agricultural science'],
 ['biomedical sciences', 'biological science'],
 ['information sciences', 'computer science'],
 ['health professions']]

In [26]:
nlp = spacy.load('en_core_web_md')

In [27]:
def similarity(word1, word2):
    token1 = nlp(word1)[0]
    token2 = nlp(word2)[0]
    return token1.similarity(token2)

In [28]:
possible_majors = ['accounting', 'agricultural science', 'anthropology', 'architecture', 'art', 'biology', 'business', 'chemistry', 'communications', 'computer science', 'criminal justice', 'culinary arts', 
                   'dental studies', 'design', 'economics', 'education', 'engineering', 'english', 'environmental science', 'film', 'finance', 'foreign language', 'history', 'information science', 'kinesiology', 
                   'law', 'math', 'music', 'nursing', 'nutrition', 'performing arts', 'pharmacy', 'philosophy', 'physics', 'political science', 'psychology', 'religion', 'sociology', 'statistics']

In [29]:
majors_to_search = set()
for mks in major_keywords:
    for mk in mks:
        for pm in possible_majors:
            if similarity(pm, mk) > 0.7:
                majors_to_search.add(mk)
                
majors_to_search

{'agricultural science',
 'biological science',
 'biomedical sciences',
 'business',
 'communication',
 'computer science',
 'engineering',
 'information sciences',
 'management',
 'marketing',
 'mathematics',
 'psychology',
 'statistics'}