In [1]:
import spacy

In [2]:
#!python -m spacy download en_core_web_md

In [3]:
nlp = spacy.load('en_core_web_md')

In [4]:
def similarity(word1, word2):
    token1 = nlp(word1)[0]
    token2 = nlp(word2)[0]
    return token1.similarity(token2)

In [5]:
similarity('biology', 'life science')

0.3035752773284912

In [6]:
from bs4 import BeautifulSoup
import requests
from trafilatura import fetch_url, extract
from trafilatura.settings import use_config

In [7]:
from keys import API_KEY, SEARCH_ENGINE_ID

In [8]:
def get_urls_and_titles(query):
        urls = []
        titles = []
        page = 1
        start = (page - 1) * 10 + 1
        url = f"https://www.googleapis.com/customsearch/v1?key={API_KEY}&cx={SEARCH_ENGINE_ID}&q={query}&start={start}"
        data = requests.get(url).json()
        search_items = data.get("items")
        for search_item in search_items:
            title = search_item.get("title")
            link = search_item.get("link")
            urls.append(link)
            titles.append(title)
        return [urls, titles]

def extract_paragraphs(url):
    try:    
        config = use_config()
        config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
        downloaded = fetch_url(url)
        result = extract(downloaded, config=config, output_format='xml',
                        include_links=True, include_formatting=True)
    except:
        return []
    if result is None:
        return []

    soup = BeautifulSoup(result, 'lxml')
    paragraphs = []
    for p in soup.find_all('p'):
        text = p.get_text(strip=True, separator='\n')
        if '.' in text:
            paragraphs.append(text)
    return paragraphs


In [26]:
name = 'UIUC'
query = f'what majors is {name} known for'
urls = get_urls_and_titles(query)[0]
raw_text = ''
for url in urls:
    print(url)
    paragraphs = extract_paragraphs(url)
    raw_text += ''.join(paragraphs)

https://www.usnews.com/best-colleges/university-of-illinois-urbanachampaign-1775/academics
https://illinois.edu/about/rankings_ug.html
https://www.collegeraptor.com/colleges/majors/University-of-Illinois-Urbana-Champaign-IL--145637
https://myillini.illinois.edu/Programs
https://www.quora.com/What-is-the-most-popular-major-at-the-University-of-Illinois-Why
https://oneclass.com/blog/university-of-illinois/5912-the-10-most-popular-majors-at-uiuc.en.html
https://giesbusiness.illinois.edu/undergraduate-hub/majors-and-minors
https://las.illinois.edu/admissions/exploremajors
https://english.illinois.edu/academics/undergraduate-studies/creative-writing-major
http://catalog.illinois.edu/undergraduate/


In [27]:
raw_text

'2022-23 U.S. News America\'s Best CollegesClick on the pin icon to save a school to your preferred college list (or click\nagain to remove from the favorites). Click on the X to exclude a college permanently\nfrom your search results.A break down of estimated costs to attend this institution.Estimated Sticker Prices for this college based on published data, adjusted for\nestimated inflation. This includes tuition, room and board, books, travel, and other\neducational expenses. Travel costs will vary significantly based on the residency\nlocation of the student.Your estimated net cost of attendance at this institution per year based on the\ndata you provided. This is the estimated cost of attendance minus grants, scholarships, and merit aid.An estimation of your total debt level upon graduation, calculated based on your\nnet price over four years minus $0\nper year that you indicated your family can afford to pay out of pocket. This figure\nis furher adjusted for estimated interest acc

In [29]:
majors = ['business', 'nursing', 'psychology', 'biology', 'engineering', 'education', 'communications', 'finance', 'accounting',
          'criminal justice', 'anthropology', 'sociology', 'computer science', 'english', 'economics', 'political science', 'history',
          'kinesiology', 'art', 'math', 'environmental science', 'foreign language', 'design', 'chemistry', 'agricultural science',
          'information technology', 'performing arts', 'nutrition', 'religion', 'film', 'music', 'physics', 'philosophy', 'architecture',
          'law', 'culinary arts', 'pharmacy', 'dental studies']
len(majors)

38

In [30]:
from eric_chen_forward import util

In [31]:
cleaned_text = util.clean_document(raw_text)
cleaned_text

'u news america best collegesclick pin icon save school preferred college list click remove favorite click x exclude college permanently search result break estimated cost attend institution estimated sticker price college based published data adjusted estimated inflation includes tuition room board book travel educational expense travel cost vary significantly based residency location student estimated net cost attendance institution per year based data provided estimated cost attendance minus grant scholarship merit aid estimation total debt level upon graduation calculated based net price four year minus per year indicated family afford pay pocket figure furher adjusted estimated interest accrual college assumes participate work study program academic year estimation monthly payment due student loan upon graduation based year loan period full cost base tuition institution sticker price institution full cost attendance including tuition room board book travel expense pell grant schol

In [27]:
result = {}
for major in majors[:10]:
    for word in cleaned_text.split():
        s = similarity(major, word)
        if s > 0.6:
            if major not in result:
                result[major] = s
            elif s > result[major]:
                result[major] = s

  return token1.similarity(token2)


In [31]:
dict(sorted(result.items(), key=lambda x:x[1], reverse=True))

{'psychology': 1.0000001192092896,
 'business': 1.0,
 'biology': 1.0,
 'engineering': 1.0,
 'education': 1.0,
 'accounting': 1.0,
 'finance': 0.8313702940940857,
 'communications': 0.7950165867805481,
 'nursing': 0.7172186374664307,
 'criminal justice': 0.6745493412017822}

In [14]:
from rake_nltk import Rake

In [32]:
r = Rake()
r.extract_keywords_from_text(raw_text)

In [35]:
keywords = r.get_ranked_phrases()
cleaned_keyword_text = util.clean_document(' '.join(keywords))
result_text = ' '.join(set(cleaned_keyword_text.split()))
result_text

'highlighting animal latvia conduct identifying backdrop astronomical atmosphere thinking observed dark programme serve last essay advertising developer policy access break buyer existed edge rock metabolic analyst received geologist ability member mathematics homeland easily writer center connection exist continuing midwest inc freshman wherever joining stem warrenville featuring cell municipal self danielle religion film knowledge federal mass internationally independent local earth chase assistantships comparative tectonic resume leader reading expand equipped matter chemonics founder developing secured foreign discovering loan g name deere urban algorithm processing exceed sam ecosystem health treasury meetups interpret sherwin camping thus practice heinz conflict interaction cargill story bolinao stegner ivan income biochemistry enroll rush america see afford wheaton eventually quid creative stanford broad postmodernity alongside put incredibly discovered bring loaf composition cl

In [36]:
result = {}
for major in majors[:20]:
    for word in result_text.split():
        s = similarity(major, word)
        if s > 0.6:
            if major not in result:
                result[major] = s
            elif s > result[major]:
                result[major] = s

  return token1.similarity(token2)


In [38]:
dict(sorted(result.items(), key=lambda x:x[1], reverse=True))

{'criminal justice': 1.000000238418579,
 'psychology': 1.0000001192092896,
 'anthropology': 1.0000001192092896,
 'sociology': 1.0000001192092896,
 'business': 1.0,
 'biology': 1.0,
 'engineering': 1.0,
 'education': 1.0,
 'finance': 1.0,
 'computer science': 1.0,
 'english': 1.0,
 'economics': 1.0,
 'political science': 1.0,
 'history': 1.0,
 'art': 1.0,
 'math': 1.0,
 'accounting': 0.9999998807907104,
 'kinesiology': 0.9999998807907104,
 'communications': 0.9553452730178833,
 'nursing': 0.7172186374664307}