In [1]:
# the intention of this script is to retrieve each university out of "College Rankings __ Attractiveness of the Students" + num + ".html" and store it in a list
# the list will be used to create an dataframe with the university and attractiveness
# the script will then find the state AND POPULATION of each university and store it with the university and attractiveness
# the script will then calculate each state's average attractiveness and store it in a dataframe
# this dataframe will be used to create a heatmap of the US with the average attractiveness of each state

In [2]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
universities = []
attractiveness = []

# retrieve the universities and attractiveness score from each html file

for i in range(1, 7):
    with open("College Rankings __ Attractiveness of the Students" + str(i) + ".html") as f:
        soup = BeautifulSoup(f, 'html.parser')
        universitiesBS = soup.find_all('a', class_='ratings_list')
        for university in universitiesBS:
            universities.append(university.string)
        attractivenessBS = soup.find_all('div', align='right')
        for attr in attractivenessBS:
            attractiveness.append(attr.string)


# clean the attractiveness list
# remove all the None values
attractiveness = [x for x in attractiveness if x is not None]

# remove all the elements which start with S, as they are not floats
attractiveness = [x for x in attractiveness if not x.startswith('S')]

#print(universities)
#print(attractiveness)

print(len(universities))
print(len(attractiveness))


505
505


In [4]:
# create a dataframe with the universities and their attractiveness
df = pd.DataFrame({"University": universities, "Attractiveness": attractiveness})

# replace "Malone College" with "Malone University" as it is now known
df.loc[df["University"] == "Malone College", "University"] = "Malone University"

# print the head and tail of the dataframe by combining them and printing them
pd.concat([df.head(), df.tail()])

Unnamed: 0,University,Attractiveness
0,Pepperdine University,89.7
1,Villanova University,87.8
2,Vanderbilt University,87.5
3,Texas Christian University,87.2
4,Southern Methodist University,87.0
500,Worcester Polytechnic Institute,28.2
501,"University of Missouri, Rolla",27.5
502,Polytechnic University,26.7
503,Rose-Hulman Institute of Technology,26.4
504,Clarkson University,24.1


In [5]:

# list of all the states in the US for simple regex matching (taken from https://gist.github.com/norcal82/e4c7e8113f377db184bb)
state_names = ["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]


In [71]:
def find_population_from_wiki(wiki_page_soup: BeautifulSoup):

    # get the table with the state and population
    table = None
    rows = None
    try:
        table = wiki_page_soup.find('table', class_='infobox vcard')
        rows = table.find_all('tr')
    except:
        print("No table found")
        return None

    # iterate through the rows and find the population
    for row in rows:

        if len(row.find_all('th')) != 0:
            #print(row.find_all('th'))
            if row.find_all('th')[0].string == "Population" or row.find_all('th')[0].string == "Students":
                return int(re.search(r'\d{1,3}(,\d{3})*', row.find_all('td')[0].get_text()).group().replace(",", ""))
            if row.find_all('th')[0].string == "Undergraduates":
                print(re.search(r'\d{1,3}(,\d{3})*', row.find_all('td')[0].get_text()).group())
                return int(int(re.search(r'\d{1,3}(,\d{3})*', row.find_all('td')[0].get_text()).group().replace(",", ""))*1.1)
    print("Population not found")
    return None

    

In [90]:
# create a list of the states of the universities
state = []
population = []

# create a list of the universities that couldn't be found
not_found = []

for i in range(len(df["University"])):
    print(df["University"][i])
    # check to see if one of the words in the university name is a state
    for word in df["University"][i].split():
        if word in state_names:
            state.append(word)
            break
        elif word == "SUNY" or word == "CUNY":
            state.append("New York")
            break
        elif word == "Rhode":
            state.append("Rhode Island")
            break
        else:
            state.append(None)

    url = "https://en.wikipedia.org/wiki/" + df["University"][i]
    # if the university has a space in its name, replace it with an underscore
    url = re.sub(r" ", "_", url)
    #print(url)
    # if the university has a parenthesis or a comma in its name, remove it along with the rest of the name
    url = re.sub(r"_\(.*", "", url)
    #url = re.sub(r",.*", "", url)
    # if the university has the words "Main Campus" in its name, remove it
    url = re.sub(r",_Main_Campus", "", url)

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # if the page displays: "Wikipedia does not have an article with this exact name", then the university was not found
    if len(soup.find_all("table", id="noarticletext")) > 0:
        url = "https://en.wikipedia.org/wiki/" + df["University"][i]
        # if the university has a space in its name, replace it with an underscore
        url = re.sub(r" ", "_", url)
        #print(url)
        # if the university has a parenthesis or a comma in its name, remove it along with the rest of the name
        url = re.sub(r"_\(.*", "", url)
        url = re.sub(r",.*", "", url)
        # if the university has the words "Main Campus" in its name, remove it
        url = re.sub(r"Main_Campus", "", url)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
    
    print(url)
        

    population.append(find_population_from_wiki(soup))
    print(population[-1])
    
    if state[-1] == None:
        print("State not found")
        # remove the None value from the very end of the list and continue to use wikipedia
        state = state[:-1]
    else:
        continue

    
    try:
        state.append(soup.find_all("div", class_="state")[0].string)
    except:
        try:
            state[-1] = soup.find_all("div", class_="locality")[0].string
            # if there's a comma+space in the locality, remove it along with everything before it
            state[-1] = re.sub(r".*, ", "", state[-1])
            print(str(int(i/len(df["University"])*1000)/10.0) + "%")
        except:
            state.append("N/A")
            population[-1] = "0"
            print("N/A: " + url)
            not_found.append(df["University"][i])

Pepperdine University
https://en.wikipedia.org/wiki/Pepperdine_University
10030
State not found
None


AttributeError: 'NoneType' object has no attribute 'append'

In [88]:
len(population)

11

In [89]:
print(len(state))
print(state)

21
['Pennsylvania', 'Tennessee', 'Texas', None, 'Texas', 'Arizona', None, 'Ohio', None, None, 'Mississippi', None, 'South Carolina', None, None, None, 'Florida', None, None, None, 'California']
