In [1]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import dash
import dash_table
import time
from fake_useragent import UserAgent


In [6]:
#constants
#headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 10; ONEPLUS A5010) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
#headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 OPR/77.0.4054.277'}
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
base_url = "https://www.niche.com/colleges/"
#filename = "./Grad"      #without .csv!
filename = "./Grad"
ua = UserAgent()

In [7]:
#----------Load and translate ISEP CSV table------------------
def loadNameList(filename):
    df = pd.read_csv(filename + ".csv")
    return list(df["University Name"]), list(df[" Chance of Placement"])
    
def namelistToURL(namelist):
    url_list = []
    for uni_name in namelist:
        element = uni_name.replace(" ", "-")
        element = element.replace("&", "-and-")
        element = element.replace("â€™", "")
        url_list.append(element)
        
    return url_list

#-------------------------
def getSoup(uni_url):
    url = base_url + uni_url + "/"

    # Make a GET request to fetch the raw HTML content
    #user_agent = ua.random
    #headers = {'User-Agent': user_agent}
    html_content = requests.get(url, headers=headers).text


    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
    
    return soup

#---------Get Table Content---------------  
def getOverallGrade(soup):
    try:
        mydivs = soup.find_all("div", {"class": "overall-grade__niche-grade"})
        grade = mydivs[0].contents[0].contents[1]
        grade = grade.replace(" minus", "-")
    except:
        grade = "Error"
    return grade

def getNameList(soup):
    labels = mydivs[0].contents[0].find_all("div", {"class": "profile-grade__label"})
    label_list = []

    for label in labels:
        label_list.append(label.contents[0])
        
    return label_list
    

def getValueList(soup):
    mydivs = soup.find_all("div", {"class": "profile__bucket--2"})
    #print(soup.prettify())
    values = mydivs[0].contents[0].find_all("div", {"class": "profile-grade--two"})
    value_list = []

    for value in values:
        element = value.contents[1].contents[1]
        element = element.replace(" minus", "-")
        value_list.append(element)
        
    return value_list

def foundSite(soup):
    if bool(soup.findAll(text="Access to this page has been denied.")):
        print(soup.prettify())
        #print("Access denied")
        return False
            
    if bool(soup.findAll(text="Page Not Found")):
        return False
    
    return True

def getStudentCount(soup):
    try:
        mydivs = soup.find_all("section", {"aria-label": "Students"})
        res = mydivs[0].contents[1].contents[0].contents[0].contents[0].contents[1].contents[0].contents[0]
        res = res.replace(",","")
        res = int(res)
    except:
        res = "Error"
        
    return res

def displayInteractiveTable(df):
    app = dash.Dash(__name__)

    app.layout = dash_table.DataTable(
        id='table',
        columns=[{"name": i, "id": i} for i in df.columns],
        data=df.to_dict('records'),
        filter_action="native",
        sort_action="native",
        sort_mode="multi",
        row_deletable=True,
        editable=True
    )

    app.run_server( "127.0.0.1", 8000,debug=True, use_reloader=False)  # Turn off reloader if inside Jupyter
    #app.run_server(debug=True, use_reloader=False)  # Turn off reloader if inside Jupyter

In [8]:

name_list, placement_list = loadNameList(filename)
url_list = namelistToURL(name_list)

#setup data lists
niche_score_list = []
report_scores = []
student_count = []

for i, uni_url in enumerate(url_list):
    print(str(i) + "/" + str(len(name_list)) + ": "+uni_url)
    soup = getSoup(uni_url)
    
    #if not(foundSite(soup)):
    #    new_uni_url = uni_url.replace("-at", "")
    #    soup = getSoup(new_uni_url)
    #    time.sleep(5)
    #if not(foundSite(soup)):
    #    new_uni_url = uni_url.replace(",", "-")
    #    soup = getSoup(new_uni_url)
    #    time.sleep(5)
    #if not(foundSite(soup)):
    #    new_uni_url = uni_url.replace("&", "-and-")
    #    soup = getSoup(new_uni_url)
    #    time.sleep(5)
    if not(foundSite(soup)):
        print("Page not found, invalid URL: " + uni_url)
        niche_score_list.append(None)
        report_scores.append([None, None, None, None, None, None,None, None, None, None, None, None])
        student_count.append(None)
    else:
        niche_score_list.append(getOverallGrade(soup))
        report_scores.append(getValueList(soup))
        student_count.append(getStudentCount(soup))
    
    time.sleep(5)

        


row_list = []
for i, name in enumerate(name_list):
    row = []
    row.append(name)
    row.append(student_count[i])
    row.append(placement_list[i])
    row.append(niche_score_list[i])
    [row.append(item) for item in report_scores[i]]
    row_list.append(row)   
    

    
names = ["University Name", "Student Count", "Chances of Placement","Overall Niche Grade",
        "Academics","Value", "Diversity", "Campus", "Athletics", "Party Scene", "Professors", "Location", "Dorms", "Campus Food", "Student Life", "Safety"]
df = pd.DataFrame(row_list, columns = names)

df.to_csv(filename + "_result" + ".csv")  # where to save it, usually as a .pkl

KeyError: ' Chance of Placement'

In [None]:
#undergrad_1 = pd.read_csv("./Undergrad_1_result.csv")
grad = pd.read_csv("./Grad.csv")
grad["Ranking"] = None
displayInteractiveTable(grad)

Dash is running on http://127.0.0.1:8000/

Dash is running on http://127.0.0.1:8000/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
