# BYU Graduate Program Analysis

BYU posts data about each of its graduate programs online. Which programs are the hardest to get into? Which programs have the highest test scores or the lowest? Which programs are the longest? Let's scrape the data and then do some analysis. 

In [1]:
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver

### Scraping the data
For this project, we will use selenium to open a web browser and follow a couple of simple input commands. We will then parse the HTML and retrieve the data.

In [32]:
#Site root, and the catalog page (under that root) that lists every program
gradStudies = 'https://gradstudies.byu.edu'
programsUrl = gradStudies + '/programs'

#Start the Chrome session used for every page load that follows
browser = webdriver.Chrome()

In [12]:
#Navigate to the list of all programs
browser.get(programsUrl)

#Process HTML
allProgHTML = browser.page_source
soup = BeautifulSoup(allProgHTML, 'html.parser')

#Parallel lists: progPages['Name'][i] is the program found at progPages['Link'][i]
progPages = {
    'Name': [],
    'Link': [],
}

#Start at the first catalog entry and walk across its sibling elements
programDiv = soup.find('div', attrs={'class': 'catalog-list-item'})
while programDiv:
    #Find the program name and its url
    link = programDiv.find('a')
    href = link.get('href') if link else None

    #Siblings without an anchor, and anchors without an href, are skipped so
    #that no "...None" link is recorded
    if href:
        #urljoin resolves root-relative, relative and absolute hrefs alike
        progPages['Link'].append(urljoin(gradStudies, href))
        progPages['Name'].append(link.text.strip())

    #Iterate (Go to the next program)
    programDiv = programDiv.find_next_sibling()

In [33]:

#Dictionary to store the data: one list per column, all kept the same length
progData = {'Program':[]}

#Visit each program page and get the data
for progLink, progName in zip(progPages['Link'], progPages['Name']):
    browser.get(progLink)
    progData['Program'].append(progName)
    #Length every column must reach once this page has been processed
    numPrograms = len(progData['Program'])

    #Parse HTML
    soup = BeautifulSoup(browser.page_source,'html.parser')
    table = soup.find('table',attrs={'class':'table table-hover'})
    while table:
        row = table.find('tr')
        while row:
            #Only rows holding both a name cell and a value cell are data rows.
            #Rows with fewer <td> cells are skipped instead of borrowing a cell
            #from a later row (or crashing when no later cell exists)
            cells = row.find_all('td')
            if len(cells) >= 2:
                factorName = cells[0].text.strip()
                factorValue = cells[1].text.strip()

                #A factor seen for the first time gets Nones for all earlier programs
                column = progData.setdefault(factorName, [None]*(numPrograms - 1))
                #Record at most one value per program, so a label repeated on the
                #same page cannot push the column out of step with 'Program'
                if len(column) < numPrograms:
                    column.append(factorValue)

            #Iterate
            row = row.find_next_sibling()
        table = table.find_next_sibling()

    #After completing each program page, fill in any unused factors with Nones
    for column in progData.values():
        if len(column) < numPrograms:
            column.append(None)
        
    

### Data Cleaning
What we just scraped is a dataset of the admission and program statistics of BYU graduate programs.

In [170]:
import pandas as pd
import numpy as np

In [171]:
#Build the table from the scraped columns, replacing missing entries with NaN
gradStats = pd.DataFrame(progData).fillna(value=np.nan)
#Write the table to disk
gradStats.to_csv('gradStatsBYU.csv')

In [172]:
#Summarize the raw scrape; the recorded output shows count/unique/top/freq rows
gradStats.describe()

Unnamed: 0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,GMAT Analytical Percentile,GMAT Analytical Score,GMAT Composite Score,GMAT Integrated Percentile,...,GRE Verbal Percentile,GRE Verbal Score,Graduated Per Year,International,LDS,LSAT Composite Score,Non-White,Percent Admitted,Program,Total Students
count,86.0,86.0,86.0,88.0,85,85,6.0,6.0,6.0,6.0,...,70.0,70.0,88.0,85,85,1.0,85,88.0,89,85
unique,69.0,77.0,40.0,37.0,41,32,6.0,4.0,6.0,6.0,...,61.0,50.0,65.0,15,50,1.0,21,80.0,89,49
top,4.0,8.4,3.67,2.0,4,5,39.3,4.0,615.4,57.9,...,78.0,155.8,2.0,0,20,160.8,1,40.0,Law (JD),14
freq,4.0,2.0,6.0,5.0,7,8,1.0,2.0,1.0,1.0,...,3.0,3.0,4.0,22,5,1.0,13,3.0,1,5


Comparing programs directly is difficult because different subjects require different admission tests. The main exams are the GRE, GMAT, and LSAT, so we will split the dataset by exam.

In [173]:
programs = gradStats.Program

#Each subset is taken as an explicit copy: the subsets are modified later on,
#and modifying a plain .loc slice triggers pandas' SettingWithCopyWarning

#Keep the programs that report GMAT data
mask = gradStats['GMAT Analytical Score'].notnull()
GMATProg = gradStats.loc[mask, :].copy()

#Keep the programs that report LSAT data
mask = gradStats['LSAT Composite Score'].notnull()
LSATProg = gradStats.loc[mask, :].copy()

#Keep the programs that report GRE data
mask = gradStats['GRE Analytical Score'].notnull()
GREProg = gradStats.loc[mask, :].copy()

In [174]:
#Summarize the GRE subset; in the recorded output some counts are 69, not 70
GREProg.describe()

Unnamed: 0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,GMAT Analytical Percentile,GMAT Analytical Score,GMAT Composite Score,GMAT Integrated Percentile,...,GRE Verbal Percentile,GRE Verbal Score,Graduated Per Year,International,LDS,LSAT Composite Score,Non-White,Percent Admitted,Program,Total Students
count,70.0,70.0,70.0,70.0,69,69,2.0,2.0,2.0,2.0,...,70.0,70.0,70.0,69,69,0.0,69,70.0,70,69
unique,55.0,64.0,36.0,34.0,31,28,2.0,2.0,2.0,2.0,...,61.0,50.0,54.0,13,40,0.0,18,66.0,70,40
top,2.0,16.0,3.63,2.2,13,6,49.3,4.3,540.5,57.9,...,78.0,155.8,2.0,0,20,,1,40.0,Microbiology and Molecular Biology (PhD),14
freq,3.0,2.0,5.0,5.0,6,7,1.0,1.0,1.0,1.0,...,3.0,3.0,3.0,20,5,,12,3.0,1,5


Now we will remove the columns that do not apply to the programs in each set. The GRE set needs one extra step first: one program does not include the typical data (this can be seen in the summary above, where several counts are 69 rather than 70). We will find this program and remove it before removing the other empty columns.

In [175]:
def _keepCompleteColumns(frame, reference='Admitted Per Year'):
    """Return a copy of frame without its partially filled columns.

    A column is kept when its number of non-missing values equals that of
    the `reference` column; every other column is dropped.

    Raises KeyError if `reference` is not a column of frame.
    """
    filled = frame.count()
    #.copy() makes the result independent, so later column assignments do
    #not trigger SettingWithCopyWarning
    return frame.loc[:, filled == filled[reference]].copy()


#Remove spurious columns
GMATProg = _keepCompleteColumns(GMATProg)

#Remove spurious columns
LSATProg = _keepCompleteColumns(LSATProg)

#Find offender
offender = GREProg.Program[GREProg.Female.isnull()]
#Remove offender (reassign rather than drop in place on a slice)
GREProg = GREProg.drop(offender.index, axis=0)
#Remove spurious columns
GREProg = _keepCompleteColumns(GREProg)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Since all of the data is numeric except for the program name, let's assign float values to the data.

In [176]:
#Cast every column except the program name to float, one frame at a time
for frame in (GREProg, GMATProg, LSATProg):
    numericColumns = [name for name in frame.columns if name != 'Program']
    for name in numericColumns:
        frame[name] = frame[name].astype(float)

In [177]:
#Summarize the GMAT programs; the recorded output shows mean/std/quantile rows
GMATProg.describe()

Unnamed: 0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,GMAT Analytical Percentile,GMAT Analytical Score,GMAT Composite Score,GMAT Integrated Percentile,...,GMAT Quantitative Percentile,GMAT Quantitative Score,GMAT Verbal Percentile,GMAT Verbal Score,Graduated Per Year,International,LDS,Non-White,Percent Admitted,Total Students
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,90.833333,138.133333,3.603333,1.266667,84.833333,31.166667,49.65,4.35,638.766667,56.466667,...,53.916667,41.45,74.35,36.0,78.533333,15.333333,123.166667,26.5,72.49,128.833333
std,54.982094,109.637269,0.113783,0.628225,41.19911,21.949184,8.230128,0.403733,58.208476,9.102234,...,13.6152,5.32156,7.997187,2.761159,40.639423,21.030137,86.142711,27.631504,13.389497,92.527654
min,51.6,57.4,3.4,0.7,57.0,7.0,39.3,4.0,540.5,48.5,...,35.2,33.4,60.0,30.7,46.0,4.0,60.0,7.0,56.37,60.0
25%,62.85,77.45,3.5675,0.7,62.25,15.5,45.675,4.075,617.375,50.425,...,46.0,38.775,72.6,35.925,53.4,5.25,74.5,10.25,65.74,76.25
50%,68.3,99.9,3.645,1.2,68.5,27.0,48.05,4.3,647.65,54.6,...,54.15,41.65,76.65,36.65,62.6,7.5,94.0,15.5,67.85,96.0
75%,89.65,136.45,3.67,1.775,83.75,43.75,52.75,4.375,685.5,57.65,...,61.475,45.2,77.925,37.675,87.85,9.75,121.75,28.25,82.9125,130.75
max,199.0,353.0,3.71,2.0,166.0,65.0,63.3,5.1,691.4,73.4,...,72.8,47.8,83.2,38.3,153.4,58.0,292.0,80.0,89.9,309.0


In [178]:
#Summarize the LSAT subset; the recorded output has count 1, hence std is NaN
LSATProg.describe()

Unnamed: 0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,Graduated Per Year,International,LDS,LSAT Composite Score,Non-White,Percent Admitted,Total Students
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0
std,,,,,,,,,,,,,
min,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0
25%,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0
50%,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0
75%,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0
max,204.4,544.8,3.64,2.7,221.0,140.0,138.4,12.0,375.0,160.8,65.0,36.4,386.0


In [179]:
#Summarize the GRE programs; the recorded output shows mean/std/quantile rows
GREProg.describe()

Unnamed: 0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,GRE Analytical Percentile,GRE Analytical Score,GRE Composite Score,GRE Quantitative Percentile,GRE Quantitative Score,GRE Verbal Percentile,GRE Verbal Score,Graduated Per Year,International,LDS,Non-White,Percent Admitted,Total Students
count,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,14.213043,25.613043,3.60971,3.265217,19.057971,11.347826,59.842029,4.110145,312.843478,61.053623,155.586957,72.053623,157.253623,10.584058,4.130435,28.26087,6.507246,57.33087,30.376812
std,25.052169,43.943088,0.108098,1.390707,24.007893,11.309582,10.764216,0.299089,6.463187,16.249177,5.23756,9.051691,2.742561,19.181292,8.47993,37.711229,10.986118,15.847331,39.617324
min,2.0,3.4,3.36,1.3,1.0,0.0,33.4,3.4,296.8,33.5,138.4,50.0,151.2,1.5,0.0,2.0,0.0,19.81,2.0
25%,4.8,8.2,3.55,2.2,8.0,4.0,54.3,4.0,309.2,48.1,152.0,67.0,155.6,3.4,0.0,12.0,2.0,47.14,13.0
50%,9.0,14.6,3.62,2.8,13.0,7.0,61.8,4.1,311.7,60.1,154.8,71.7,157.0,6.0,2.0,19.0,4.0,58.0,21.0
75%,14.4,27.0,3.69,4.0,19.0,15.0,66.0,4.3,318.4,76.3,159.4,78.2,159.0,11.4,4.0,31.0,8.0,68.66,36.0
max,199.0,353.0,3.86,6.8,166.0,65.0,83.0,4.8,327.4,91.0,165.3,89.6,163.8,153.4,58.0,292.0,80.0,85.71,309.0


Finally, we will set program name as the index of each dataframe. Since there is only one law program, we will get rid of it.

In [180]:
#Discard the LSAT subset by rebinding the name to None
LSATProg = None

#Label the GRE rows by program name
GREProg.index = GREProg.Program
#NOTE(review): the result of drop() is not assigned, so GREProg still carries
#its "Program" column after this line - confirm whether that is intended
GREProg.drop("Program",axis=1)

#Label the GMAT rows by program name
GMATProg.index = GMATProg.Program
#NOTE(review): also unassigned; the recorded cell output below is this returned
#frame, while GMATProg itself keeps the "Program" column - confirm
GMATProg.drop("Program",axis=1)

Unnamed: 0_level_0,Admitted Per Year,Applied Per Year,Average GPA,Average Years to Degree,BYU Undergraduate,Female,GMAT Analytical Percentile,GMAT Analytical Score,GMAT Composite Score,GMAT Integrated Percentile,...,GMAT Quantitative Percentile,GMAT Quantitative Score,GMAT Verbal Percentile,GMAT Verbal Score,Graduated Per Year,International,LDS,Non-White,Percent Admitted,Total Students
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accountancy - Professional (MAcc),95.2,146.2,3.71,0.7,88.0,13.0,39.3,4.0,691.4,56.9,...,61.7,45.6,71.3,38.3,96.0,5.0,85.0,10.0,65.12,89.0
Accountancy - Tax (MAcc),63.6,72.4,3.67,0.7,71.0,31.0,63.3,5.1,690.0,52.3,...,72.8,47.8,76.8,37.0,63.4,4.0,71.0,7.0,87.85,72.0
Business Administration (MBA),199.0,353.0,3.55,1.7,166.0,65.0,53.9,4.4,672.0,57.9,...,60.8,44.0,83.2,37.9,153.4,58.0,292.0,80.0,56.37,309.0
Business Administration - Executive Program (MBA),73.0,107.2,3.4,2.0,66.0,23.0,46.8,4.0,615.4,48.5,...,45.5,38.6,76.5,35.8,61.8,9.0,128.0,20.0,68.1,140.0
Information Systems Management (MISM),51.6,57.4,3.67,0.7,57.0,7.0,45.3,4.3,623.3,73.4,...,47.5,39.3,78.3,36.3,50.6,6.0,60.0,11.0,89.9,60.0
Public Administration (MPA),62.6,92.6,3.62,1.8,61.0,48.0,49.3,4.3,540.5,49.8,...,35.2,33.4,60.0,30.7,46.0,10.0,103.0,31.0,67.6,103.0
