In [270]:
from bs4 import BeautifulSoup
import requests
import re
import json
import time, os

In [127]:
def clean_string(string):
    """Drop every newline from ``string``, then trim leading/trailing whitespace."""
    without_newlines = ''.join(string.split('\n'))
    return without_newlines.strip()

def get_teachers_staff(elem):
    '''
    Input: the "Teachers & staff" section element (anything exposing a
           BeautifulSoup-style ``find_all``).
    Output: dictionary mapping each description found in the section to its value.

    Labels and values are collected separately and paired by position, so any
    surplus entries on either side are dropped by ``zip``.
    '''
    labels = []
    values = []

    # Rating-style rows: label text, and score text with inner spaces removed.
    for node in elem.find_all(class_='col-xs-6 rating-score-item__label'):
        labels.append(clean_string(node.text))
    for node in elem.find_all(class_='rating-score-item__score'):
        values.append(clean_string(node.text).replace(' ', ''))

    # Test-score containers: the label is the element right after the container.
    for container in elem.find_all(class_='test-score-container clearfix'):
        labels.append(clean_string(container.findNext().text))
    # Their values have the percent sign stripped.
    for node in elem.find_all(class_='score'):
        values.append(node.text.replace('%', ''))

    return dict(zip(labels, values))

def pull_out_json(string):
    """Decode the JSON text enclosed by a single tag.

    The payload is taken as everything after the first '>' and before the
    last '<' of ``string``; the decoded Python object is returned.
    Invalid JSON raises ``json.JSONDecodeError``.
    """
    start = string.find('>') + 1
    stop = string.rfind('<')
    return json.loads(string[start:stop])

def student_demographics(elem):
    """Build a ``{'Pop <group>': <value>}`` dictionary from the legend titles.

    The 'legend-title' divs are read in document order with '%' removed.
    Even positions are treated as group names and odd positions as their
    values; an unpaired trailing name is ignored.
    """
    texts = []
    for node in elem.find_all('div', class_='legend-title'):
        texts.append(node.text.replace('%', ''))

    group_names = texts[0::2]
    group_values = texts[1::2]
    return {'Pop ' + name: value for name, value in zip(group_names, group_values)}

def get_college(elem):
    """Extract ``{breakdown: score}`` pairs from a college section payload.

    Reads the list at ``elem['data'][0]['data'][0]['values']`` and maps each
    entry's 'breakdown' to its 'score'.

    Extraction is best-effort: if the payload does not have that shape, the
    pairs collected so far (possibly none) are returned. Only lookup and type
    errors are suppressed, so KeyboardInterrupt and other unrelated failures
    still propagate.
    """
    college_dict = {}
    try:
        college_list = elem['data'][0]['data'][0]['values']

        for entry in college_list:
            college_dict[entry['breakdown']] = entry['score']
    except (LookupError, TypeError):
        # Missing key/index or a non-subscriptable node: keep what was gathered.
        pass

    return college_dict

def get_advanced_courses(elem):
    """Map each entry of ``elem['courses']`` from its 'breakdown' to its 'score'.

    Raises ``KeyError`` if 'courses' or an entry's 'breakdown'/'score' is missing.
    """
    scores_by_course = {}
    for entry in elem['courses']:
        score = entry['score']
        scores_by_course[entry['breakdown']] = score
    return scores_by_course

def get_race_ethnicity(temp):
    """Flatten race/ethnicity score groups into one dictionary.

    ``temp`` is an iterable of dicts with a 'data' list. Each item of that
    list has a 'title' and a 'values' list whose entries carry 'breakdown'
    and 'score'. Groups titled 'Overview' are skipped.

    Returns a dict keyed by '<title>;<breakdown>' with the score as value.
    Entries missing 'breakdown' or 'score' (or otherwise malformed) are
    skipped; a missing 'data', 'title' or 'values' key raises ``KeyError``.
    """
    race_dict = {}

    for elem in temp:
        groups = elem['data']
        if not groups:
            continue
        for sub in groups:
            title = sub['title']
            if title == 'Overview':
                continue
            for sub2 in sub['values']:
                try:
                    new_title = title + ';' + sub2['breakdown']
                    # Write straight into the result; no need to re-merge a
                    # growing intermediate dict on every entry.
                    race_dict[new_title] = sub2['score']
                except (LookupError, TypeError):
                    # Malformed entry: skip it but keep processing the rest.
                    continue

    return race_dict

def get_basic_info(soup):
    """Collect the headline facts of a school profile page.

    Returns a dict with the keys 'state', 'district', 'school_name',
    'address', 'rating', 'grades', 'students' and 'school_type'.
    'district' is '' when the page has no district element; 'address' and
    'rating' are passed through ``clean_string``.

    Apart from the district, every element is required: a missing one
    surfaces as ``AttributeError`` (attribute access on the ``None`` that
    ``find`` returns), and fewer than three school-info labels raises
    ``IndexError``.
    """
    state = soup.find(class_='school-profile').find('a').text

    # Look the optional district element up once and reuse the result.
    district_elem = soup.find(class_='school-contact__item school-contact__district-name')
    district = district_elem.find('a').text if district_elem is not None else ''

    school_name = soup.find(class_='school-name').text
    address = soup.find(class_='school-contact__item school-contact__address').text
    rating = soup.find(class_='gs-rating-with-label__rating').text

    # The last three labels in the school-info box are grades, students and type;
    # each value lives in the element that follows its label.
    info_labels = soup.find(class_='school-info').find_all('div', class_='label')
    grades = info_labels[-3].findNext().text
    students = info_labels[-2].findNext().text
    school_type = info_labels[-1].findNext().text

    return {
        'state': state,
        'district': district,
        'school_name': school_name,
        'address': clean_string(address),
        'rating': clean_string(rating),
        'grades': grades,
        'students': students,
        'school_type': school_type,
    }

def create_section_dict(soup):
    """Split the page's profile sections into JSON-backed and HTML-only ones.

    Every ``div.profile-section-slot`` is inspected:

    * If it contains ``<script>`` tags, each script is decoded with
      ``pull_out_json`` and stored in ``json_dict`` under its 'title'.
      Scripts that are not valid JSON, or whose payload has no usable
      'title', are skipped.
    * Otherwise the section element itself is stored in ``html_dict`` under
      the ``name`` attribute of its first ``<a>``; sections without such an
      anchor or attribute are skipped.

    Returns the tuple ``(json_dict, html_dict)``.
    """
    json_dict = {}
    html_dict = {}

    for section in soup.find_all('div', class_='profile-section-slot'):
        scripts = section.find_all('script')
        if scripts:
            for script in scripts:
                try:
                    payload = pull_out_json(str(script))
                    json_dict[payload['title']] = payload
                except (KeyError, TypeError, ValueError):
                    # ValueError covers json.JSONDecodeError (non-JSON script);
                    # KeyError/TypeError cover payloads without a usable title.
                    continue
        else:
            anchor = section.find('a')
            name = anchor.get('name') if anchor is not None else None
            if name:
                html_dict[name] = section

    return json_dict, html_dict

def pull_json_vals(json_dict):
    """Merge the values parsed from the recognised JSON sections into one dict.

    Sections titled 'College readiness', 'College success', 'Advanced courses'
    and 'Race/ethnicity' are handed to their parser; any other title
    contributes nothing. Later sections overwrite earlier ones on key clashes.
    """
    merged = {}

    for title, payload in json_dict.items():
        if title in ('College readiness', 'College success'):
            merged.update(get_college(payload))
        elif title == 'Advanced courses':
            merged.update(get_advanced_courses(payload))
        elif title == 'Race/ethnicity':
            merged.update(get_race_ethnicity(payload['data']))

    return merged

def pull_html_vals(html_dict):
    """Merge the values parsed from the recognised HTML sections into one dict.

    Only the 'Students' and 'Teachers_staff' sections are parsed; every other
    anchor name contributes nothing.
    """
    merged = {}

    for anchor_name, section in html_dict.items():
        if anchor_name == 'Students':
            merged.update(student_demographics(section))
        elif anchor_name == 'Teachers_staff':
            merged.update(get_teachers_staff(section))

    return merged

In [1]:
import pickle

# Load the mapping of state slug -> collection keyed by school URL path.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open a pickle file that you created yourself.
picklefile_name = 'all_states_highschool_names.pkl'
with open(picklefile_name, 'rb') as picklefile: 
    allsates_urls_dict = pickle.load(picklefile)

In [2]:
# Show which state slugs are available as top-level keys.
allsates_urls_dict.keys()

dict_keys(['alabama', 'washington-dc', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new-hampshire', 'new-jersey', 'new-mexico', 'new-york/new-york', 'north-carolina', 'north-dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode-island', 'south-carolina', 'south-dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington', 'west-virginia', 'wisconsin', 'wyoming', 'florida'])

In [379]:
# Fresh accumulators: one for parsed school records, one for URLs whose
# download did not succeed.
school_list, page_errors = [], []

print(len(school_list))
len(page_errors)

0


0

In [380]:
# States handled in this run, in scraping order.
state_list = [
    'maine', 'maryland', 'massachusetts', 'michigan',
    'minnesota', 'mississippi', 'missouri', 'montana',
    'nebraska', 'nevada', 'new-hampshire', 'new-jersey',
    'new-mexico', 'new-york/new-york', 'north-carolina', 'north-dakota',
    'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode-island', 'south-carolina', 'south-dakota', 'tennessee',
    'texas', 'utah', 'vermont', 'virginia',
    'washington', 'west-virginia', 'wisconsin', 'wyoming',
]

# Total number of school URLs across those states.
agg = sum(len(allsates_urls_dict[state].keys()) for state in state_list)
agg

24154

In [381]:
# Flatten the per-state URL keys into a single array, following state_list order.
full_list = [
    school_path
    for state_name in state_list
    for school_path in allsates_urls_dict[state_name].keys()
]
current_state_arr = np.array(full_list)

# The intermediate Python list is no longer needed once the array exists.
del full_list

In [382]:
# Sanity check: should equal the `agg` total computed from the same states.
len(current_state_arr)

24154

In [79]:
# test_dict = {}

# test_dict['/florida/lake-worth/10002-Trinity-Christian-Academy/'] = 1

In [297]:

url = "https://www.greatschools.org{}"

# Download and parse every school page in filtered_array.
# Successful pages add one record to school_list; pages that fail to download
# (non-200 status or a network error) add their URL to page_errors.
for school in filtered_array:
    final_url = url.format(school)

    try:
        # A timeout keeps a single stalled connection from hanging the whole run.
        response = requests.get(final_url, timeout=30)
    except requests.RequestException:
        # Network-level failure: handled below like any other failed download.
        response = None

    if response is not None and response.status_code == 200:
        page = response.text
        soup = BeautifulSoup(page)

        school_dict = get_basic_info(soup)

        # Schools reporting zero students keep only their basic information.
        if school_dict['students'] != '0':
            json_dict, html_dict = create_section_dict(soup)
            json_values = pull_json_vals(json_dict)
            html_values = pull_html_vals(html_dict)

            school_dict.update(json_values)
            school_dict.update(html_values)

        # Store the record here, on the success path. Appending from the
        # failure branch would drop every good page and re-add a stale record.
        school_list.append(school_dict)
    else:
        page_errors.append(final_url)

In [401]:
url = "https://www.greatschools.org{}"

# Download and parse every school page in filtered_array, pausing after each
# batch of 1000 requests. Successful pages add one record to school_list;
# pages that fail to download (non-200 status or a network error) add their
# URL to page_errors.
for idx, school in enumerate(filtered_array):
    final_url = url.format(school)

    try:
        # A timeout keeps a single stalled connection from hanging the multi-hour run.
        response = requests.get(final_url, timeout=30)
    except requests.RequestException:
        # Network-level failure: handled below like any other failed download.
        response = None

    if response is not None and response.status_code == 200:
        page = response.text
        # NOTE(review): no parser is named, so bs4 picks whichever is installed.
        soup = BeautifulSoup(page)

        school_dict = get_basic_info(soup)

        # Schools reporting zero students keep only their basic information.
        if school_dict['students'] != '0':
            json_dict, html_dict = create_section_dict(soup)
            json_values = pull_json_vals(json_dict)
            html_values = pull_html_vals(html_dict)

            school_dict.update(json_values)
            school_dict.update(html_values)

        school_list.append(school_dict)
    else:
        page_errors.append(final_url)

    # Pause for 450 seconds after every 1000th request.
    if ((idx + 1) % 1000 == 0):
        print(f'sleeping: {time.ctime()}')
        time.sleep(450)
        print(f'{idx}: {time.ctime()}')

sleeping: Sun Jan 17 08:16:35 2021
999: Sun Jan 17 08:24:05 2021
sleeping: Sun Jan 17 08:34:46 2021
1999: Sun Jan 17 08:42:16 2021
sleeping: Sun Jan 17 08:53:31 2021
2999: Sun Jan 17 09:01:01 2021
sleeping: Sun Jan 17 09:12:11 2021
3999: Sun Jan 17 09:19:41 2021
sleeping: Sun Jan 17 09:31:39 2021
4999: Sun Jan 17 09:39:09 2021
sleeping: Sun Jan 17 09:50:23 2021
5999: Sun Jan 17 09:57:53 2021
sleeping: Sun Jan 17 10:11:02 2021
6999: Sun Jan 17 10:18:32 2021
sleeping: Sun Jan 17 10:30:19 2021
7999: Sun Jan 17 10:37:49 2021
sleeping: Sun Jan 17 10:50:54 2021
8999: Sun Jan 17 10:58:24 2021
sleeping: Sun Jan 17 11:10:22 2021
9999: Sun Jan 17 11:17:52 2021
sleeping: Sun Jan 17 11:30:11 2021
10999: Sun Jan 17 11:37:41 2021
sleeping: Sun Jan 17 11:46:31 2021
11999: Sun Jan 17 11:54:01 2021
sleeping: Sun Jan 17 12:04:51 2021
12999: Sun Jan 17 12:12:21 2021
sleeping: Sun Jan 17 12:23:39 2021
13999: Sun Jan 17 12:31:09 2021
sleeping: Sun Jan 17 12:46:48 2021
14999: Sun Jan 17 12:54:18 2021
sleepi

In [402]:
# Number of school records collected so far.
len(school_list)

26154

In [408]:
# Shallow copy: rest_list is a new list, but it holds the same dict objects as school_list.
rest_list = school_list.copy()

In [409]:
# The copy should have the same length as school_list.
len(rest_list)

26154

In [284]:
import random

# Pause for a random duration of at least 0.5 s and less than 2.5 s.
# NOTE(review): looks like an experiment with a per-request delay; the scraping loops do not use it.
time.sleep(.5+2*random.random())

In [315]:
# Show the URL most recently built by the scraping loop.
final_url

'https://www.greatschools.org/kentucky/mount-sterling/1917-Hillcrest-Hall-Treatment-Center/'

In [376]:
# print(json_dict.keys())
# print(html_dict.keys())

In [3]:
#list(allsates_urls_dict['kentucky']).index('/kentucky/mount-sterling/1917-Hillcrest-Hall-Treatment-Center/')
# List the school URL paths stored for North Dakota.
list(allsates_urls_dict['north-dakota'])

['/north-dakota/blaisdell/123-Berthold-High-School/',
 '/north-dakota/drayton/161-Drayton-High-School/',
 '/north-dakota/glen-ullin/224-Glen-Ullin-High-School/',
 '/north-dakota/hatton/264-Hatton-High-School/',
 '/north-dakota/hettinger/270-Hettinger-High-School/',
 '/north-dakota/hillsboro/272-Hillsboro-High-School/',
 '/north-dakota/kulm/284-Kulm-High-School/',
 '/north-dakota/tower-city/321-Maple-Valley-High-School/',
 '/north-dakota/washburn/516-Washburn-High-School/',
 '/north-dakota/kindred/57-Kindred-High-School/',
 '/north-dakota/drake/62-Drake-High-School/',
 '/north-dakota/towner/678-Tgu-Towner-High-School/',
 '/north-dakota/glenfield/93-Midkota-High-School/',
 '/north-dakota/park-river/1325-Park-River-High-School/',
 '/north-dakota/buxton/148-Central-Valley-High-School/',
 '/north-dakota/edgeley/167-Edgeley-High-School/',
 '/north-dakota/flasher/215-Flasher-High-School/',
 '/north-dakota/lidgerwood/300-Lidgerwood-High-School/',
 '/north-dakota/maddock/307-Maddock-High-School

In [392]:
# NOTE(review): np is also used by the earlier cell that builds current_state_arr,
# and filtered_array is consumed by the scraping loops above this cell. On a fresh
# kernel those cells fail unless this import and assignment run first.
import numpy as np

# Skip the first 3000 URLs — presumably already scraped in an earlier run; confirm.
filtered_list = current_state_arr[3000:]

filtered_array = np.array(filtered_list)

In [393]:
# Number of URLs remaining after the first 3000 were skipped.
len(filtered_array)

21154

In [377]:
#school_list[838]

In [None]:
# NOTE(review): this displays the entire list; a slice such as rest_list[:5] keeps the notebook readable.
rest_list

In [419]:
# Number of records about to be appended to the combined list.
len(rest_list)

26154

In [420]:
# NOTE(review): in-place and not idempotent — re-running this cell appends rest_list again.
# indiana_through_wyoming is not created in any visible cell; it must already exist in the kernel.
indiana_through_wyoming.extend(rest_list)

In [421]:
# Size of the combined list after the extend above.
len(indiana_through_wyoming)

29258

In [356]:
# Remove the name from the kernel namespace.
# NOTE(review): alabama_through_illinois is not defined in any visible cell, so this raises NameError on a fresh kernel.
del(alabama_through_illinois)

In [422]:
#alabama - 820
#washington dc - 92
# NOTE(review): the two figures above are presumably per-state school counts — confirm.

# Persist the combined list of school records to disk.
picklefile_name = 'indiana_through_wyoming.pkl'
with open(picklefile_name, 'wb') as picklefile:
    pickle.dump(indiana_through_wyoming, picklefile)

In [378]:
# Total number of school URLs across every state in the mapping.
count = sum(len(v) for v in allsates_urls_dict.values())

count

42657