# Scraping

In [31]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Headless browser session used by every scraping cell below.
# NOTE(review): recent Selenium releases no longer provide a PhantomJS driver;
# confirm the installed version or switch to a headless Chrome/Firefox driver.
driver = webdriver.PhantomJS()
driver.get('https://beta.moe.gov.sg/schoolfinder/?journey=Secondary%20school')

# href of every school card found across the paginated result list.
school_links = []

# Walk at most 8 result pages, collecting card links and then clicking the
# "next page" button. The loop stops early if the button cannot be found.
for i in range(0, 8):
    try:
        # Block (up to 8 s) until at least one school card is in the DOM.
        # NOTE(review): after a click this may be satisfied by cards from the
        # previous page if the DOM has not been refreshed yet — confirm that
        # no page is captured twice.
        element_present = EC.presence_of_element_located((By.CSS_SELECTOR, 'a.moe-card'))
        WebDriverWait(driver, 8).until(element_present)

        # find_elements(By..., ...) replaces the find_elements_by_* helpers,
        # which newer Selenium versions have removed.
        schools = driver.find_elements(By.CSS_SELECTOR, 'a.moe-card')

        if not schools:
            print("On page " + str(i) + " no schools were found")
        else:
            print("Page " + str(i) + " found results YAYYY")
            for school in schools:
                school_links.append(school.get_attribute('href'))

        # Pagination: an empty list means there is no "next" button to press.
        button = driver.find_elements(By.CSS_SELECTOR, '.moe-pagination__btn.dir--right')
        if not button:
            print('button problem at page ' + str(i))
            break
        else:
            button[0].click()

    except TimeoutException:
        # The wait above expired; report it and move on to the next iteration.
        print ("Timed out waiting for page " + str(i) + " to load")

Page 0 found results YAYYY
Page 1 found results YAYYY
Page 2 found results YAYYY
Page 3 found results YAYYY
Page 4 found results YAYYY
Page 5 found results YAYYY
Page 6 found results YAYYY
Page 7 found results YAYYY


In [121]:
from selenium.common.exceptions import NoSuchElementException

In [137]:
# Links whose page showed some <table> but not at the expected XPath.
table_failed = []
# Links whose page never showed a <table> within the wait timeout.
page_failed = []
# innerHTML of each table that was located, in the order the links were visited.
results = []

for school in school_links:
    try:
        driver.get(school)
        # Block (up to 8 s) until any <table> element is present in the DOM.
        element_present = EC.presence_of_element_located((By.TAG_NAME, 'table'))
        WebDriverWait(driver, 8).until(element_present)

        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium versions have removed.
        # NOTE(review): this absolute XPath depends on the exact page layout.
        table = driver.find_element(By.XPATH, '//*[@id="moe-school-finder"]/div/div/div[2]/div[1]/div/div[2]/div/table')
        results.append(table.get_attribute('innerHTML'))

    except TimeoutException:
        page_failed.append(school)

    except NoSuchElementException:
        table_failed.append(school)

In [146]:
import pickle
# Pickle the list of raw table HTML strings to disk.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open('../data/school_raw.txt', "wb") as fp:
    pickle.dump(results, fp)

In [150]:
# Every link that produced no table: page timeouts first, then missing tables.
not_scraped = [*page_failed, *table_failed]

In [221]:
# Pickle the list of links that could not be scraped.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open("../data/not_scraped.txt", "wb") as fp:
    pickle.dump(not_scraped, fp)

In [170]:
# Pickle the full list of scraped school links.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open("../data/school_links.txt", "wb") as fp:
    pickle.dump(school_links, fp)

In [223]:
# Plain-text copy of the failed links, one per line.
with open('../data/not_scrapedTEXT.txt', 'w') as fp:
    fp.writelines('%s\n' % school for school in not_scraped)

# Parsing

In [None]:
from bs4 import BeautifulSoup

In [215]:
def parse_result(result):
    """Parse one table's HTML into a dict keyed by each row's header text.

    Parameters
    ----------
    result : str
        HTML markup containing ``<tr>`` rows. Each useful row has a ``<th>``
        header cell and one or more ``<td>`` data cells.

    Returns
    -------
    dict
        Maps the ``<th>`` text of each row to either
        * the text of its only ``<td>``, or
        * ``{'affliated': ..., 'non-affliated': ...}`` built from the first
          two ``<td>`` cells when the row has more than one.
        Rows without any ``<td>`` or without a ``<th>`` are skipped.
    """
    soup = BeautifulSoup(result, 'html.parser')
    output = {}
    for row in soup.find_all('tr'):
        header = row.find('th')
        data = row.find_all('td')
        # Skip header-only rows, and rows with no <th> to use as a key
        # (row.find returns None there, which previously raised AttributeError).
        if header is None or len(data) == 0:
            continue
        if len(data) == 1:
            output[header.text] = data[0].text
        else:
            # Key spelling is kept as-is: downstream column names depend on it.
            output[header.text] = {'affliated': data[0].text, 'non-affliated': data[1].text}
    return output

In [216]:
# Two sample parses: index 0 is an example without the affiliated split,
# index 5 an example with affiliated / non-affiliated values.
for example_index in (0, 5):
    print(parse_result(results[example_index]))

{'Express': '203 - 231', 'Normal (Academic)': '168 - 197', 'Normal (Technical)': '132 - 157'}
{'Integrated Programme': {'affliated': ' - ', 'non-affliated': '256 - 274'}, 'Express': {'affliated': '235 - 255', 'non-affliated': '252 - 255'}, 'Normal (Academic)': {'affliated': ' - ', 'non-affliated': ' - '}, 'Normal (Technical)': {'affliated': ' - ', 'non-affliated': ' - '}}


In [191]:
def get_sch_from_link(link):
    """Return the raw value of the ``school`` query parameter in ``link``.

    Unlike a plain ``split('?school=')``, this also works when ``school`` is
    not the first query parameter, and it stops at the next ``&`` or ``#`` so
    trailing parameters or fragments are not included in the result.

    Parameters
    ----------
    link : str
        URL containing ``?school=<value>`` or ``&school=<value>``.

    Returns
    -------
    str
        The parameter value exactly as it appears in the URL (not URL-decoded).

    Raises
    ------
    IndexError
        If the link has no ``school`` parameter. The exception type is the one
        the original ``split(...)[1]`` implementation raised.
    """
    import re

    # [?&] anchors on a real parameter boundary, so e.g. "preschool=" is ignored.
    match = re.search(r'[?&]school=([^&#]*)', link)
    if match is None:
        raise IndexError("no 'school' query parameter in link: %r" % (link,))
    return match.group(1)

# Sanity check: show the identifier extracted from the first scraped link.
get_sch_from_link(school_links[0])

'admiralty-secondary-school'

In [193]:
# Keep, in original order, only the links that are absent from not_scraped.
successful_schools = list(filter(lambda link: link not in not_scraped, school_links))

In [217]:
# zip() silently stops at the shorter input. If the two lists ever differ in
# length, tables would be paired with the wrong school, so fail loudly instead.
if len(successful_schools) != len(results):
    raise ValueError(
        "successful_schools (%d) and results (%d) differ in length; "
        "links and scraped tables are out of sync"
        % (len(successful_schools), len(results))
    )

# One record per school: the parsed table plus the school identifier from its link.
data_json = []
for link, result in zip(successful_schools, results):
    result_output = parse_result(result)
    result_output['school'] = get_sch_from_link(link)
    data_json.append(result_output)

In [209]:
import pandas as pd

In [218]:
# Flatten the nested records into columns; nested dict keys become
# "<outer>.<inner>" column names.
# pandas >= 1.0 exposes json_normalize at the top level and deprecates the
# pd.io.json path; fall back to the old location on older installations.
try:
    df = pd.json_normalize(data_json)
except AttributeError:
    df = pd.io.json.json_normalize(data_json)
df

Unnamed: 0,Express,Normal (Academic),Normal (Technical),school,Integrated Programme.affliated,Integrated Programme.non-affliated,Express.affliated,Express.non-affliated,Normal (Academic).affliated,Normal (Academic).non-affliated,Normal (Technical).affliated,Normal (Technical).non-affliated
0,203 - 231,168 - 197,132 - 157,admiralty-secondary-school,,,,,,,,
1,224 - 239,177 - 196,131 - 156,ahmad-ibrahim-secondary-school,,,,,,,,
2,245 - 260,188 - 199,140 - 157,anderson-secondary-school,,,,,,,,
3,229 - 242,170 - 199,130 - 153,ang-mo-kio-secondary-school,,,,,,,,
4,241 - 259,-,-,anglican-high-school,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
112,218 - 233,166 - 198,118 - 154,yuan-ching-secondary-school,,,,,,,,
113,188 - 229,153 - 181,116 - 157,yuhua-secondary-school,,,,,,,,
114,188 - 230,152 - 193,123 - 156,yuying-secondary-school,,,,,,,,
115,210 - 237,169 - 198,129 - 158,zhenghua-secondary-school,,,,,,,,


In [219]:
# Write the flattened table to CSV (the DataFrame index is written as the first column).
# NOTE(review): this path is 'data/...' while the other cells write to '../data/...' —
# confirm the intended output directory.
df.to_csv('data/school_scores.csv')