In [25]:
# Dependencies
import pandas as pd

from bs4 import BeautifulSoup
import requests
import pymongo

from sqlalchemy import create_engine
from config import username
from config import password

ModuleNotFoundError: No module named 'config'

# Datasource 1

In [None]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [None]:
# Define database and collection
db = client.schools_db
collection = db.applecross

In [None]:
# URL of page to be scraped - Data source 1 – REIWA Applecross suburb profile, secondary schools
url = 'https://reiwa.com.au/suburb/applecross/'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

print(soup)

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--><!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]--><!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]--><!--[if gt IE 8]><!--><html class="no-js" lang="en">
<!--<![endif]-->
<head><title>
	Applecross Suburb Profile | Property Market, House Prices and More - REIWA
</title><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><meta content="Interested in buying, renting or investing in Applecross? View median house prices, growth rates, demographics, schools and more with our suburb profile." name="description"/><meta content="REIWA" name="author"/><link href="/WorkArea/FrameworkUI/css/ektron.stylesheet.ashx?id=-991207082+-409962787+-2046276955+-1495356069+490163735+-719310687+-384744968+-987815539+2007979625+-2081147131+-545874270+1317633679+-1252017282+-707468958+-2117417196+-1057278288+1510350268+-559785714+-243909663+1684155

In [None]:
# Examine the results, then determine element that contains sought info
results = soup.find_all('td', class_='text-left col-xs-8')[4]('span')
print(results)

[<span id="ctl00_uxContentHolder_dzMainMid_uxColumnDisplay_ctl00_uxControlColumn_ctl00_uxWidgetHost_uxWidgetHost_widget_ctl00_StatsOtherNearbySchools"> Applecross Senior High School, Aquinas College, Santa Maria College, Como Secondary College, Melville Senior High School, Corpus Christi College, Penrhos College, Rossmoyne Senior High School, All Saints' College, Wesley College</span>]


In [None]:
# pick out only the data elements
data_schools = [element.text for element in results][0]
print(data_schools)

 Applecross Senior High School, Aquinas College, Santa Maria College, Como Secondary College, Melville Senior High School, Corpus Christi College, Penrhos College, Rossmoyne Senior High School, All Saints' College, Wesley College


In [None]:
split_data=data_schools.split(',')
print(split_data)

[' Applecross Senior High School', ' Aquinas College', ' Santa Maria College', ' Como Secondary College', ' Melville Senior High School', ' Corpus Christi College', ' Penrhos College', ' Rossmoyne Senior High School', " All Saints' College", ' Wesley College']


In [None]:
# Add the list of schools to a dataframe for filtering
# rename column
applecross_df = pd.DataFrame(split_data).rename(columns = {0:'School_Name'})

# create id
applecross_df.insert(0, 'ID', range(0, 0 + len(applecross_df)))

applecross_df.head()


Unnamed: 0,ID,School_Name
0,0,Applecross Senior High School
1,1,Aquinas College
2,2,Santa Maria College
3,3,Como Secondary College
4,4,Melville Senior High School


In [None]:
# make it upper case for consistency with other datasets
applecross_df['School_Name']=applecross_df['School_Name'].str.upper()

applecross_df.head()


Unnamed: 0,ID,School_Name
0,0,APPLECROSS SENIOR HIGH SCHOOL
1,1,AQUINAS COLLEGE
2,2,SANTA MARIA COLLEGE
3,3,COMO SECONDARY COLLEGE
4,4,MELVILLE SENIOR HIGH SCHOOL


# Datasource 2

In [None]:
# CSV for 2)	Data source 2 – Alphabetical List of Western Australian Schools (XLSX)

# Store CSV into df
csv_file = "Resources/WASchoolsList_cleaned.csv"
WASchools_df = pd.read_csv(csv_file)
WASchools_df.head()

Unnamed: 0,Code,School Name,Street,Suburb,State,Postcode,Postal Street,Postal Suburb,Postal State,Postal Postcode,...,Y06,UPR,Y07,Y08,Y09,Y10,Y11,Y12,USE,Total Students
0,5001,ADAM ROAD PRIMARY SCHOOL,HOTCHIN STREET,SOUTH BUNBURY,WA,6230,HOTCHIN STREET,SOUTH BUNBURY,WA,6230,...,45.0,,,,,,,,,446
1,1463,AL-AMEEN COLLEGE,57 SOUTHGATE ROAD,LANGFORD,WA,6147,57 SOUTHGATE ROAD,LANGFORD,WA,6147,...,96.0,,83.0,79.0,66.0,67.0,48.0,45.0,,1136
2,1397,AL-HIDAYAH ISLAMIC SCHOOL,CNR HEDLEY ST/NYAMUP WAY,BENTLEY,WA,6102,CNR HEDLEY ST/NYAMUP WAY,BENTLEY,WA,6102,...,24.0,,,,,,,,,202
3,8001,ALBANY COMMUNITY KINDERGARTEN,136 SERPENTINE ROAD,ALBANY,WA,6330,PO BOX 5670,ALBANY,WA,6332,...,,,,,,,,,,29
4,5002,ALBANY PRIMARY SCHOOL,SUFFOLK STREET,ALBANY,WA,6330,SUFFOLK STREET,ALBANY,WA,6330,...,58.0,,,,,,,,,426


In [None]:
# look at all the columns available
WASchools_df.columns

Index(['Code', 'School Name', 'Street', 'Suburb', 'State', 'Postcode',
       'Postal Street', 'Postal Suburb', 'Postal State', 'Postal Postcode',
       'Latitude', 'Longitude', 'Phone', 'Education Region',
       'Broad Classification', 'Classification Group', 'Low Year', 'High Year',
       'KIN', 'PPR', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'UPR', 'Y07',
       'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'USE', 'Total Students'],
      dtype='object')

In [None]:
# create new data with only relevant columns
new_WASchools_df = WASchools_df[['School Name', 'Classification Group', 'Y12']].copy()
new_WASchools_df.head()


Unnamed: 0,School Name,Classification Group,Y12
0,ADAM ROAD PRIMARY SCHOOL,PRIMARY SCHOOLS,
1,AL-AMEEN COLLEGE,NON-GOVERNMENT,45.0
2,AL-HIDAYAH ISLAMIC SCHOOL,NON-GOVERNMENT,
3,ALBANY COMMUNITY KINDERGARTEN,COMMUNITY KINDERGARTENS,
4,ALBANY PRIMARY SCHOOL,PRIMARY SCHOOLS,


In [None]:
# remove instances where Y12 is NaN
nan_value = float("NaN")
new_WASchools_df.replace("", nan_value, inplace=True)
new_WASchools_df.dropna(subset = ["Y12"], inplace=True)

# create id
new_WASchools_df.insert(0, 'ID', range(0, 0 + len(new_WASchools_df)))

# rename columns
new_WASchools_df = pd.DataFrame(new_WASchools_df).rename(columns = {'School Name':'School_Name'})
new_WASchools_df = pd.DataFrame(new_WASchools_df).rename(columns = {'Classification Group':'Classification_Group'})

new_WASchools_df.head()


Unnamed: 0,ID,School_Name,Classification_Group,Y12
1,0,AL-AMEEN COLLEGE,NON-GOVERNMENT,45.0
5,1,ALBANY SECONDARY EDUCATION SUPPORT CENTRE,EDUCATION SUPPORT,17.0
6,2,ALBANY SENIOR HIGH SCHOOL,SECONDARY SCHOOLS,130.0
8,3,ALKIMOS BAPTIST COLLEGE,NON-GOVERNMENT,12.0
12,4,ALL SAINTS' COLLEGE,NON-GOVERNMENT,144.0


In [None]:
# get unique classification groups
classification = new_WASchools_df["Classification_Group"].unique()
classification_df = pd.DataFrame(classification).rename(columns = {0:'Classification_Group'})
# create id
classification_df.insert(0, 'ID', range(0, 0 + len(classification_df)))
classification_df.head()


Unnamed: 0,ID,Classification_Group
0,0,NON-GOVERNMENT
1,1,EDUCATION SUPPORT
2,2,SECONDARY SCHOOLS
3,3,K-12 SCHOOLS
4,4,DISTRICT HIGH SCHOOLS


# Datasource 3

In [None]:
# <!-- URL for Data source 3 – WA School Ranking – 2020 -->
url = 'https://bettereducation.com.au/results/wa/wace.aspx'

# <!-- read table -->
tables = pd.read_html(url)
tables

[     Unnamed: 0  Better Education Rank                                School  \
 0             1                      1                   Perth Modern School   
 1             2                      2  St Hilda's Anglican School for Girls   
 2             3                      3          Christ Church Grammar School   
 3             4                      4                       Penrhos College   
 4             5                      5             Methodist Ladies' College   
 ..          ...                    ...                                   ...   
 134         135                    135           Balcatta Senior High School   
 135         136                    136         Safety Bay Senior High School   
 136         137                    137          Darling Range Sports College   
 137         138                    138                       Gilmore College   
 138         139                    139                 Cecil Andrews College   
 
      Median ATAR  No. eli

In [None]:
# check table type
type(tables)

list

In [None]:
# insert table into a df and drop the first column
df = tables[0].drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Better Education Rank,School,Median ATAR,No. eligible Yr 12 students,No. Students with an ATAR,% students with an ATAR,Trend / Compare
0,1,Perth Modern School,97.55,242,242,100.0,Trend / Compare
1,2,St Hilda's Anglican School for Girls,92.7,147,134,91.16,Trend / Compare
2,3,Christ Church Grammar School,92.5,190,175,92.11,Trend / Compare
3,4,Penrhos College,90.65,135,116,85.93,Trend / Compare
4,5,Methodist Ladies' College,90.55,126,112,88.89,Trend / Compare


In [None]:
# only keep data that is useful for end user
df = df[['Better Education Rank','School','Median ATAR']]
df.head()

Unnamed: 0,Better Education Rank,School,Median ATAR
0,1,Perth Modern School,97.55
1,2,St Hilda's Anglican School for Girls,92.7
2,3,Christ Church Grammar School,92.5
3,4,Penrhos College,90.65
4,5,Methodist Ladies' College,90.55


In [None]:
cols = list(df.columns)
# rename columns to something more descriptive 
cols[0] = "2020_ATAR_Secondary_School_Ranking"
# rename School to School Name to tie in with Data Source 2
cols[1] = "School_Name"
cols[2] = "Median_ATAR"
df.columns = cols
df.head()

Unnamed: 0,2020_ATAR_Secondary_School_Ranking,School_Name,Median_ATAR
0,1,Perth Modern School,97.55
1,2,St Hilda's Anglican School for Girls,92.7
2,3,Christ Church Grammar School,92.5
3,4,Penrhos College,90.65
4,5,Methodist Ladies' College,90.55


In [None]:
# make the school name uppercase to enable easier merge for end user
df['School_Name'] = df['School_Name'].str.upper()
# create id
df.insert(0, 'ID', range(0, 0 + len(df)))
df.head()

Unnamed: 0,ID,2020_ATAR_Secondary_School_Ranking,School_Name,Median_ATAR
0,0,1,PERTH MODERN SCHOOL,97.55
1,1,2,ST HILDA'S ANGLICAN SCHOOL FOR GIRLS,92.7
2,2,3,CHRIST CHURCH GRAMMAR SCHOOL,92.5
3,3,4,PENRHOS COLLEGE,90.65
4,4,5,METHODIST LADIES' COLLEGE,90.55


# LOAD

In [None]:
rds_connection_string = f"{username}:{password}@localhost:5432/schools_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

NameError: name 'username' is not defined

In [None]:
# check for tables - NOTE this is created in pgadmin before hand
engine.table_names()

  engine.table_names()


[]

In [None]:
# note - delete all tables in pg admin schools_db before running the next step

In [None]:
# NOTE - DON'T RUN MORE THAN ONCE BECUASE IT APPENDS

# Dataset 1
new_WASchools_df.to_sql(name='WA_Schools_Y12', con=engine, if_exists='append', index=False)

# Dataset 2
classification_df.to_sql(name='WA_Schools_classification', con=engine, if_exists='append', index=False)

# Dataset 3
applecross_df.to_sql(name='Applecross_secondary_schools', con=engine, if_exists='append', index=False)

# Dataset 4
df.to_sql(name='Secondary_schools_ranking', con=engine, if_exists='append', index=False)

In [None]:
# confirm Dataset 1 has been added
pd.read_sql_query('select * from WA_Schools_Y12', con=engine).head()

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "wa_schools_y12" does not exist
LINE 1: select * from WA_Schools_Y12
                      ^

[SQL: select * from WA_Schools_Y12]
(Background on this error at: http://sqlalche.me/e/14/f405)