In [1]:
#Import dependencies
import pandas as pd
import numpy as np
#Show up to 400 characters per cell so long values (e.g. listing URLs) are not truncated
#The fully qualified option name is used because abbreviated keys are resolved by pattern matching
pd.set_option('display.max_colwidth', 400)

In [7]:
#Load the listings CSV into a dataframe and preview the first rows
listings_csv = '../data/listings_with_locations.csv'
df = pd.read_csv(listings_csv)
df.head()

Unnamed: 0,mls_id,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
0,E8018446,https://toronto.listing.ca/286-main-st-911.E8018446.htm#42-1,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
1,C7266728,https://toronto.listing.ca/215-queen-st-606.C7266728.htm#42-2,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
2,W7239426,https://toronto.listing.ca/10-park-lawn-rd-1408.W7239426.htm#42-3,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
3,E8030950,https://toronto.listing.ca/665-queen-st-402.E8030950.htm#42-4,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
4,E8030860,https://toronto.listing.ca/1190-dundas-st-925.E8030860.htm#42-5,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [8]:
#Reorder the columns: identifiers and location text first, then the numeric
#attributes, the coordinates, and finally the scrape metadata
column_order = [
    'mls_id', 'property_type', 'address', 'street', 'neighbourhood', 'city',
    'price', 'baths', 'beds', 'dens',
    'latitude', 'longitude',
    'date_scraped', 'url',
]
df = df[column_order]
df.head()

Unnamed: 0,mls_id,property_type,address,street,neighbourhood,city,price,baths,beds,dens,latitude,longitude,date_scraped,url
0,E8018446,condo_apartment,286 Main St 911,Main St,East End-Danforth,Toronto,619900,1,1,1,43.687469,-79.301861,2024-01-30,https://toronto.listing.ca/286-main-st-911.E8018446.htm#42-1
1,C7266728,condo_apartment,215 Queen St 606,Queen St,Waterfront Communities C1,Toronto,529000,1,1,0,43.650343,-79.387806,2024-01-30,https://toronto.listing.ca/215-queen-st-606.C7266728.htm#42-2
2,W7239426,condo_apartment,10 Park Lawn Rd 1408,Park Lawn Rd,Mimico,Toronto,624900,1,1,1,43.622946,-79.481658,2024-01-30,https://toronto.listing.ca/10-park-lawn-rd-1408.W7239426.htm#42-3
3,E8030950,condo_apartment,665 Queen St 402,Queen St,South Riverdale,Toronto,899900,2,2,0,43.658361,-79.351159,2024-01-30,https://toronto.listing.ca/665-queen-st-402.E8030950.htm#42-4
4,E8030860,condo_apartment,1190 Dundas St 925,Dundas St,South Riverdale,Toronto,599900,1,1,0,43.665267,-79.341034,2024-01-30,https://toronto.listing.ca/1190-dundas-st-925.E8030860.htm#42-5


In [9]:
#Check the mls_id column to ensure there are no duplicates
#mls_id is later set as the primary key of toronto_listings, so stop here if any value repeats
assert df['mls_id'].is_unique, 'duplicate mls_id values found'
#Display the number of distinct ids for reference
df['mls_id'].nunique()

4819

In [10]:
#Count the listings per property type to spot unexpected or misspelt categories
property_type_counts = df['property_type'].value_counts()
property_type_counts

condo_apartment       3226
detached_home          990
freehold _townhome     306
condo_townhome         297
Name: property_type, dtype: int64

In [11]:
#Normalise the 'freehold _townhome' label (it contains a stray space) in the property type column
value_to_replace = {'freehold _townhome': 'freehold_townhome'}
df = df.assign(property_type=df['property_type'].replace(value_to_replace))

#Re-count the categories to confirm the corrected label
df['property_type'].value_counts()

condo_apartment      3226
detached_home         990
freehold_townhome     306
condo_townhome        297
Name: property_type, dtype: int64

In [12]:
#Check data types
#Lists the dtype pandas assigned to every column of df
df.dtypes

mls_id            object
property_type     object
address           object
street            object
neighbourhood     object
city              object
price              int64
baths              int64
beds               int64
dens               int64
latitude         float64
longitude        float64
date_scraped      object
url               object
dtype: object

In [33]:
#Build a lookup table of the distinct neighbourhoods, in order of first appearance
neighbourhoods = (
    pd.DataFrame({'neighbourhoods': df['neighbourhood'].unique()})
    #reset_index() copies the 0..n-1 row labels into a leading column named 'index'
    .reset_index()
)
neighbourhoods

Unnamed: 0,index,neighbourhoods
0,0,East End-Danforth
1,1,Waterfront Communities C1
2,2,Mimico
3,3,South Riverdale
4,4,Tam O'Shanter-Sullivan
...,...,...
138,138,Runnymede-Bloor West Village
139,139,Willowridge-Martingrove-Richview
140,140,Princess-Rosethorn
141,141,Humberlea-Pelmo Park W4


In [47]:
#Build a lookup table of the distinct bed counts in ascending order
beds = (
    pd.Series(df['beds'].unique(), name='beds')
    .sort_values()
    #Discard the pre-sort positions so the row labels run 0..n-1
    .reset_index(drop=True)
    #Copy those row labels into a leading column named 'index'
    .reset_index()
)
beds

Unnamed: 0,index,beds
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,7
8,8,8
9,9,9


In [46]:
#Build a lookup table of the distinct bath counts in ascending order
baths = (
    pd.Series(df['baths'].unique(), name='baths')
    .sort_values()
    #Discard the pre-sort positions so the row labels run 0..n-1
    .reset_index(drop=True)
    #Copy those row labels into a leading column named 'index'
    .reset_index()
)
baths

Unnamed: 0,index,baths
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,7
8,8,8
9,9,9


In [48]:
#Build a lookup table of the distinct den counts in ascending order
dens = (
    pd.Series(df['dens'].unique(), name='dens')
    .sort_values()
    #Discard the pre-sort positions so the row labels run 0..n-1
    .reset_index(drop=True)
    #Copy those row labels into a leading column named 'index'
    .reset_index()
)
dens

Unnamed: 0,index,dens
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
5,5,5
6,6,6
7,7,7
8,8,8


## Database Creation

In [13]:
#Import dependencies to automate database creation in PostgreSQL
import sqlalchemy
import psycopg2

#Open a psycopg2 connection to the local PostgreSQL server
#NOTE(review): the credentials are hardcoded here; consider reading them from the environment
connection_settings = {
    'user': 'postgres',
    'host': 'localhost',
    'port': '5432',
    'password': 'postgres',
}
con = psycopg2.connect(**connection_settings)

#Autocommit makes each statement take effect immediately, without an explicit commit
con.autocommit = True
cursor = con.cursor()

In [None]:
#Create the listings_db database if it does not exist yet
#The existence check makes this cell safe to re-run (e.g. Restart & Run All);
#a bare CREATE DATABASE raises an error once the database already exists
sql = '''CREATE DATABASE listings_db'''
cursor.execute('SELECT 1 FROM pg_database WHERE datname = %s', ('listings_db',))
if cursor.fetchone() is None:
    #No row came back, so the database is missing and must be created
    cursor.execute(sql)
#Display the parameters of the current connection
con.get_dsn_parameters()

In [14]:
#Import modules from SQLalchemy
from sqlalchemy import Column, Date, String, Float, Integer 

#Engine connection to listings_db
#NOTE(review): the credentials are hardcoded in this URL; consider reading them from the environment
engine = sqlalchemy.create_engine('postgresql://postgres:postgres@localhost:5432/listings_db')

#Create table called toronto_listings from dataframe 'df'
#if_exists='replace' drops any existing table first, so this cell can be re-run
df.to_sql(
    'toronto_listings',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={"mls_id": String(100),
           "property_type": String(100),
           "address": String(100),
           "street": String(100),
           "neighbourhood": String(100),
           "city": String(100),
           "price": Integer,
           "baths": Integer,
           "beds": Integer,
           "dens": Integer,
           "latitude": Float,
           "longitude": Float,
           "date_scraped": Date,
           "url": String(100)
    })

#Alter table to set mls_id as the primary key
#A separate name (db_conn) keeps the psycopg2 connection stored in 'con' from being overwritten
#engine.begin() commits the DDL when the block exits; text() wraps the raw SQL string,
#which SQLAlchemy 2.x no longer accepts as a plain str
with engine.begin() as db_conn:
    db_conn.execute(sqlalchemy.text('ALTER TABLE toronto_listings ADD PRIMARY KEY ("mls_id")'))

In [34]:
#Create table called neighbourhoods
#The dtype keys must match the dataframe's column names ('index', 'neighbourhoods');
#a key that matches no column has no effect, leaving that column on pandas' default type
neighbourhoods.to_sql(
    'neighbourhoods',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={"index": Integer,
           "neighbourhoods": String(100)
    })

#Alter table to set index as the primary key
#A separate name (db_conn) keeps the psycopg2 connection stored in 'con' from being overwritten
#engine.begin() commits the DDL when the block exits; text() wraps the raw SQL string,
#which SQLAlchemy 2.x no longer accepts as a plain str
with engine.begin() as db_conn:
    db_conn.execute(sqlalchemy.text('ALTER TABLE neighbourhoods ADD PRIMARY KEY ("index")'))

In [51]:
#Create table called beds
#The beds column holds integers, so it is stored as Integer to match
#the beds column of toronto_listings (a String column would not compare to it without a cast)
beds.to_sql(
    'beds',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={"index": Integer,
           "beds": Integer
    })

#Alter table to set index as the primary key
#A separate name (db_conn) keeps the psycopg2 connection stored in 'con' from being overwritten
#engine.begin() commits the DDL when the block exits; text() wraps the raw SQL string,
#which SQLAlchemy 2.x no longer accepts as a plain str
with engine.begin() as db_conn:
    db_conn.execute(sqlalchemy.text('ALTER TABLE beds ADD PRIMARY KEY ("index")'))

In [52]:
#Create table called baths
#The baths column holds integers, so it is stored as Integer to match
#the baths column of toronto_listings (a String column would not compare to it without a cast)
baths.to_sql(
    'baths',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={"index": Integer,
           "baths": Integer
    })

#Alter table to set index as the primary key
#A separate name (db_conn) keeps the psycopg2 connection stored in 'con' from being overwritten
#engine.begin() commits the DDL when the block exits; text() wraps the raw SQL string,
#which SQLAlchemy 2.x no longer accepts as a plain str
with engine.begin() as db_conn:
    db_conn.execute(sqlalchemy.text('ALTER TABLE baths ADD PRIMARY KEY ("index")'))

In [53]:
#Create table called dens
#The dens column holds integers, so it is stored as Integer to match
#the dens column of toronto_listings (a String column would not compare to it without a cast)
dens.to_sql(
    'dens',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={"index": Integer,
           "dens": Integer
    })

#Alter table to set index as the primary key
#A separate name (db_conn) keeps the psycopg2 connection stored in 'con' from being overwritten
#engine.begin() commits the DDL when the block exits; text() wraps the raw SQL string,
#which SQLAlchemy 2.x no longer accepts as a plain str
with engine.begin() as db_conn:
    db_conn.execute(sqlalchemy.text('ALTER TABLE dens ADD PRIMARY KEY ("index")'))

In [54]:
#Close the psycopg2 cursor and connection, then release the SQLAlchemy engine's pooled connections
#'con' may have been rebound to a SQLAlchemy connection by the primary-key cells above,
#so the psycopg2 connection is reached through the cursor that was created from it
pg_connection = cursor.connection
cursor.close()
pg_connection.close()
engine.dispose()