In [21]:
import sqlalchemy
import numpy as np
import pgeocode
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Configure DB
config = {
    'host':'localhost',
    'user': 'root',
    # Add password here
    'password': "Buddha2025",
    'db': 'livethere'
}
mysql_db_uri = f'mysql+pymysql://{config["user"]}:{config["password"]}@{config["host"]}/{config["db"]}'
sqlalchemy.create_engine(mysql_db_uri)

Engine(mysql+pymysql://root:***@localhost/livethere)

In [3]:
# pip install ipython-sql

In [4]:
# Copy this 
mysql_db_uri

'mysql+pymysql://root:Buddha2025@localhost/livethere'

In [5]:
%load_ext sql
%sql mysql+pymysql://root:Buddha2025@localhost/livethere # put your mysql_db_uri

0 rows affected.


[]

In [6]:
%%sql
show tables

 * mysql+pymysql://root:***@localhost/livethere
8 rows affected.


Tables_in_livethere
AverageUtilityFee
MainCampusMap
Rental
RentalRange
Restaurant
RestaurantRange
University
YelpSchema


In [7]:
# Retrieve DB
query = %sql SELECT * FROM Rental INNER JOIN RentalRange ON Rental.id = RentalRange.rentalId
df = query.DataFrame()

 * mysql+pymysql://root:***@localhost/livethere
137756 rows affected.


In [8]:
df.dtypes

id                           int64
rentalPrice                  int64
postalCode                  object
longitude                   object
latitude                    object
stubId                       int64
bathroomCount                int64
bedroomCount                 int64
lastUpdatedDate             object
propertyType                object
universityId                 int64
rentalId                     int64
rentToUniversityDistance    object
dtype: object

In [9]:
sum(df['universityId'].isna())

0

In [10]:
# Format Data
df['postalCode'] = df['postalCode'].apply(lambda a: a[:3] + " " + a[-3:]).astype(str)
df['propertyType'] = df['propertyType'].astype(str)
df['longitude'] = df['longitude'].astype(float)
df['latitude'] = df['latitude'].astype(float)
df['universityId'] = df['universityId'].astype(int)
df['rentToUniversityDistance'] = df['rentToUniversityDistance'].astype(float)
df['rentalPrice'] = df['rentalPrice'].astype(float)

In [11]:
# Correlation analysis
np.abs(df.corr()['rentalPrice'])

id                          0.077310
rentalPrice                 1.000000
longitude                   0.020645
latitude                    0.027670
stubId                      0.032530
bathroomCount               0.538304
bedroomCount                0.487404
universityId                0.002207
rentalId                    0.077310
rentToUniversityDistance    0.011898
Name: rentalPrice, dtype: float64

In [12]:
nomi = pgeocode.Nominatim('ca')
counties = nomi.query_postal_code(list(df['postalCode']))['county_name']
counties.head(3)

0    Mississauga
1    Mississauga
2    Mississauga
Name: county_name, dtype: object

In [13]:
# Features
# - bathroomCount
# - bedroomCount
# - propertyType
# - counties [53] ?  # exclude

In [26]:
# Preprocess features
scaler = MinMaxScaler()
one_hot = OneHotEncoder()
x = scaler.fit_transform(df[['bathroomCount', 'bedroomCount']])

p = df['propertyType'].to_numpy().astype(str).reshape(-1, 1)
p = one_hot.fit_transform(p).toarray()

c = counties.to_numpy().astype(str).reshape(-1, 1)
c = one_hot.fit_transform(c).toarray()

x = np.concatenate([x, c], axis=1)
x = np.concatenate([x, p], axis=1)

y = df['rentalPrice']
y = y.to_numpy()

# Train-test split.
RANDOM_STATE = 300
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE)

train_x.shape, train_y.shape, test_x.shape, test_y.shape

((110204, 68), (110204,), (27552, 68), (27552,))

In [15]:
# Fit linear regression
lr = LinearRegression().fit(train_x, train_y)
lr_prd = lr.predict(test_x)

In [16]:
train_x.shape, train_y.shape

((110204, 15), (110204,))

In [17]:
# RMSE 
np.sqrt(np.mean((test_y - lr_prd) ** 2))

759.725502696055

In [27]:
rf = RandomForestRegressor().fit(train_x, train_y)
rf_prd = rf.predict(test_x)

In [28]:
np.sqrt(np.mean((test_y - rf_prd) ** 2))

567.7804482080616

In [29]:
# Basecase RMSE
# TODO: try the query
np.sqrt(np.mean((test_y - np.mean(test_y)) **2))

966.336157328159

In [30]:
def property_type_alias_mapper(alias):
    """ map property property alias names to existing property 
    types in the database
    Args:
        alias (string))
    """

    if (alias == 'condo'):
        return ['apartment', 'condo']
    if (alias == 'house'):
        return ['house', 'loft', 'duplex', 'multi-unit']
    if (alias == 'town house'):
        return ['town house']
    if (alias == 'bachelor'):
        return ['bachelor', 'studio']

    # TODO: Handle unmapped alias error
    return None

def get_average_rental(id, min_d, max_d, pT, bedCount, bathCount, df):
    min_distance_km = min_d
    max_distance_km = max_d
    property_types = property_type_alias_mapper(pT)
    bed_count = bedCount
    bath_count = bathCount
    
    c1 = df['universityId'] == id
    c2 = df['rentToUniversityDistance'] >= min_distance_km
    c3 = df['rentToUniversityDistance'] <= max_distance_km
    
    query_result = df[c1 & c2 & c3]
    qeury_result = query_result[query_result['propertyType'].isin(property_types)]
    rental_price = query_result['rentalPrice']
    
    std = rental_price.std()
    mean = rental_price.mean()
    
    upper = mean + std * 1.5
    lower = mean - std * 1.5
    
    
    c1 = rental_price < upper
    c2 = rental_price > lower

    rental_price = rental_price[c1 & c2]

    
    return rental_price

In [31]:
len(np.unique(df['universityId']))

52

In [32]:
df.head()

Unnamed: 0,id,rentalPrice,postalCode,longitude,latitude,stubId,bathroomCount,bedroomCount,lastUpdatedDate,propertyType,universityId,rentalId,rentToUniversityDistance
0,1,3600.0,L5G 1H9,-79.571522,43.56877,330518,4,3,2020-04-21,town house,234,1,19.4
1,1,3600.0,L5G 1H9,-79.571522,43.56877,330518,4,3,2020-04-21,town house,236,1,13.3
2,1,3600.0,L5G 1H9,-79.571522,43.56877,330518,4,3,2020-04-21,town house,242,1,9.8
3,1,3600.0,L5G 1H9,-79.571522,43.56877,330518,4,3,2020-04-21,town house,252,1,9.8
4,1,3600.0,L5G 1H9,-79.571522,43.56877,330518,4,3,2020-04-21,town house,253,1,8.8


In [37]:
get_average_rental(252, 0, 15, 'house', 3, 2, df).std()

536.6249442213721