In [1]:
import pandas as pd
import joblib

In [2]:
# Load the raw rental listings; the path is relative to the current working directory.
lagos = pd.read_csv("data/lagos-rent.csv")

In [3]:
lagos.head()

Unnamed: 0,Title,More Info,Price,Serviced,Newly Built,Furnished,Bedrooms,Bathrooms,Toilets,City,Neighborhood
0,Newly Built 4 Bedroom Semi Detached House With Bq,4 BEDROOM HOUSE FOR RENT,"5,000,000/year",0,1,0,4 beds,4 baths,5 Toilets,Lekki,Agungi
1,Superb 4 Bedroom Semi Detached Duplex With Bq,4 BEDROOM HOUSE FOR RENT,"5,000,000/year",0,1,0,4 beds,4 baths,5 Toilets,Lekki,Other Lekki
2,Furnished 2 Bedroom Terrace Duplex For Rent !!!,2 BEDROOM HOUSE FOR RENT,3500000,1,0,0,2 beds,3 baths,3 Toilets,Lekki,Osapa London
3,2 Bedroom Apartment For Rent,2 BEDROOM HOUSE FOR RENT,2700000,1,1,0,2 beds,3 baths,3 Toilets,Lekki,Ologolo
4,4 Bedroom Terrace Duplex For Rent,4 BEDROOM HOUSE FOR RENT,4000000,1,0,0,4 beds,5 baths,5 Toilets,Lekki,Chevron


In [4]:
lagos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53070 entries, 0 to 53069
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         53070 non-null  object
 1   More Info     53070 non-null  object
 2   Price         53070 non-null  object
 3   Serviced      53070 non-null  int64 
 4   Newly Built   53070 non-null  int64 
 5   Furnished     53070 non-null  int64 
 6   Bedrooms      53070 non-null  object
 7   Bathrooms     53070 non-null  object
 8   Toilets       53070 non-null  object
 9   City          53070 non-null  object
 10  Neighborhood  53070 non-null  object
dtypes: int64(3), object(8)
memory usage: 4.5+ MB


In [5]:
def house_type(serie):
    """Label every listing title with a coarse property type.

    Each title is converted to a lowercase string and checked against an
    ordered list of keyword rules. The first rule that matches decides the
    label, so the order of the checks is significant.

    Parameters
    ----------
    serie : iterable
        Listing titles. Non-string entries are converted with ``str``.

    Returns
    -------
    list of str
        One label per title, in input order: "land", "Flat",
        "Self-contained", "Apartment", "Semi Detached Duplex",
        "Terraced Duplex", "Fully Detached Duplex" or "others".
    """

    def classify(raw_title):
        text = str(raw_title).lower()

        # "island" contains "land", so titles mentioning it are not land.
        if "land" in text and "island" not in text:
            return "land"
        if "flat" in text and "mini" not in text:
            return "Flat"
        if "self" in text or "mini" in text:
            return "Self-contained"
        # "tment" is a substring of "apartment", so one test covers both.
        if "tment" in text:
            return "Apartment"
        if "semi" in text:
            return "Semi Detached Duplex"
        if "terra" in text:
            return "Terraced Duplex"
        # Titles reaching this point contain none of "semi", "terra",
        # "tment" or "flat", so any remaining bedroom / detached / duplex
        # keyword means a fully detached duplex.
        detached_keywords = ("bedroom", "bed room", "detached", "duplex")
        if any(keyword in text for keyword in detached_keywords):
            return "Fully Detached Duplex"
        return "others"

    return [classify(title) for title in serie]

In [6]:
# Derive a property-type label from each listing title.
lagos["Property Type"] = house_type(lagos["Title"])

In [7]:
lagos["Property Type"].value_counts()

Flat                     17204
Self-contained           11201
Fully Detached Duplex     8071
Apartment                 7171
Terraced Duplex           4514
Semi Detached Duplex      3465
others                    1305
land                       139
Name: Property Type, dtype: int64

In [8]:
# Drop rows labelled "others" or "land", then remove the Title and
# More Info columns, which are not used after this point.
unwanted = lagos["Property Type"].isin(["others", "land"])
mask = lagos.loc[unwanted].index
lagos.drop(mask, inplace=True)
lagos.drop(columns=["Title", "More Info"], inplace=True)

In [9]:
lagos["Property Type"].value_counts()

Flat                     17204
Self-contained           11201
Fully Detached Duplex     8071
Apartment                 7171
Terraced Duplex           4514
Semi Detached Duplex      3465
Name: Property Type, dtype: int64

In [10]:
# Extract the price period (the text after "/") from each raw price
def price_split(feature):
    """Return the price period of every raw price value.

    A raw price such as "5,000,000/year" yields "year". Values without a
    "/" and values that are not strings (for example NaN or numbers)
    default to "year".

    Parameters
    ----------
    feature : iterable
        Raw price values.

    Returns
    -------
    list of str
        One period per input value, in input order.
    """
    periods = []
    for value in feature:
        # Explicit checks replace the former bare ``except``, which also hid
        # unrelated errors.
        if isinstance(value, str) and "/" in value:
            periods.append(value.split("/")[1])
        else:
            periods.append("year")
    return periods

In [11]:
# Record each listing's price period. This has to run while "Price" still
# holds the raw strings, i.e. before the column is converted to numbers.
lagos["Price Period"] = price_split(lagos["Price"])

In [12]:
def comma_remove(feature):
    """Turn raw price strings into floats.

    For every entry the text before the first "/" is kept, commas are
    removed and the remainder is converted with ``float``.

    Parameters
    ----------
    feature : iterable of str
        Raw prices such as "5,000,000/year" or "3500000".

    Returns
    -------
    list of float
        The numeric amount of each price, in input order.
    """
    # Keep only the amount, i.e. whatever precedes the first "/".
    amounts = [raw.split("/")[0] for raw in feature]

    # str.replace leaves strings without commas unchanged, so no check is needed.
    return [float(amount.replace(",", "")) for amount in amounts]


In [13]:
# Replace the raw price strings with their numeric amounts.
lagos["Price"] = comma_remove(lagos["Price"])

In [14]:
# Keep only listings priced per year; the helper column is not needed afterwards.
is_yearly = lagos["Price Period"] == "year"
lagos = lagos.loc[is_yearly].copy()
lagos.drop(columns="Price Period", inplace=True)

In [15]:
def _count_before_unit(serie, unit):
    """Return the integer written before ``unit`` in each string of ``serie``.

    The text preceding the first occurrence of ``unit`` is stripped of
    whitespace; an empty remainder yields 0, anything else is converted with
    ``int`` (which raises ``ValueError`` for non-numeric text).
    """
    counts = []
    for value in serie:
        number = value.split(unit)[0].strip()
        # No number in front of the unit label is treated as a count of 0.
        counts.append(int(number) if number else 0)
    return counts


def remove_bed_strip(serie):
    """Convert strings such as "4 beds" to integers (0 when no number is given).

    Parameters
    ----------
    serie : iterable of str
        Values of the form "<number> beds".

    Returns
    -------
    list of int
    """
    return _count_before_unit(serie, "beds")


def remove_toilet_strip(serie):
    """Convert strings such as "5 Toilets" to integers (0 when no number is given).

    Parameters
    ----------
    serie : iterable of str
        Values of the form "<number> Toilets".

    Returns
    -------
    list of int
    """
    return _count_before_unit(serie, "Toilets")


def remove_bath_strip(serie):
    """Convert strings such as "4 baths" to integers (0 when no number is given).

    Parameters
    ----------
    serie : iterable of str
        Values of the form "<number> baths".

    Returns
    -------
    list of int
    """
    return _count_before_unit(serie, "baths")

Apply the cleaning functions to the three room-count features (Bedrooms, Bathrooms and Toilets).

In [16]:
# Convert the "<n> beds" strings to integer counts.
lagos["Bedrooms"] = remove_bed_strip(lagos["Bedrooms"])

In [17]:
# Convert the "<n> baths" strings to integer counts.
lagos["Bathrooms"] = remove_bath_strip(lagos["Bathrooms"])

In [18]:
# Convert the "<n> Toilets" strings to integer counts.
lagos["Toilets"] = remove_toilet_strip(lagos["Toilets"])

In [19]:
lagos[["Bedrooms","Bathrooms","Toilets"]].sample(5)

Unnamed: 0,Bedrooms,Bathrooms,Toilets
44651,3,3,4
16657,2,0,0
48713,1,2,2
11349,1,0,0
17923,4,4,5


In [20]:
def first_impute(f1, f2):
    """Replace zero entries of ``f1`` with the value at the same position in ``f2``.

    The inputs are paired with ``zip``, so the result is as long as the
    shorter of the two.

    Parameters
    ----------
    f1 : iterable
        Values to fill; a value equal to 0 is treated as missing.
    f2 : iterable
        Fallback values used where ``f1`` is 0.

    Returns
    -------
    list
        ``f1`` with every 0 replaced by the matching ``f2`` value.
    """
    return [fallback if primary == 0 else primary for primary, fallback in zip(f1, f2)]

In [21]:
def second_impute(f1, f2):
    """Fill zero entries of ``f1`` from ``f2``, never with less than 1.

    For each pair of values:

    * a non-zero ``f1`` value is kept as is;
    * a zero ``f1`` value becomes the ``f2`` value when that is greater than 1;
    * otherwise it becomes 1.

    The previous implementation tested ``value[0] > 1`` before testing
    ``value[0] == 0``, which made the fallback branch unreachable: ``f2`` was
    never used and every zero became 1.

    Parameters
    ----------
    f1 : iterable
        Values to fill; a value equal to 0 is treated as missing.
    f2 : iterable
        Fallback values, paired with ``f1`` by position.

    Returns
    -------
    list
        The imputed values, as long as the shorter of the two inputs.
    """
    final = []
    for primary, fallback in zip(f1, f2):
        if primary != 0:
            final.append(primary)
        elif fallback > 1:
            # Borrow the related count only when it is larger than the floor of 1.
            final.append(fallback)
        else:
            final.append(1)
    return final

In [22]:
lagos.shape

(51435, 10)

In [23]:
# Fill zero room counts from a related column. The order of these three
# statements matters: the second call receives the already-imputed Bedrooms
# column, and the third call receives the already-imputed Bathrooms column.
lagos["Bedrooms"] = first_impute(lagos["Bedrooms"], lagos["Bathrooms"])
lagos["Bathrooms"] = first_impute(lagos["Bathrooms"], lagos["Bedrooms"])
lagos["Toilets"] = second_impute(lagos["Toilets"], lagos["Bathrooms"])

In [24]:
# Drop listings that still have a zero room count after imputation.
# The second condition previously repeated the Bedrooms test, so Bathrooms
# was never checked.
# TODO: try missing room imputation again here.
mask = lagos.loc[(lagos['Bedrooms'] == 0) | (lagos['Bathrooms'] == 0) | (lagos['Toilets'] == 0)].index
lagos.drop(mask, inplace=True)


In [25]:
from feature_engine.outliers import Winsorizer

In [26]:
lagos["Price"].median() /1e6, lagos["Price"].mean()/1e6

(2.0, 47.357957401211024)

In [27]:
# Remove listings whose price is below 1e5 or above 5e7.
# TODO: revisit price outlier handling and missing room imputation.
too_low = lagos["Price"] < 1e5
too_high = lagos["Price"] > 5e7
mask = lagos.loc[too_low | too_high].index
lagos.drop(mask, inplace=True)

In [28]:
lagos["Price"].median() /1e6, lagos["Price"].mean()/1e6

(2.0, 3.67481677367905)

## Data Preprocessing

#### Data Split

Splitting our data before data preprocessing to avoid data leakage

In [29]:
# Hold out 20% of the listings as test data, stratified by neighborhood
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# With n_splits=1 the splitter yields exactly one (train, test) pair of row positions.
train_index, test_index = next(split.split(lagos, lagos["Neighborhood"]))
train_set = lagos.iloc[train_index]
test_set = lagos.iloc[test_index]

In [30]:
# From here on `lagos` holds only the training features; the target lives in `price`.
price = train_set["Price"]
lagos = train_set.drop(columns="Price")

Let's separate our numerical columns from our categorical ones

In [31]:
# Columns with object dtype ("O") are treated as the categorical features.
categorical_col = [column for column in lagos.columns if lagos[column].dtype == "O"]

In [32]:
lagos[categorical_col].sample(4)

Unnamed: 0,City,Neighborhood,Property Type
52060,Yaba,Other Yaba,Flat
29507,Island,Oniru,Terraced Duplex
114,Lekki,Chevron,Semi Detached Duplex
17626,Lekki,Ikota,Self-contained


In [33]:
# Shuffle the training rows reproducibly; each row keeps its original index label.
lagos = lagos.sample(frac=1, random_state=51)

#### Feature Encoding

We need to perform one-hot encoding on some of the nominal categorical variables (variables with no inherent order, e.g. Nigeria, Congo, Ghana)

In [34]:
# import library for encoding
from feature_engine.encoding import OneHotEncoder

In [35]:
# Fit the encoder on the training features only, then replace `lagos` with
# its encoded version.
# NOTE(review): no `variables` argument is passed, so the encoder chooses the
# columns to encode itself - presumably every object/categorical column;
# confirm against the installed feature_engine version.
encoder = OneHotEncoder()
encoder.fit(lagos)
lagos = encoder.transform(lagos)

Saving Encoder for future use in model deployment

In [36]:
# Persist the fitted encoder; the "tools" directory must already exist.
joblib.dump(encoder, "tools/encoder_joblib")

['tools/encoder_joblib']

In [37]:
lagos

Unnamed: 0,Serviced,Newly Built,Furnished,Bedrooms,Bathrooms,Toilets,City_Lekki,City_Ajah,City_Yaba,City_Ikoyi,...,Neighborhood_Ligali Ayorinde,Neighborhood_Oworonshoki,Neighborhood_Awolowo Road,Neighborhood_1004,Property Type_Self-contained,Property Type_Fully Detached Duplex,Property Type_Flat,Property Type_Apartment,Property Type_Terraced Duplex,Property Type_Semi Detached Duplex
2161,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
46036,0,1,1,4,5,6,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
48890,0,0,0,3,2,3,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
22355,1,0,0,3,3,4,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
14189,0,0,0,2,2,3,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3776,0,0,0,4,4,5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5493,1,0,0,3,4,4,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
15087,1,0,1,3,3,4,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
15630,0,0,0,4,4,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Save preprocessed data for Model Building

In [38]:
# Re-attach the target column. concat with axis=1 aligns rows on the index,
# so the earlier shuffle of `lagos` does not mismatch features and prices.
lagos = pd.concat([lagos, price], axis=1)

In [39]:
# Write both splits to disk. The training frame is already one-hot encoded,
# whereas `test_set` is written as raw (un-encoded) rows that still include
# the Price column.
lagos.to_csv('data/train.csv', index=False)
test_set.to_csv('data/test.csv', index=False)