In [1]:
# Import modules
import csv
import time
import urllib.parse

import pandas as pd
import requests

## Get Latitude and Longitude from addresses

In [2]:
# import data
df = pd.read_csv("daft_listings.csv", encoding='cp1252')

address = df["Adress"]

# Nominatim's usage policy asks for an identifying User-Agent and for at most
# one request per second; put real contact details here before running.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
NOMINATIM_HEADERS = {"User-Agent": "daft-listings-geocoder/1.0 (contact: your-email@example.com)"}
NOMINATIM_DELAY_SECONDS = 1

lat = []
lon = []

for addr in address:
    # a missing address cannot be geocoded, so record it as "no result"
    if pd.isna(addr):
        lat.append(None)
        lon.append(None)
        continue

    # The free-form query is passed through the `q` parameter (the older
    # /search/<query> path form is deprecated); requests URL-encodes it.
    response = requests.get(
        NOMINATIM_URL,
        params={"q": addr, "format": "json", "limit": 1},
        headers=NOMINATIM_HEADERS,
        timeout=30,
    )
    # fail with a clear HTTP error instead of an obscure JSON decoding error
    response.raise_for_status()
    results = response.json()

    if not results:  # if no result found for address
        lat.append(None)
        lon.append(None)
    else:  # if result found for address, keep the best (first) match
        lat.append(results[0]["lat"])
        lon.append(results[0]["lon"])

    # throttle so the public server is not hit more than once per second
    time.sleep(NOMINATIM_DELAY_SECONDS)

df["Latitude"] = lat
df["Longitude"] = lon

# save the listings together with the new coordinate columns
df.to_csv("daft_listings_lat_lon.csv")

In [3]:
df[["Longitude", "Latitude"]].isnull().sum() # returns how many values are missing in each coordinate column

Longitude    1033
Latitude     1033
dtype: int64

In [4]:
# number of rows in df
len(df)

1729

There are a total of 1729 individual share listings. For 1033 of those listings, the address could not be geocoded, i.e. it was not submitted precisely enough to be resolved to coordinates. It is assumed that the post creators only want to provide a general location of the place in order to protect their own privacy. This assumption is plausible because the listing creator only provides an Eircode, which reveals the exact location, once they invite someone to a personal viewing.

# Data Cleaning

In [3]:
# reload the listings from the CSV that was written after geocoding
df = pd.read_csv("daft_listings_lat_lon.csv")

In [4]:
# preview the first rows of the reloaded data
df.head()

Unnamed: 0.1,Unnamed: 0,Adress,Price,Room,Bath,Property_type,Bedrooms_available,Available_from,Available_for,Sharing_with,Owner_occupied,Preferences,Description,Date_entered/renewed,Property_views,Latitude,Longitude
0,0,"Ballycasey Close, Shannon, Co. Clare",€160 per week,Single Room,Shared Bathroom,House,1,Immediately,6 Months,2,Yes,Male / Female,"DescriptionHosting Power agency, providing aff...",21/03/2023,899,,
1,1,"63 Fernleigh Drive , Carpenterstown, Dublin 15",€800 per month,Double Room,Shared Bathroom,House,1,Immediately,1 Year,2,No,Female,DescriptionTwo Double rooms available in Carpe...,10/04/2023,1308,,
2,2,"Beaufield Gardens, Maynooth, Co. Kildare",€130 per week,Double Room,Shared Bathroom,House,1,Immediately,1 Year +,4,No,Male / Female,"DescriptionJust available, Room in center of M...",13/04/2023,280,53.373441,-6.598454
3,3,"87 Old Willow Park, Athlone, Athlone, Co. West...",€400 per month,Single Room,Shared Bathroom,House,3,"May 1st, 2023",3 Months,2,No,Female,"DescriptionThis fabulous, newly refurbished pr...",13/04/2023,168,,
4,4,"62 Lanesborough Mews, Dublin 11, Dublin 11",€950 per month,Double Room,Shared Bathroom,Apartment,1,Immediately,6 Months,2,No,Male / Female,DescriptionThis spaciouse double room is in a ...,12/04/2023,1110,53.402606,-6.294961


In [5]:
# fix the misspelled "Adress" column name and discard the unneeded "Unnamed: 0" column
df = (
    df
    .rename(columns={"Adress": "Address"})
    .drop("Unnamed: 0", axis=1)
)

### Text Pre-processing

In [6]:
import unicodedata
from bs4 import BeautifulSoup
import re
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import contractions

# tokenizer used when filtering stopwords
tokenizer = ToktokTokenizer()

# NLTK's English stopword list, minus the negations "no" and "not" so that
# they are kept in the processed text
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)

# spaCy pipeline used for lemmatization
nlp = spacy.load('en_core_web_md')

In [7]:
# Example of what the raw text in "Description" looks like
df["Description"][0]

'DescriptionHosting Power agency, providing affordable accommodation in Ireland since 2014.\n\nWe are proud to advertise this fully furnished and comfortable private room.\nThe house is equipped with all modern conveniences and you share it with your welcoming host. \n\nAll bills (electricity, gas, etc.) and WiFi are included in the rent!\nAll bedding will be provided too.\n\nHOW TO BOOK THIS ROOM?\n- For the fastest way to book this room, please go to our website\n- Alternatively, please reply to this ad with a detailed self description.\n\nRoom Ref: 22654B'

In [8]:
# https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/contractions.py

# Maps an English contraction to its expanded form. Keys are matched
# case-insensitively by expand_contractions below.
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

# https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/NLP%20Strategy%20I%20-%20Processing%20and%20Understanding%20Text.ipynb
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand the contractions found in ``text`` and drop leftover apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict, optional
        Maps a contraction to its expansion. Matching ignores case; the first
        character of each match is kept as written in ``text``.

    Returns
    -------
    str
        ``text`` with every known contraction expanded and every remaining
        apostrophe (') removed.
    """
    if contraction_mapping:
        # Longest keys first: a regex alternation takes the first alternative
        # that matches, so "can't" listed ahead of "can't've" would win and
        # leave a dangling "'ve" behind.
        keys_longest_first = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys_longest_first)),
            flags=re.IGNORECASE | re.DOTALL)
        # case-folded view of the mapping for matches written in another case
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # nothing usable to expand to: leave the text as it was
                return match
            # keep the first character exactly as it appeared in the text
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    # remove any apostrophes that were not part of a known contraction
    expanded_text = re.sub("'", "", text)
    return expanded_text


In [9]:
# get rid of html tags 
def strip_html_tags(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# remove accented characters 
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalized, so an accented letter is split into its base
    letter plus combining marks; every character that still cannot be encoded
    as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# remove special characters 
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When True, digits are removed as well.

    Returns
    -------
    str
        The cleaned text.
    """
    # The range must be A-Z: "A-z" also spans the characters that sit between
    # "Z" and "a" in ASCII ([ \ ] ^ _ `), which would then survive the cleanup.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# remove stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop the tokens of ``text`` that appear in the module-level ``stopword_list``.

    ``text`` is split with the module-level ``tokenizer``. When
    ``is_lower_case`` is True the tokens are compared as they are; otherwise
    each token is lowercased for the comparison only. The surviving tokens are
    returned joined by single spaces.
    """
    words = [piece.strip() for piece in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [word for word in words if word not in stopword_list]
    else:
        kept = [word for word in words if word.lower() not in stopword_list]
    return ' '.join(kept)

# Lemmatization
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma, joined by spaces.

    ``text`` is processed with the module-level ``nlp`` pipeline. A token
    whose lemma is the placeholder '-PRON-' keeps its original text instead.
    """
    lemmas = []
    for word in nlp(text):
        lemmas.append(word.text if word.lemma_ == '-PRON-' else word.lemma_)
    return ' '.join(lemmas)


In [12]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=False):
    """Normalize every document of ``corpus`` and return the results as a list.

    The steps run in this order, each one controlled by its flag: HTML
    stripping, accented-character removal, contraction expansion, lowercasing,
    newline collapsing (always), lemmatization, special-character removal,
    whitespace collapsing (always) and stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    html_stripping, contraction_expansion, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool, optional
        Switch the corresponding step on or off.
    remove_digits : bool, optional
        Passed to ``remove_special_characters``; only used when
        ``special_char_removal`` is True.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Patterns are loop-invariant, so they are compiled once.
    # Only carriage returns / line feeds are collapsed: a "|" inside a
    # character class is a literal pipe, not an alternation.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is matched literally; written as "(-)" it
    # would form a range from "(" to ")" and the hyphen would never be isolated.
    special_char_pattern = re.compile(r'([{.()\-!}])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around special characters so that removing them
            # does not glue the neighbouring words together
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)

    return normalized_corpus

In [13]:
# run the normalization pipeline with its default options on every description
df["Pre_processed_desc"] = normalize_corpus(df["Description"])

In [14]:
# Remove the leading word "description" that the scraped texts start with.
# Anchoring on the word itself (instead of blindly cutting 11 characters)
# leaves rows without the prefix untouched and, in the common case, makes the
# cell safe to run more than once.
# NOTE(review): during normalization the prefix is still fused to the first
# real word (e.g. "DescriptionHosting"), which can distort that word's lemma
# and stopword check; stripping it from the raw "Description" column before
# normalizing would be cleaner.
df["Pre_processed_desc"] = df["Pre_processed_desc"].str.replace(r"^description", "", n=1, regex=True)

In [15]:
# example of a cleaned description
df["Pre_processed_desc"][0]

'hoste power agency provide affordable accommodation ireland since 2014 proud advertise fully furnish comfortable private room house equip modern convenience share welcome host bill electricity gas etc wifi include rent bedding provide book room fast way book room please go website alternatively please reply ad detailed self description room ref 22654b'

# Feature Engineering

##### Description length (number of characters)

In [16]:
# number of characters in each raw description (the leading "Description" label is included)
df["Desc_length"] = df["Description"].apply(len)

##### Adjust price to price per month

The rent price is not standardized. The listing creator can choose to display the price for a room per week or per month. So in this part, the prices will be adjusted to display the rent price per month.

In [17]:
df["Price"][0:6] # not standardized: a mix of per-week and per-month prices

0     €160 per week
1    €800 per month
2     €130 per week
3    €400 per month
4    €950 per month
5    €900 per month
Name: Price, dtype: object

In [18]:
# function that standardizes the prices to price per month
def adjust_price(text, string, weeks_per_month=4):
    """Extract the amount from a price label and express it per month.

    Parameters
    ----------
    text : str
        Price label such as "€160 per week" or "€1,200 per month".
    string : str
        Marker that identifies a weekly price (e.g. "week"). When it occurs
        in ``text`` the amount is multiplied by ``weeks_per_month``.
    weeks_per_month : int or float, optional
        Factor used to convert a weekly price into a monthly one. The default
        of 4 is a simplification; 52 / 12 (about 4.33) is closer to a
        calendar month.

    Returns
    -------
    int, float or None
        The monthly price (an int whenever ``weeks_per_month`` is an int), or
        None when ``text`` is not a string or contains no amount.
    """
    if not isinstance(text, str):
        return None

    amount = None
    for token in text.split():
        # remove the Euro symbol and thousands separators ("€1,200" -> "1200")
        digits = token.strip("€").replace(",", "")
        if digits.isdecimal():
            amount = int(digits)
            break

    if amount is None:
        return None

    # convert price per week to price per month
    if string in text:
        return amount * weeks_per_month

    # the price is already displayed per month
    return amount

In [19]:
# apply function
# "week" is the marker that identifies a weekly price
df["Price_per_month"] = df["Price"].apply(adjust_price, args=("week",))

In [20]:
# preview the first rows, now including Price_per_month
df.head()

Unnamed: 0,Address,Price,Room,Bath,Property_type,Bedrooms_available,Available_from,Available_for,Sharing_with,Owner_occupied,Preferences,Description,Date_entered/renewed,Property_views,Latitude,Longitude,Pre_processed_desc,Desc_length,Price_per_month
0,"Ballycasey Close, Shannon, Co. Clare",€160 per week,Single Room,Shared Bathroom,House,1,Immediately,6 Months,2,Yes,Male / Female,"DescriptionHosting Power agency, providing aff...",21/03/2023,899,,,hoste power agency provide affordable accommod...,551,640
1,"63 Fernleigh Drive , Carpenterstown, Dublin 15",€800 per month,Double Room,Shared Bathroom,House,1,Immediately,1 Year,2,No,Female,DescriptionTwo Double rooms available in Carpe...,10/04/2023,1308,,,two double room available carpenterstown local...,784,800
2,"Beaufield Gardens, Maynooth, Co. Kildare",€130 per week,Double Room,Shared Bathroom,House,1,Immediately,1 Year +,4,No,Male / Female,"DescriptionJust available, Room in center of M...",13/04/2023,280,53.373441,-6.598454,just available room center maynooth centrally ...,364,520
3,"87 Old Willow Park, Athlone, Athlone, Co. West...",€400 per month,Single Room,Shared Bathroom,House,3,"May 1st, 2023",3 Months,2,No,Female,"DescriptionThis fabulous, newly refurbished pr...",13/04/2023,168,,,this fabulous newly refurbished property old w...,405,400
4,"62 Lanesborough Mews, Dublin 11, Dublin 11",€950 per month,Double Room,Shared Bathroom,Apartment,1,Immediately,6 Months,2,No,Male / Female,DescriptionThis spaciouse double room is in a ...,12/04/2023,1110,53.402606,-6.294961,this spaciouse double room 3 bed room apartmen...,438,950


##### Add Irish Counties column

In [21]:
# County names that are searched for in the addresses. Dublin is not listed.
Irish_counties = [
    "Tyrone", "Londonderry", "Antrim", "Fermanagh", "Armagh", "Down",
    "Donegal", "Sligo", "Leitrim", "Cavan", "Louth", "Monaghan",
    "Mayo", "Roscommon", "Longford", "Meath", "Westmeath", "Galway",
    "Offaly", "Kildare", "Wicklow", "Clare", "Tipperary", "Laois",
    "Carlow", "Kilkenny", "Wexford", "Limerick", "Kerry", "Cork",
    "Waterford",
]

In [6]:
# function that extracts only the counties from the addresses. However, if the county is Dublin, the postal district will be extracted
def extract_counties(text):
    """Return the county named in an address, or the Dublin postal district.

    Parameters
    ----------
    text : str
        Address such as "Ballycasey Close, Shannon, Co. Clare".

    Returns
    -------
    str or None
        * the first word of the address that is listed in ``Irish_counties``;
        * otherwise "Dublin <district>" when "Dublin" is followed by a postal
          district (a number, or "6W");
        * otherwise "Dublin" when the address ends with the word "Dublin";
        * otherwise None (also returned when ``text`` is not a string).
    """
    if not isinstance(text, str):
        return None

    # Strip the commas / full stops that cling to words ("Cork," or "15,"),
    # otherwise those words never compare equal to a county or a number.
    tokens = [token.strip(",.") for token in text.split()]
    tokens = [token for token in tokens if token]

    # extract county
    for token in tokens:
        if token in Irish_counties:
            return token

    # if county is Dublin, get postal district (current word + following word)
    for current, following in zip(tokens, tokens[1:]):
        if current == "Dublin" and (following.isnumeric() or following.upper() == "6W"):
            return current + " " + following.upper()

    # County Dublin address without a postal district, e.g. "Swords, Co. Dublin"
    if tokens and tokens[-1] == "Dublin":
        return "Dublin"

    return None

In [23]:
# derive the county (or Dublin postal district) from every address
df["County"] = df["Address"].apply(extract_counties)

In [24]:
# preview the first rows, now including County
df.head()

Unnamed: 0,Address,Price,Room,Bath,Property_type,Bedrooms_available,Available_from,Available_for,Sharing_with,Owner_occupied,Preferences,Description,Date_entered/renewed,Property_views,Latitude,Longitude,Pre_processed_desc,Desc_length,Price_per_month,County
0,"Ballycasey Close, Shannon, Co. Clare",€160 per week,Single Room,Shared Bathroom,House,1,Immediately,6 Months,2,Yes,Male / Female,"DescriptionHosting Power agency, providing aff...",21/03/2023,899,,,hoste power agency provide affordable accommod...,551,640,Clare
1,"63 Fernleigh Drive , Carpenterstown, Dublin 15",€800 per month,Double Room,Shared Bathroom,House,1,Immediately,1 Year,2,No,Female,DescriptionTwo Double rooms available in Carpe...,10/04/2023,1308,,,two double room available carpenterstown local...,784,800,Dublin 15
2,"Beaufield Gardens, Maynooth, Co. Kildare",€130 per week,Double Room,Shared Bathroom,House,1,Immediately,1 Year +,4,No,Male / Female,"DescriptionJust available, Room in center of M...",13/04/2023,280,53.373441,-6.598454,just available room center maynooth centrally ...,364,520,Kildare
3,"87 Old Willow Park, Athlone, Athlone, Co. West...",€400 per month,Single Room,Shared Bathroom,House,3,"May 1st, 2023",3 Months,2,No,Female,"DescriptionThis fabulous, newly refurbished pr...",13/04/2023,168,,,this fabulous newly refurbished property old w...,405,400,Westmeath
4,"62 Lanesborough Mews, Dublin 11, Dublin 11",€950 per month,Double Room,Shared Bathroom,Apartment,1,Immediately,6 Months,2,No,Male / Female,DescriptionThis spaciouse double room is in a ...,12/04/2023,1110,53.402606,-6.294961,this spaciouse double room 3 bed room apartmen...,438,950,Dublin 11


##### Count adjectives

In [25]:
# get english adjectives

# text file from : https://gist.github.com/Xeoncross/5381806b18de1f395187
# One adjective per line; surrounding whitespace and empty lines are dropped.
# The `with` block closes the file even if reading fails.
with open("Positive-Adjective-List.txt", "r") as my_file:
    positive_adjectives = [line.strip() for line in my_file if line.strip()]

# text file from : https://gist.github.com/hugsy/8910dc78d208e40de42deb29e62df913
with open("english-adjectives.txt", "r") as my_file:
    english_adjectives = [line.strip() for line in my_file if line.strip()]

In [26]:
# single lookup list built by concatenating both adjective lists
adjectives = english_adjectives + positive_adjectives

In [34]:
def count_adjectives(text, adjective_list):
    """Count the whitespace-separated words of ``text`` found in ``adjective_list``.

    Parameters
    ----------
    text : str
        Text whose words are checked; every occurrence of a word is counted.
    adjective_list : iterable of str
        Words that count as adjectives. Duplicates have no effect.

    Returns
    -------
    int
        Number of words of ``text`` that are contained in ``adjective_list``.
    """
    # a set gives constant-time membership tests instead of scanning the
    # whole list once per word
    known_adjectives = frozenset(adjective_list)
    return sum(1 for word in text.split() if word in known_adjectives)

In [35]:
# count the known adjectives in every cleaned description
df["adjective_counts"] = df["Pre_processed_desc"].apply(count_adjectives, args=(adjectives,))

In [38]:
# preview the first rows, now including adjective_counts
df.head()

Unnamed: 0,Address,Price,Room,Bath,Property_type,Bedrooms_available,Available_from,Available_for,Sharing_with,Owner_occupied,...,Description,Date_entered/renewed,Property_views,Latitude,Longitude,Pre_processed_desc,Desc_length,Price_per_month,County,adjective_counts
0,"Ballycasey Close, Shannon, Co. Clare",€160 per week,Single Room,Shared Bathroom,House,1,Immediately,6 Months,2,Yes,...,"DescriptionHosting Power agency, providing aff...",21/03/2023,899,,,hoste power agency provide affordable accommod...,551,640,Clare,8
1,"63 Fernleigh Drive , Carpenterstown, Dublin 15",€800 per month,Double Room,Shared Bathroom,House,1,Immediately,1 Year,2,No,...,DescriptionTwo Double rooms available in Carpe...,10/04/2023,1308,,,two double room available carpenterstown local...,784,800,Dublin 15,10
2,"Beaufield Gardens, Maynooth, Co. Kildare",€130 per week,Double Room,Shared Bathroom,House,1,Immediately,1 Year +,4,No,...,"DescriptionJust available, Room in center of M...",13/04/2023,280,53.373441,-6.598454,just available room center maynooth centrally ...,364,520,Kildare,5
3,"87 Old Willow Park, Athlone, Athlone, Co. West...",€400 per month,Single Room,Shared Bathroom,House,3,"May 1st, 2023",3 Months,2,No,...,"DescriptionThis fabulous, newly refurbished pr...",13/04/2023,168,,,this fabulous newly refurbished property old w...,405,400,Westmeath,5
4,"62 Lanesborough Mews, Dublin 11, Dublin 11",€950 per month,Double Room,Shared Bathroom,Apartment,1,Immediately,6 Months,2,No,...,DescriptionThis spaciouse double room is in a ...,12/04/2023,1110,53.402606,-6.294961,this spaciouse double room 3 bed room apartmen...,438,950,Dublin 11,9


##### remove unnecessary columns

In [39]:
# "Price" and "Description" are no longer needed in the final frame
df = df.drop(columns=["Price", "Description"])

In [40]:
# preview the first rows after dropping the raw columns
df.head()

Unnamed: 0,Address,Room,Bath,Property_type,Bedrooms_available,Available_from,Available_for,Sharing_with,Owner_occupied,Preferences,Date_entered/renewed,Property_views,Latitude,Longitude,Pre_processed_desc,Desc_length,Price_per_month,County,adjective_counts
0,"Ballycasey Close, Shannon, Co. Clare",Single Room,Shared Bathroom,House,1,Immediately,6 Months,2,Yes,Male / Female,21/03/2023,899,,,hoste power agency provide affordable accommod...,551,640,Clare,8
1,"63 Fernleigh Drive , Carpenterstown, Dublin 15",Double Room,Shared Bathroom,House,1,Immediately,1 Year,2,No,Female,10/04/2023,1308,,,two double room available carpenterstown local...,784,800,Dublin 15,10
2,"Beaufield Gardens, Maynooth, Co. Kildare",Double Room,Shared Bathroom,House,1,Immediately,1 Year +,4,No,Male / Female,13/04/2023,280,53.373441,-6.598454,just available room center maynooth centrally ...,364,520,Kildare,5
3,"87 Old Willow Park, Athlone, Athlone, Co. West...",Single Room,Shared Bathroom,House,3,"May 1st, 2023",3 Months,2,No,Female,13/04/2023,168,,,this fabulous newly refurbished property old w...,405,400,Westmeath,5
4,"62 Lanesborough Mews, Dublin 11, Dublin 11",Double Room,Shared Bathroom,Apartment,1,Immediately,6 Months,2,No,Male / Female,12/04/2023,1110,53.402606,-6.294961,this spaciouse double room 3 bed room apartmen...,438,950,Dublin 11,9


## Save final dataset

In [41]:
# write the final dataset to CSV without the row index
df.to_csv(r'final_daft_listings.csv', index = False)