# Libraries

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict

from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import re
import pickle

from wordcloud import WordCloud
#pd.set_option('display.max_colwidth', -1)

After web scrapping we created one single data frame with columns 'price', 'locali', 'superficie', 'bagni', 'piano', 'description' and stored it as a .csv file for future usage. 

In [2]:
#Importing data into memory from the stored .csv file
data = pd.read_csv("data_houses.csv", sep='\t', encoding='utf-8')
data.columns

Index(['price', 'locali', 'superficie', 'bagni', 'piano', 'description'], dtype='object')

In [3]:
#The number of rows before cleaning
data.shape

(23203, 6)

In [4]:
#Checking null values of the dataset
data.isnull().sum()

price             1
locali          354
superficie       24
bagni           491
piano          3793
description    5045
dtype: int64

In [5]:
#Checking data types of the dataset
data.dtypes

price          object
locali         object
superficie     object
bagni          object
piano          object
description    object
dtype: object

In [6]:
data.loc[:,['price','locali','superficie','bagni','piano']].describe()

Unnamed: 0,price,locali,superficie,bagni,piano
count,23202,22849,23179,22712,19410
unique,2335,6,587,315,15
top,€ 199.000,3,90,1,1
freq,338,7177,962,10007,4236


In [7]:
#The values where the number of bathrooms are 1, 2 or 3 should be kept because they are the most common ones and other numbers 
#are rare and therefore considered outliers(they are probably buildings_?)
#also 3+ is eliminated cause it has ambigious meaning

In [8]:
#We decided to drop the rows where the value of the floor is 11+ because there aren't many of those rows and they don't show
data.piano.value_counts()
#eliminate R, A, S and 11+  cause it is ambigious

1      4236
2      3308
T      2891
3      2419
A      1652
4      1570
R      1008
5       924
6       463
S       381
7       350
8       134
9        32
11+      26
10       16
Name: piano, dtype: int64



After _analyzing_ the possible values of each attribute of the dataset we decided the following about the process of 
**cleaning** the dataset before making two separate datasets for clustering.
    
The data cleaning process:
    1. Remove nan values
    2. Attribute price should be converted to integer data type and '€' removed from the values
    3. In attribute locali remove rows where the value of locali is 5+ and convert attribute to integer data type
    4. Attribute superficie has some strange values because of the scrapping, like dates '14/02/19' '18/01/19' and all the values that don't make sense should be removed and the attribute should be converted to the integer data type
    5. For attribute piano replace T->0 and drop the rows with piano values A, R, S and 11+ cause they are ambiguous
    
The goal of the cleaning process was to eliminate all the ambiguous values and categorical values that can't have a meaningful numerical representation.

In [9]:
def clean_data(data):
    """
    Method that removes nan values and cleanes the data
    
    Input: dataframe
    Output: cleaned dataframe
    """
    
    #Remove rows where there aren't all values present
    data.dropna(inplace=True)

    #try to convert price to int and remove € 
    for i in data.index:
    #Becasuse of the web scrapping some prices had strings and text in this way we dealt with it if exception appears
    #'da' is because some prices were scrapped with the word 'da' before the price
        try:
            data.price[i]=int(data.price[i].replace('€', '').strip().replace('.','').replace('da',''))
    #before word class there is price we need
        except:
            try:
                data.price[i]=int(data.price[i].split('class')[0].replace('€', '').strip().replace('.',''))
           #in case of the bad scrapping value (e.g.just text) just drop those rows
            except:
                data.drop(i,inplace=True)
            
    #convert to int superficie and remove / and . in order to convert to int normally
    data.superficie=data.superficie.replace('[/]', '', regex=True).apply(lambda x: int(str(x).replace('.','')))
    
    #Drop rows where values of locali is 5+ and convert to int
    #strip is because of the web scrapping process which took whitespace
    data['locali']=data['locali'].apply(lambda x: x.strip())
    data.drop(data[ data['locali']=='5+'].index,inplace=True)


    #drop A, R, S and 11+  cause it is ambigious 
    data.drop(data[(data['piano']=='A') | (data['piano']=='R') | ( data['piano']=='S')| ( data['piano']=='11+')].index,inplace=True)
    #Attribute piano replace T->0 cause piano T is 'terro' which means it is floor 0
    data['piano']=np.where(data['piano']=='T', 0, data['piano'])     
    
    #remove whitespace from the values
    data.bagni=data.bagni.apply(lambda x: x.strip())
    #remove all the rows where the value of bagni is not 1,2,3...like 3+, cause other values don't have so many value counts
    data.drop(data[( data['bagni']!='1')&( data['bagni']!='2')&( data['bagni']!='3')].index,inplace=True)    

    #convert type of "price","locali","piano", "bagni" to numeric instead of object
    data[["price","locali","piano", "bagni"]] = data[["price","locali","piano", "bagni"]].apply(pd.to_numeric)
    
    #reset index so it starts from 0 to last row number not with scpapped number indices, but with consistent range
    data.reset_index(inplace=True)
    data.drop('index',axis=1,inplace=True)
    
    return data

In [10]:
cleaned_data=clean_data(data)

In [11]:
cleaned_data.index

RangeIndex(start=0, stop=10773, step=1)

In [34]:
cleaned_data.head(1)

Unnamed: 0,price,locali,superficie,bagni,piano,description
0,225000,2,50,1,1,"papillo eur\r\r\n PAPILLO EUR in elegante complesso residenziale rifinitissimo bilocale composto da soggiorno con angolo cottura, stanza da letto bagno e ampio balcone . con Rifiniture di pregio, pavimenti in parquet / grees, infissi in legno con vetro camera e porte in noce, grate nel salone, riscaldamento termoautonomo con caldaia centralizzata, aria condizionata, videocitofono, porta blindata, serramenti elettrici con chiusura centralizzata, antenna satellitare, isolamento termo acustico, pannelli solari e fotovoltaici , rilevatori elettronici di gas. Tutte le camere sono fornite di impianto antifurto, presa antenna satellitare e presa telefonica.div\r\r\ndiv\r\r\nORARI lunedi chiusidiv\r\r\n martedi 10:00-17:00div\r\r\n mercoledi 10:00-17:00div\r\r\ndivgiovedi 10:00-17:00div\r\r\ndivvenrdi 10:00-17:00div\r\r\ndivsabato 10:00-17:00div\r\r\n domenica 10:00-13:00\r\r\n"


In [14]:
#The number of rows after cleaning the dataset
#10773
cleaned_data.shape

(10773, 6)

In [15]:
#After cleaning the data(dropping rows where there aren't some values) there aren't any NaN values present
cleaned_data.isnull().sum()

price          0
locali         0
superficie     0
bagni          0
piano          0
description    0
dtype: int64

In [16]:
cleaned_data.dtypes

price           int64
locali          int64
superficie      int64
bagni           int64
piano           int64
description    object
dtype: object

# Datasets

Based on the single data frame we extracted using the web scrapping process we created the Description and the Information datasets

## Description dataset

(Description dataset)

    columns: description 

In [17]:
description_dataset=pd.DataFrame(cleaned_data.description)
pd.set_option('display.max_colwidth', -1)
description_dataset.head(1)

Unnamed: 0,description
0,"papillo eur\r\r\n PAPILLO EUR in elegante complesso residenziale rifinitissimo bilocale composto da soggiorno con angolo cottura, stanza da letto bagno e ampio balcone . con Rifiniture di pregio, pavimenti in parquet / grees, infissi in legno con vetro camera e porte in noce, grate nel salone, riscaldamento termoautonomo con caldaia centralizzata, aria condizionata, videocitofono, porta blindata, serramenti elettrici con chiusura centralizzata, antenna satellitare, isolamento termo acustico, pannelli solari e fotovoltaici , rilevatori elettronici di gas. Tutte le camere sono fornite di impianto antifurto, presa antenna satellitare e presa telefonica.div\r\r\ndiv\r\r\nORARI lunedi chiusidiv\r\r\n martedi 10:00-17:00div\r\r\n mercoledi 10:00-17:00div\r\r\ndivgiovedi 10:00-17:00div\r\r\ndivvenrdi 10:00-17:00div\r\r\ndivsabato 10:00-17:00div\r\r\n domenica 10:00-13:00\r\r\n"


## Information dataset

(Information dataset)

    columns: price, locali, superficie, bagni, piano

In [18]:
information_dataset=cleaned_data.drop(['description'],axis=1)
information_dataset.head(5)

Unnamed: 0,price,locali,superficie,bagni,piano
0,225000,2,50,1,1
1,339000,3,90,1,4
2,480000,4,125,2,4
3,135000,2,60,1,5
4,249000,2,75,1,1


In [19]:
#In this way we can reuse it and not repeat cleaning process every time we execute the notebook

In [20]:
#Save information and description datasets to two separate .csv files

In [21]:
information_dataset.to_csv('information_dataset.csv')

In [22]:
description_dataset.to_csv('description_dataset.csv')

In [23]:
#Make vocabulary

In [24]:
def preprocessing_text(df):
    """
    Method that returns filtered words from the text input 
    
    Input: string(text)
    Output: list(bag of words)
    """  
    #remove upper cases
    df=df.lower()
   
    #replacing new line sign '\n' '\r' and 'div' from html with a whitespace ' '    
    df=df.replace('\\n',' ').replace('\\r',' ').replace('div',' ')
    
    #for removing stop words
    stop_words = set(stopwords.words('italian')) 
    stop_words.add('div')
    stop_words.add('n')
    stop_words.add('b')

    #remove numbers
    df = re.sub("\d+", "", df)
    
    #for removing punctuations
    tokenizer = RegexpTokenizer(r'\w+')
    
    #to tokenize the string
    word_tokens = tokenizer.tokenize(df)     

    #stemming
    ps = PorterStemmer()
    filtered_words = [ps.stem(w) for w in word_tokens if not w in stop_words] 

    return filtered_words

In [None]:
def build_vocabulary(df):
    """
    Method that creates vocabulary
    
    Input: dataframe
    Output: vocabulary list 
    """  
    #list for vocabulary 
    vocabulary_lst=[]
    #preprocessing description and get res->list of lists, where each list i a list of filtered preproccesed words 
    res=df.description.apply(lambda x: preprocessing_text(x))   
    
    #vocabulary_set is a set used for making vocabulary with unique words

    vocabulary_set = set(res[0]).union(*res[1:])

    
    #mapping words into integers
    vocabulary={} 
    for k,v in enumerate(vocabulary_set):
        vocabulary[v]= k
    return res,vocabulary

In [None]:
res,vocabulary=build_vocabulary(description_dataset)

In [None]:
len(res[0])

In [None]:
len(vocabulary)

In [None]:
#Saving vocabulary as a dictionary into a "vocabulary.p" (pickle) file

#21370 what else should we eliminate from the words??
pickle.dump(vocabulary, open("vocabulary.p", "wb"))

In [27]:
#To load it into memory from file
vocabulary = pickle.load(open("vocabulary.p", "rb"))

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
every row is a separate house(which is considered as a separate document)

In [None]:
####NEED TO change and finish it in ORDER TO CALCULATE TF_IDF VALUES 
def compute_inverted_idx(res,vocabulary):
    """
    method that computes an inverted index
    
    input: res(list of lists), vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: inverted_idx(dictionary, key=term_id, value=list of document_ids(rows)) 
    """
    #initialize defaultdict for making an inverted index
    inverted_idx = defaultdict(list)
    #in every document look for every word and assign document id to the words which belong to it
    for idx,lst in enumerate(res):
        lst=((pd.Series(lst).value_counts())/len(lst)).sort_index()

        #lst=set(lst)
        for i,tf in enumerate(lst):
            inverted_idx[lst.index[i]].append((idx,tf))
    return inverted_idx

In [None]:
inverted_idx=compute_inverted_idx(res,vocabulary)

In [None]:
len(inverted_idx)
#dictionary->key=term_id,  value= (document_id,tf_value)

In [35]:
len(inverted_idx['saba'])

15

In [36]:
inverted_idx['saba']

[(133, 0.009259259259259259),
 (789, 0.011560693641618497),
 (1784, 0.020618556701030927),
 (2166, 0.009433962264150943),
 (2833, 0.012195121951219513),
 (3074, 0.006535947712418301),
 (3098, 0.008130081300813009),
 (3132, 0.02531645569620253),
 (3813, 0.012048192771084338),
 (4098, 0.037037037037037035),
 (6435, 0.03125),
 (6635, 0.008130081300813009),
 (8861, 0.04054054054054054),
 (8921, 0.017241379310344827),
 (10079, 0.020618556701030927)]

In [None]:
#Saving inverted_idx as a dictionary into a "inverted_idx.p" (pickle) file
pickle.dump(inverted_idx, open("inverted_idx.p", "wb"))

In [None]:
#dictionary->key=doc_id,word,  value=tfidf---. sequentially for each word in vocab
tf_idf_dic={}
total_num_docs=description_dataset.shape[0]

    #dictionary->key=term_id, value= (document_id,tf_value)
for term,tup_pair in inverted_idx.items():
    for doc_id,tf_value in tup_pair:
        tf_idf_dic[(doc_id,term)]= np.log(total_num_docs/len(inverted_idx[term]))*tf_value

In [None]:
#Saving inverted_idx as a dictionary into a "inverted_idx.p" (pickle) file

pickle.dump(tf_idf_dic, open("tf_idf_dic.p", "wb"))

In [None]:
len(tf_idf_dic)

In [None]:
tf_idf_dic[0,'gree']

In [31]:
description_dataset=pd.DataFrame(columns=list(vocabulary.keys()))
for row in range(data.shape[0]):
    temp_row_values=[]
    for word in vocabulary:
        try:
            temp_row_values.append(tf_idf_dic[(doc_id,word)])
        except:
            temp_row_values.append(0)
    description_dataset[row]=temp_row_values

MemoryError: 

In [None]:
#TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
#IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

def calculate_tf_idf(description_dataset,inverted_idx,vocabulary):
    """
    method that calculates tf-idf values
     
    input:  inverted_idx(dictionary, key=term_id, value=list of document_ids)
            vocabulary(dictionary of all unique words, key=term, value=term_id)
    output: tf_idf_dic(dictionary of tf_idf_values for all rows(docs), key=tuple(term,doc_id), value=tf_idf value)
    """ 
    
    return tf_idf_dic

In [25]:
#To load it into memory from file
inverted_idx = pickle.load(open("inverted_idx.p", "rb"))

# Clustering

This step consists in _clustering the house announcements_ using **K-means++** and choosing the **optimal** number of clusters using the **Elbow-Method**.

In [None]:
information_dataset=pd.read_csv('information_dataset.csv')

In [None]:
#should we normalize?

In [None]:
# Information dataset clustering

In [None]:
from sklearn import cluster
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
X=information_dataset.loc[:,['price','locali','superficie','bagni','piano']]
distorsions = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k,init='k-means++')
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(range(1, 11), distorsions)
plt.grid(True)
plt.title('Elbow curve')
# Inertia: Sum of distances of samples to their closest cluster center

We chose blabla clusters:
Becausee

In [None]:
k=3
kmeans = KMeans(n_clusters=k,init='k-means++')
kmeans.fit(X)

In [None]:
#centroids are:
kmeans.cluster_centers_

In [None]:
information_dataset.columns

In [None]:
cluster_representatives=pd.DataFrame(kmeans.cluster_centers_, columns= information_dataset.columns[1:])
cluster_representatives

In [None]:
a=cluster_representatives.melt()
b=a.value
b1=a.variable
plt.plot(b,b1,'o')

In [None]:
kmeans.cluster_centers_.shape[0]
labels=kmeans.labels_
centers = np.array(kmeans.cluster_centers_)

In [None]:
centers

In [None]:
#price centroids
centers[:,0]

In [None]:
number_of_clusters=kmeans.cluster_centers_.shape[0]

#centroidi klastera


#Centroid's visualization
#for price and superficie
centers = np.array(kmeans.cluster_centers_)
plt.scatter(centers[:,0], centers[:,2], marker="x", color='r')

In [None]:
# Description dataset clustering