In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from sklearn import preprocessing
from sklearn.feature_extraction.text import (CountVectorizer, 
                                             TfidfVectorizer)
from sklearn.model_selection import (train_test_split, 
                                     cross_validate, 
                                     GridSearchCV)
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
import random
import pymongo
import string
pd.options.display.max_columns = 35

In [6]:
df = pd.read_csv('../data/housing-data.csv')

Good to note that this data can be retrieved from MongoDB

In [3]:
# mc = pymongo.MongoClient()
# db = mc['housing-recommender']
# training = db['training']

This is the word tokenizing method - not going to use

In [15]:
sent_tokens = [sent_tokenize(str(desc).lower()) for desc in df['DESC']]

sent_tokens

tokens = [[word_tokenize(sent) for sent in desc] for desc in sent_tokens]

stopwords_ = set(stopwords.words('english'))

punctuation_ = set(string.punctuation)

def filter_tokens(sent):
    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

tokens_filtered = [[filter_tokens(sent) for sent in desc] for desc in tokens]

stemmer_porter = PorterStemmer()
tokens_stemporter = [[list(map(stemmer_porter.stem, sent)) for sent in desc] for desc in tokens_filtered]
print("--- sentence tokens (porter): {}".format(tokens_stemporter[0]))

stemmer_snowball = SnowballStemmer('english')
tokens_stemsnowball = [[list(map(stemmer_snowball.stem, sent)) for sent in desc] for desc in tokens_filtered]
print("--- sentence tokens (snowball): {}".format(tokens_stemsnowball[0]))

def join_sent_ngrams(input_tokens, n):
    # first add the 1-gram tokens
    ret_list = list(input_tokens)
    
    #then for each n
    for i in range(2,n+1):
        # add each n-grams to the list
        ret_list.extend(['-'.join(tgram) for tgram in ngrams(input_tokens, i)])
    
    return(ret_list)

snowball = SnowballStemmer('english')

sent_tokens = []
word_tokens = []
filtered_tokens = []
for idx, row in df.iterrows():
    sent_tokens.append([sent_tokenize(desc) for desc in row['DESC']])
    word_tokens.append([[word_tokenize(sent.lower()) for sent in desc] for desc in sent_tokens[idx]])
    filtered_tokens.append([filter_tokens(sent) for sent in tokens_lower[idx]])
snowball_lst = [[list(map(snowball.stem, sent)) for sent in desc] for desc in filtered_tokens]

--- sentence tokens: [['cool', 'well', 'cared', 'remodeled', 'unit', 'gated', 'cedar', 'west', 'community'], ['granite', 'counter-tops', 'stainless', 'steel', 'appliances', 'new', 'carpeting', 'bamboo', 'hardwoods', 'patio', 'storage', 'low', 'dues', 'include', 'water/sewer/garbage/ins'], ['full', 'size', 'washer', 'dryer', 'appliances', 'included'], ['plenty', 'additional', 'parking', 'secure', 'gated', 'community', 'close', 'boeing', 'kasch', 'park', 'walter', 'hall', 'golf', 'course', 'mukilteo', 'ferry', 'easy', 'i-5', 'access']]
--- sentence tokens: [['gorgeous', 'views', 'sound', 'large', 'lot', 'w/', 'detached', 'garage'], ['highlights', 'include', 'gleaming', 'wood', 'floors', 'cozy', 'fireplace', 'spacious', 'beds'], ['updated', 'kitchen', 'boasts', 'island', 'w/', 'bar', 'seating', 'granite', 'countertops', 'large', 'windows', 'natural', 'light', 'stunning', 'views'], ['updated', 'fully', 'finished', 'basement', 'includes', 'master', 'bed', 'offers', 'ensuite', 'bathroom', 'w

NameError: name 'tokens_lower' is not defined

In [None]:
from nltk.stem import WordNetLemmatizer

In [81]:
le = preprocessing.LabelEncoder()
le.fit(df['DESC'])

LabelEncoder()

In [85]:
corpus = [row['DESC'] for idx, row in df.iterrows()]

In [121]:
tf = CountVectorizer()

document_tf_matrix = tf.fit_transform(corpus).todense()

print(sorted(tf.vocabulary_))
print(document_tf_matrix)

tfidf = TfidfVectorizer()
document_tfidf_matrix = tfidf.fit_transform(corpus)
print(sorted(tfidf.vocabulary_))
print(document_tfidf_matrix.todense())

document_tfidf_matrix.todense().shape

df['TFIDF'] = pd.Series(np.array(document_tfidf_matrix.todense()).reshape(-1))

df['TFIDF'].value_counts()

In [142]:
y = le.transform(df['DESC'])
# y = df['category'].values
X = df[['DESC']]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# data_train, data_test = df[train_index], df[test_index]
tfidf = TfidfVectorizer()
tfidf.fit(X_train['DESC'].values)

X_train_matrix = tfidf.transform(X_train['DESC'].values)

X_test_matrix = tfidf.transform(X_test['DESC'].values)

X_train_matrix.shape

X_train_matrix.todense()

X_test_matrix.shape

X_train_matrix.todense().shape

mnb = MultinomialNB(alpha=1)
mnb.fit(X_train_matrix.todense(), y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [148]:
mnb.predict(X_test_matrix.todense())

array([266, 330, 341, 312, 216, 331,   2, 115, 296, 266, 153,  39, 309,
       219, 230,  15,  39, 151,  39, 146, 153, 283, 248, 337,  38, 325,
       183, 141, 284,  64, 105, 205, 235,  37,  14, 275,  59, 120,  67,
       299, 342, 134, 230, 216, 104, 325, 258, 303, 335, 216,  33, 283,
        50, 207, 153,  25, 347, 151, 284, 331,  69, 149,  19, 216, 104,
       153, 150,  67, 255, 123, 335, 105,  11, 115, 279,  69, 195, 207,
       268, 317, 105, 274, 115,  37, 149,  39, 150, 185])

K Means Clustering - Base Model

In [8]:
tfidf = TfidfVectorizer(stop_words='english', max_features=50)
tfidf.fit(df.DESC.values)
desc_tfidf = tfidf.transform(df.DESC.values)

In [22]:
km = KMeans(n_clusters=30)
km.fit(desc_tfidf.todense())

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=30, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [32]:
X_test = ['Take in westerly corridor views of the Puget Sound from your front deck and living areas. Imagine adding a second story to capture bigger views of the Sound and Olympic Mountains. This solid 1940s home sits proudly on a large lot with garden areas, private driveway & 2-car garage. In recent years, new water line run to the street, sewer lined and a trench drain system was installed in the basement. A house loved for 80 years by its original owners. Now ready to be reinvented and loved again.', 
          'Opportunity knocks! 1 bed unit in Duncan Place’s coveted 11 stack. Gorgeous floor to ceiling windows & private deck give the unit a sleek modern feel. Bright & sunny w/ open concept kitchen & center island -perfect for entertaining. Upgrades: new flooring, recessed lighting, designer blinds, & fresh paint. In unit laundry & secure parking. Building amenities: roof deck w/ amazing views, secure access & storage unit. EZ commute by bus or future LT Rail. No rental cap & low dues = great investment.']
tfidf_test = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_test.fit(X_test)
test_desc = tfidf_test.transform(X_test)
test_desc
list(km.predict(test_desc))

[6, 16]

In [26]:
df['labels'] = pd.Series(km.labels_)

In [44]:
df[df['labels'] == 20]['DESC']

17     Welcome home! Amazing craftsmanship, open, lig...
35     Natural light filled home w/ 3 bedrooms & full...
54     One of the largest floorpans in Seola Gardens,...
55     Immaculate move-in ready condo at coveted DayB...
104    Bright & spacious 3 bdrm +bonus rooms w/ custo...
145    A great opportunity to live in Wisteria Statio...
159    Presenting Staunton Cove, a quiet community of...
191    Welcome to highly sought after Point Edwards. ...
232    Opportunity knocks!Enjoy a prime location near...
309    Breathtaking Bellevue residence w/ views of La...
329    Cosmetic improvements and updates will make th...
335    Spacious 2002 Craftsman w/ versatile living sp...
343    2 bedroom 1.75 bath condo in desirable Renton ...
Name: DESC, dtype: object

In [45]:
top_features = np.argmax(km.cluster_centers_, axis=1)

In [11]:
top_features

array([44, 38, 31, 33, 38, 35, 44, 19, 38, 25, 37, 19, 39, 48, 26, 38, 24,
       30,  5, 46, 12, 44,  8, 45, 12, 38, 45,  2, 21, 17])

In [156]:
desc_tfidf[:, top_features].todense()

matrix([[ 0.14887418,  0.14887418,  0.        , ...,  0.23534754,
          0.        ,  0.        ],
        [ 0.58577568,  0.19525856,  0.        , ...,  0.        ,
          0.        ,  0.26320303],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.2775148 ,  0.24483353],
        ..., 
        [ 0.        ,  0.        ,  0.18742187, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.1850518 ,  0.        ,  0.15254881, ...,  0.29253888,
          0.        ,  0.        ],
        [ 0.6276003 ,  0.        ,  0.17245564, ...,  0.16535684,
          0.        ,  0.        ]])

In [157]:
for centroid in km.cluster_centers_:
    dense_art = desc_tfidf.todense()
    indices = np.argsort(centroid)[:10]
    #dense_art[:, indices]
    reversed_vocab = {index: word for word, index in tfidf.vocabulary_.items()}
    print([reversed_vocab[index] for index in indices])

['beautiful', 'level', 'car', 'lot', 'light', 'fenced', 'yard', 'living', 'bedroom', 'windows']
['enjoy', 'views', 'unit', 'storage', 'parking', 'lake', 'location', 'bedrooms', 'level', 'access']
['gas', 'access', 'walk', 'views', 'updated', 'unit', 'storage', 'ss', 'spacious', 'parking']
['views', 'unit', 'parking', 'features', 'enjoy', 'closet', 'access', 'deck', 'yard', 'storage']
['lake', 'windows', 'floors', 'unit', 'features', 'storage', 'location', 'granite', 'level', 'light']
['yard', 'master', 'bedrooms', 'suite', 'home', 'deck', 'windows', 'updated', 'bedroom', 'walk']
['unit', 'lake', 'suite', 'area', 'dining', 'windows', 'updated', 'master', 'open', 'granite']
['bedroom', 'windows', 'location', 'fenced', 'bedrooms', 'room', 'space', 'fireplace', 'patio', 'gas']
['unit', 'car', 'access', 'garage', 'open', 'ss', 'parking', 'location', 'closet', 'close']
['beautiful', 'updated', 'unit', 'parking', 'windows', 'enjoy', 'great', 'fenced', 'fireplace', 'access']
['updated', 'area'

In [158]:
reversed_vocab = {index: word for word, index in tfidf.vocabulary_.items()}
[reversed_vocab[index] for index in top_features]

['new',
 'large',
 'room',
 'floor',
 'bath',
 'unit',
 'yard',
 'lake',
 'room',
 'home',
 'beautiful',
 'bath',
 'views',
 'views',
 'parking',
 'lake',
 'walk',
 'windows',
 'great',
 'views',
 'access',
 'updated',
 'space',
 'lot',
 'level',
 'close',
 'area',
 'home',
 'unit',
 'enjoy']

In [161]:
random_samples = np.array(random.sample(list(np.ndenumerate(km.labels_)), 100))

In [163]:
indeces = [item[0][0] for item in random_samples]

centroids = [item[1] for item in random_samples]
stuff = df.iloc[indeces][['PROPERTY TYPE', 'DESC']]

In [165]:
stuff['centroid'] = centroids

In [165]:
df['FAVORITED'] = 'N'

In [167]:
df

Unnamed: 0,ADDRESS,BATHS,BEDS,CITY,DAYS ON MARKET,DESC,HOA/MONTH,LATITUDE,LOCATION,LONGITUDE,LOT SIZE,PRICE,PROPERTY TYPE,SALE TYPE,SQUARE FEET,SQUAREFT,STATE,STATUS,URL,YEAR BUILT,ZIP,_id,ts,labels,FAVORITED
0,711 153rd St SW,3.00,4,Lynnwood,3,Gorgeous 4 bed 3 bath Lynnwood home w/ high-en...,0.0,47.859744,Lynnwood,-122.243075,7405.0,750000,Single Family Residential,MLS Listing,3252.0,231.0,WA,Active,http://www.redfin.com/WA/Lynnwood/711-153rd-St...,2016.0,98087,5af0cc9461fbeb7738834c4b,2018-05-07 15:00:40.693,28,N
1,4736 S 164th St,1.00,3,Tukwila,3,Head down the driveway to the privacy of this ...,0.0,47.457014,Tukwila,-122.273428,18160.0,460000,Single Family Residential,MLS Listing,1480.0,311.0,WA,Active,http://www.redfin.com/WA/Tukwila/4736-S-164th-...,1960.0,98188,5af0cc9461fbeb7738834c4c,2018-05-07 15:00:40.693,8,N
2,4123 178th Lane SE #2,2.25,2,Bellevue,3,Gorgeous 2Bed 2.25Bath Townhome w/ Stunning Vi...,318.0,47.570999,South Lake Sammamish,-122.102356,114792.0,469000,Townhouse,MLS Listing,962.0,488.0,WA,Active,http://www.redfin.com/WA/Bellevue/4123-178th-L...,1988.0,98008,5af0cc9461fbeb7738834c4d,2018-05-07 15:00:40.693,5,N
3,29003 55th Ave S,2.25,5,Auburn,3,Rare opportunity for this very unique home! Th...,0.0,47.341607,West Hill,-122.266415,49813.0,699950,Single Family Residential,MLS Listing,2750.0,255.0,WA,Active,http://www.redfin.com/WA/Auburn/29003-55th-Ave...,1971.0,98001,5af0cc9461fbeb7738834c4e,2018-05-07 15:00:40.693,25,N
4,9252 11th Ave NW,1.75,3,Seattle,3,"Situated on lg corner lot, meticulously mainta...",0.0,47.697541,Crown Hill,-122.369796,7806.0,725000,Single Family Residential,MLS Listing,2060.0,352.0,WA,Active,http://www.redfin.com/WA/Seattle/9252-11th-Ave...,1942.0,98117,5af0cc9461fbeb7738834c4f,2018-05-07 15:00:40.693,22,N
5,20325 21st Place W,2.75,4,Lynnwood,3,Welcome home to entertainers dream. Soaring 2 ...,42.0,47.813714,Alderwood Manor,-122.262278,9583.0,775000,Single Family Residential,MLS Listing,2859.0,271.0,WA,Active,http://www.redfin.com/WA/Lynnwood/20325-21st-P...,2014.0,98036,5af0cc9461fbeb7738834c50,2018-05-07 15:00:40.693,7,N
6,20306 25th Place W,2.00,4,Lynnwood,3,First time on the market in over 30 years! Thi...,0.0,47.814317,Edmonds,-122.267881,7841.0,525000,Single Family Residential,MLS Listing,1696.0,310.0,WA,Active,http://www.redfin.com/WA/Lynnwood/20306-25th-P...,1965.0,98036,5af0cc9461fbeb7738834c51,2018-05-07 15:00:40.693,18,N
7,14011 Ash Wy Unit A,1.50,2,Lynnwood,3,"Truly one-of-a-kind, this spacious 2 bedroom +...",0.0,47.870600,Lynnwood,-122.246966,16224.0,429000,Condo/Co-op,MLS Listing,1632.0,263.0,WA,Active,http://www.redfin.com/WA/Lynnwood/14011-Ash-Wa...,1964.0,98087,5af0cc9461fbeb7738834c52,2018-05-07 15:00:40.693,6,N
8,507 223rd Place SE,2.50,5,Bothell,3,"Crystal Ridge home, located on quiet cul de sa...",0.0,47.795823,Bothell,-122.224872,12197.0,824900,Single Family Residential,MLS Listing,2686.0,307.0,WA,Active,http://www.redfin.com/WA/Bothell/507-223rd-Pl-...,1989.0,98021,5af0cc9461fbeb7738834c53,2018-05-07 15:00:40.693,8,N
9,9160 53rd Ave W,2.50,3,Mukilteo,3,Immaculate 3 bedroom home tucked away in a gor...,0.0,47.915470,Mukilteo,-122.306072,12362.0,649950,Single Family Residential,MLS Listing,3125.0,208.0,WA,Active,http://www.redfin.com/WA/Mukilteo/9160-53rd-Av...,1999.0,98275,5af0cc9461fbeb7738834c54,2018-05-07 15:00:40.693,1,N
