# Palo Alto POI and Clustering

I am going to use basic nltk packages for stopwords, tokens, vectorizer, Stemmer and Lemmatization. Scikit-learn for Vectorizer, Normalizer, metrices and clustering algorithm(KMean)

In [115]:
# importing all necessary packages
from __future__ import print_function
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

import numpy as np
import pandas as pd

import string
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
stemmer = SnowballStemmer('english')
from nltk.corpus import stopwords

import urllib.request
from bs4 import BeautifulSoup
user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
headers={'User-Agent':user_agent,}

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

Scrapping the data from few Travel related websites using BeautifulSoup and making ready for preprocessing

In [116]:
'''
# Scrapping data from few pages using BeautifulSoup
'''

# method for scrapping the data
def scrap_data(url):
    request=urllib.request.Request(url,None,headers) #The assembled request
    response = urllib.request.urlopen(request).read()

    soup = BeautifulSoup(response, "html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

Code to read all urls and save to local file for further usages.

In [3]:
# scrap data and save to file
url = ['https://wikitravel.org/en/Palo_Alto']
urls = ['https://www.tripadvisor.in/Attractions-g32849-Activities-Palo_Alto_California.html', 'https://wikitravel.org/en/Palo_Alto', 'https://en.wikivoyage.org/wiki/Palo_Alto', 'https://en.wikipedia.org/wiki/List_of_attractions_in_Silicon_Valley','http://vacationidea.com/california/best-things-to-do-in-palo-alto-ca.html', 'https://wikitravel.org/en/Palo_Alto']
data = ""
for each_url in urls:
    data += scrap_data(each_url)
with open("data.txt", "w", encoding="utf-8") as text_file:
    text_file.write(data)

methods to read data from files and clean the recieved data. Removed punctuation, whitespaces, numeric data, and stopwords. Also have applied stemming on the tokens. Any tokens of one length is removed.

In [87]:
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
print("%d documents" % len(dataset.data))

3387 documents


In [152]:
lemmatizer = WordNetLemmatizer()

import re

EMAIL_REGEX = re.compile(r"[^@]+@[^@]+\.[^@]+")
news_data = dataset.data

# load data from file
def load_doc(filename):
    file = open(filename, 'r', encoding="utf-8")
    text = file.read()
    file.close()
    return text

# clean data and stem it
def clean_doc(data):
    tokens = data.lower().split()
    # remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove all numeric data
    tokens = [word for word in tokens if word.isalpha()]
    # Check for email
    tokens = [word for word in tokens if not EMAIL_REGEX.match(word)]
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # remove words with length 1
    tokens = [w for w in tokens if len(w) > 1]
    # remove hyperlinks
    tokens = [w for w in tokens if not "www" in w]
    # remove words that is not english
    tokens = [w for w in tokens if wordnet.synsets(w)]
    #stemmed_tokens = [stemmer.stem(word) for word in tokens]
    # lemmatizing the words
    lemma_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # removing duplicates words
    tokens = list(set(lemma_tokens))
    return tokens

filename = 'data.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['ridge', 'bullet', 'served', 'pedigree', 'crowning', 'artisan', 'flaky', 'read', 'louisville', 'produced', 'irrigation', 'special', 'bird', 'intended', 'possible', 'arthur', 'also', 'huntsville', 'consists', 'resort', 'coming', 'falter', 'family', 'californian', 'project', 'referred', 'padre', 'office', 'listed', 'rare', 'bruno', 'pizza', 'norway', 'concentrate', 'planetarium', 'try', 'major', 'jean', 'emphasis', 'every', 'cardinal', 'use', 'transport', 'drive', 'monte', 'understated', 'asian', 'distinctive', 'become', 'palace', 'peter', 'moving', 'inspired', 'needed', 'tidal', 'six', 'tap', 'across', 'augustine', 'accented', 'offered', 'complimentary', 'connect', 'black', 'bruges', 'duluth', 'fargo', 'elope', 'camden', 'sharing', 'meat', 'appears', 'privacy', 'preparation', 'naples', 'prescott', 'dormitory', 'moment', 'foothill', 'hexagonal', 'sport', 'garden', 'expectation', 'pet', 'dc', 'nh', 'created', 'seeing', 'dayton', 'size', 'athletic', 'operating', 'missouri', 'mini', 'check

In [153]:
all_data = []
for each in news_data:
    all_data.append(clean_doc(each))
flat_list = [item for sublist in all_data for item in sublist]

In [158]:
# vectorize doc using tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tokens)
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 2190, n_features: 2105


In [135]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vect.fit_transform(dataset.data)
print("n_samples: %d, n_features: %d" % X_vec.shape)

n_samples: 3387, n_features: 43255


Performing dimension reduction using latent semantic analysis(LSA) as it works well with spacy sparce matrix and tfidf vector.

In [155]:
# Convert spacy sparce matrix to pandas dataframe
coo = X.tocoo(copy=False)
df = pd.DataFrame({'index': coo.row, 'col': coo.col, 'data': coo.data}
                 )[['index', 'col', 'data']].sort_values(['index', 'col']
                 ).reset_index(drop=True)
df.head()

Unnamed: 0,index,col,data
0,0,16950,1.0
1,1,4479,1.0
2,2,15327,1.0
3,3,17644,1.0
4,4,16481,1.0


In [159]:
print("Performing dimensionality reduction using LSA")

# Normalizing the data for best Kmean clustering performance
svd = TruncatedSVD()
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_X = lsa.fit_transform(X)

# check variance
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()


# #############################################################################
# Do the actual clustering    
def do_clustering(X, n_clust, need_model = False):
    km = MiniBatchKMeans(n_clusters=n_clust, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=1)
    #km = KMeans(n_clusters=n_clust, init='k-means++', max_iter=300, n_init=10)

    print("Clustering sparse data with %s" % km)

    km.fit(X)

    score = "%0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000)
    print('Silhouette Coefficient: ',score)
    
    # return Silhouette score or model which ever is need by method
    if need_model is False:
        return float(score)
    else:
        return km

# getting best Silhouette score with that model
score_lst = []
for i in range(2, 8):
    score= do_clustering(lsa_X, i)
    score_lst.append(score)
get_max_score_index = score_lst.index(min(score_lst))
n_clust = get_max_score_index + 2

print('------------------------------------------------------------------')
print('Best Silhouette Coefficient score: {} with cluster number: {}'.format(min(score_lst), n_clust))
print('------------------------------------------------------------------')

# getting model and finding best cluster
km = do_clustering(lsa_X, n_clust, need_model = True)
print("Top terms per cluster:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(n_clust):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Performing dimensionality reduction using LSA
Explained variance of the SVD step: 0%

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=2,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=1)
Init 1/1 with method: k-means++
Inertia for init 1/1: 587.800930
Minibatch iteration 1/300: mean batch inertia: 0.575947, ewa inertia: 0.575947 
Minibatch iteration 2/300: mean batch inertia: 0.570472, ewa inertia: 0.570949 
Minibatch iteration 3/300: mean batch inertia: 0.566611, ewa inertia: 0.566989 
Minibatch iteration 4/300: mean batch inertia: 0.593639, ewa inertia: 0.591316 
Minibatch iteration 5/300: mean batch inertia: 0.594751, ewa inertia: 0.594452 
Minibatch iteration 6/300: mean batch inertia: 0.589957, ewa inertia: 0.590349 
Minibatch iteration 7/300: mean batch inertia: 0.593635, ewa inertia: 0.593348 
Minibatch iteration 8/30

As seen here, best Silhouette score is for 2 clusters. So, we will take cluster number as 2 for our KMeans model and will train the model. As seen in the above result, 1st cluster is talking about general place at Palo Alto, while 2nd cluster is talking about educational place. The result is little ambigious as scrapped data is less and not much organized and also we need to fine tuned the model in the next level.