# Cloud MongoDB + Vectorizer + KMeans

- Load company descriptions from MongoDB and vectorize them with CountVectorizer
- Build clusters with KMeans

In [1]:
# Imports
import pymongo

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

%matplotlib inline

## MongoDB Connection

In [2]:
import os

# Connection string for the cloud cluster. Set the MONGODB_URI environment
# variable to point the notebook at another deployment or to supply real
# credentials without editing this cell.
# NOTE(review): the fallback below embeds a user name and password in the
# source; keep real secrets out of version control and rotate any that were
# committed.
mongodb_uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dbuser:dbpassword@cluster0-tdmr9.mongodb.net/test?retryWrites=true&w=majority",
)

# Load the client
client = pymongo.MongoClient(mongodb_uri)

# Load the database
database = client['sample_training']

# Load a collection
companies = database['companies']

# Load the instances (find() with no filter returns a cursor over every
# document in the collection)
instances = companies.find()

# Build a dataframe using the instances (one row per document)
dataframe = pd.DataFrame(instances)

## Cleaning, transforming and encoding of data

In [39]:
# Keep only the free-text column used for vectorizing. The explicit copy
# detaches the result from the full frame, so the later column assignment
# and cleaning steps never act on a slice of another DataFrame (this is what
# triggers pandas' chained-assignment warning).
dataframe = dataframe[['description']].copy()

# Treat empty strings as missing values, then drop the rows that are left
# without a description. The original index labels are kept on purpose.
dataframe = dataframe.replace('', np.nan).dropna(how='all')

dataframe.head()

Unnamed: 0,description
0,Content discovery service
1,Read Unlimited Books
2,Social network
4,Technology Platform Company
5,Real time communication platform


## Build the clusters

In [51]:
# CountVectorizer options worth tuning:
# binary:          True | False        -- Record presence only instead of counts
# lowercase:       True | False        -- Lowercase the text before tokenizing
# analyzer:        'word' | 'char'     -- Build features from words or characters
# strip_accents:   'ascii' | 'unicode' -- Fold accented characters (é -> e)
# stop_words:      ['word', ...]       -- Words to ignore
# ngram_range:     (min, max)          -- Sizes of the word combinations to extract
# min_df:          number              -- Minimum number of documents containing a term

# Keep only 2- and 3-word phrases that occur in at least 4 descriptions.
count_vectorizer = CountVectorizer(min_df=4, ngram_range=(2, 3))
words_matrix = count_vectorizer.fit_transform(dataframe['description'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it.
feature_names_getter = getattr(count_vectorizer, 'get_feature_names_out', None)
if feature_names_getter is None:
    feature_names_getter = count_vectorizer.get_feature_names
words_list = list(feature_names_getter())

# Fit KMeans for several cluster counts and print the silhouette of each.
# When the loop ends, `cluster` and `predict` hold the last fit (k = 9).
for k in range(2, 10):
    cluster = make_pipeline(MaxAbsScaler(), KMeans(n_clusters=k, random_state=0))
    cluster.fit(words_matrix)
    predict = cluster.predict(words_matrix)

    # NOTE(review): the score is computed on the unscaled counts while the
    # clusters were fitted on MaxAbsScaler output -- confirm this is intended.
    silhouette = silhouette_score(words_matrix, predict)
    print("K = {} - Silhouette: {}".format(k, silhouette))

K = 2 - Silhouette: 0.5305733426572677
K = 3 - Silhouette: 0.4717291693332289
K = 4 - Silhouette: 0.477122675528153
K = 5 - Silhouette: 0.4736389544899812
K = 6 - Silhouette: 0.4778357456147973
K = 7 - Silhouette: 0.479937642030038
K = 8 - Silhouette: 0.46310718290760083
K = 9 - Silhouette: 0.46149689367279245


## Clusters & Examples

In [55]:
# Label every description with the cluster assigned by the last fitted model.
dataframe['cluster'] = predict

# `centroids` is not defined anywhere else in this notebook, so read the
# cluster centers from the fitted pipeline (make_pipeline names the KMeans
# step 'kmeans'). The centers live in the MaxAbsScaler-scaled feature space.
centroids = cluster.named_steps['kmeans'].cluster_centers_

# Use a dedicated loop name so the fitted `cluster` pipeline is not
# overwritten, which keeps this cell safe to re-run.
for cluster_id in np.unique(predict):
    members = dataframe[dataframe['cluster'] == cluster_id]

    # Highest-weighted n-gram of this cluster's centroid.
    rank = pd.Series(centroids[cluster_id, :], index=words_list).sort_values().tail(1)
    # First description assigned to the cluster, shown as a sample.
    example = members['description'].iloc[0]
    # Per-column non-null counts; the first entry is the 'description' column.
    count = members.count()

    print("Cluster {}".format(cluster_id))
    print("--  Key: {}".format(rank.keys()[0]))
    print("--  Value: {}".format(rank.values[0]))
    print("--  Count: {}".format(count.values[0]))
    print("--  Example: {}".format(example))

    print()

Cluster 0
--  Key: social media
--  Value: 0.014729209154769997
--  Count: 4413
--  Example: Content discovery service

Cluster 1
--  Key: social network
--  Value: 1.0
--  Count: 108
--  Example: Social network

Cluster 2
--  Key: management software
--  Value: 1.0
--  Count: 43
--  Example: Risk Management Software

Cluster 3
--  Key: marketing and advertising
--  Value: 1.0
--  Count: 7
--  Example: Marketing and Advertising 

Cluster 4
--  Key: search engine
--  Value: 1.0
--  Count: 59
--  Example: job-focused vertical search engine

Cluster 5
--  Key: open source
--  Value: 1.0
--  Count: 17
--  Example: Open Source as a service

Cluster 6
--  Key: web design
--  Value: 1.0
--  Count: 44
--  Example: Web Design and Development

Cluster 7
--  Key: software solutions
--  Value: 1.0
--  Count: 20
--  Example: enterprise software solutions

Cluster 8
--  Key: engine for
--  Value: 1.0
--  Count: 5
--  Example: search engine for music

