# BERT embedding clustering with DBSCAN and t-SNE

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

# Raise pandas' display limits so long tables and long text cells are rendered in full.
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 200

## Load and prepare data

In [1]:
# Load the semicolon-separated feature table; start_dt / end_dt are parsed as datetimes.
df = pd.read_csv(
    'Data/jobcloud_features_v2.csv',
    sep=';',
    parse_dates=['start_dt', 'end_dt'],
)

In [4]:
# The English sentence embeddings are used because they clustered better than the multilingual ones.
# Multilingual alternative, kept for reference:
#dfe = pd.read_csv('../input/jobcloud-embeddings-multilingual-clean/sentence_embeddings_multilingual_clean.csv', index_col=0)
dfe = pd.read_csv(
    'Embeddings/sentence_embeddings_en_clean.csv',
    index_col=0,
)
# Peek at the first three embedding rows
dfe.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.326405,0.241112,0.175392,-0.020332,0.236746,0.045963,0.119029,-0.08981,-0.071976,-0.482344,...,0.341688,-0.19108,0.265163,-0.252638,-0.364661,-0.173336,-0.015667,-0.438569,-0.007369,0.186516
1,-0.165292,0.32402,0.090227,-0.297666,0.633303,-0.179377,0.406672,0.118778,-0.055366,-0.529219,...,-0.028928,-0.146348,0.63155,-0.185177,-0.017117,0.018831,0.051867,-0.614038,0.0691,-0.112539
2,-0.326405,0.241112,0.175392,-0.020332,0.236746,0.045963,0.119029,-0.08981,-0.071976,-0.482344,...,0.341688,-0.19108,0.265163,-0.252638,-0.364661,-0.173336,-0.015667,-0.438569,-0.007369,0.186516


In [5]:
def get_stratified_sample(df: pd.DataFrame, feature: str, samples_per_category: int, random_state=None) -> pd.DataFrame:
    """Draw up to ``samples_per_category`` random rows from every category of ``feature``.

    Categories with fewer rows than ``samples_per_category`` are returned in full.
    Rows keep their original index labels; groups appear in sorted key order.

    Args:
        df: Frame to sample from; must contain the column ``feature``.
        feature: Name of the column whose distinct values define the strata.
        samples_per_category: Maximum number of rows to draw per category.
        random_state: Optional seed (int) or NumPy random state for reproducible
            sampling. ``None`` (default) keeps the previous behaviour of using
            NumPy's global random state.

    Returns:
        A DataFrame with the sampled rows and the same columns as ``df``
        (empty if ``df`` has no rows).
    """
    # Turn an integer seed into ONE shared generator so that the groups are not all
    # sampled with an identical random stream.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Iterating the groups (instead of groupby.apply) always keeps the grouping column
    # in the result, independent of the pandas version.
    sampled_groups = [
        group.sample(n=min(len(group), samples_per_category), random_state=rng)
        for _, group in df.groupby(feature)
    ]
    if not sampled_groups:
        # pd.concat() refuses an empty list; return an empty frame with the same columns.
        return df.iloc[0:0]
    return pd.concat(sampled_groups)

# Seed NumPy's global RNG: DataFrame.sample() falls back to it when no random_state is passed,
# so the stratified sample (and every result derived from it) is reproducible on re-run.
np.random.seed(42)

# Attach the embedding columns to the feature rows; a column-wise concat aligns on the row index.
# NOTE(review): assumes df and dfe share the same row index -- confirm.
df_concat = pd.concat([df, dfe], axis=1)
# reset_index() keeps the original row label in an 'index' column; it is used later to identify medoids.
df_sample = get_stratified_sample(df_concat, 'industry_name', 3000).reset_index()

# List amount of samples per category
df_sample['industry_name'].value_counts()

Baugewerbe/Immobilien            3000
öffentl. Verwaltung/ Verbände    3000
Banken/ Finanzinstitute          3000
Versicherungen                   3000
Industrie diverse                3000
Chemie/Pharma                    3000
Informatik/Telekommunikation     3000
Gewerbe/Handwerk allgemein       3000
Dienstleistungen allgemein       3000
Gesundheits-/Sozialwesen         3000
Detail-/Grosshandel              3000
Maschinen-/Anlagenbau            3000
Transport/Logistik               2973
Konsum-/Luxusgüterindustrie      2007
Beratung diverse                 1770
Gastgewerbe/Hotellerie           1545
Medizinaltechnik                 1516
Energie-/Wasserwirtschaft        1384
Bildungswesen                    1230
Rechts-/Wirtschaftsberatung      1166
Medien/Druckerei/Verlage          958
Personalberatung                  901
Tourismus/Reisen/Freizeit         664
Land-/Forstwirtschaft/Holz        249
Name: industry_name, dtype: int64

In [6]:
# (rows, columns) of the stratified sample, before duplicate titles are removed
df_sample.shape

(52363, 800)

In [7]:
# Clustering should see every cleaned title only once: keep the first row per title_clean value
df_sample.drop_duplicates(subset=['title_clean'], keep='first', inplace=True)
# (rows, columns) after de-duplication
df_sample.shape

(32799, 800)

## DBSCAN Clustering

In [8]:
# Public import path; the private module sklearn.neighbors.ball_tree no longer exists in newer scikit-learn releases.
from sklearn.neighbors import BallTree

# For unit-length (L2-normalised) vectors, squared euclidean distance = 2 * cosine distance, so euclidean
# DBSCAN with eps_euclidean = sqrt(2 * eps_cosine) finds the same neighbourhoods as the cosine metric.
# The ball_tree algorithm is supposed to be faster for clustering, and supports euclidean distance, but not cosine distance.
# Turns out that there's no considerable performance gain though, so this code is just for documentation.

#embeddings = df_sample[dfe.columns].values
#row_norms = np.linalg.norm(embeddings, axis=1)  # L2 norm per row (not the row sum)
#embeddings_normed = embeddings / row_norms[:, np.newaxis]
#embeddings_normed.shape
#db = DBSCAN(metric='euclidean', algorithm='ball_tree', eps=np.sqrt(2 * 0.04), min_samples=10, n_jobs=-1).fit(embeddings_normed)

# Show which distance metrics BallTree supports ('cosine' is not among them)
print(BallTree.valid_metrics)

['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity', 'seuclidean', 'mahalanobis', 'wminkowski', 'hamming', 'canberra', 'braycurtis', 'matching', 'jaccard', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath', 'haversine', 'pyfunc']


In [9]:
NOISE_LABEL = -1 # DBSCAN assigns unclustered points to the -1 label (noise points)

# Density-based clustering of the embedding columns using cosine distance
db = DBSCAN(metric='cosine', eps=0.04, min_samples=10).fit(df_sample[dfe.columns])
labels = db.labels_

# One value_counts() pass yields everything needed for the summary below
label_counts = pd.Series(labels).value_counts()
n_points = len(df_sample)
n_noise = int(label_counts.get(NOISE_LABEL, 0))
# Every distinct label except the noise label is a cluster
n_clusters = int((label_counts.index != NOISE_LABEL).sum())
clustered_percentage = (1 - n_noise / n_points) * 100

print('Points clustered: %.2f%%' % clustered_percentage)
print('Total points: %d' % n_points)
print('Noise points: %d' % n_noise)
print('Number of clusters: %d' % n_clusters)
print('Cluster label / number of points:')
print(label_counts)

Points clustered: 1.04%
Total points: 32799
Noise points: 32459
Number of clusters: 15
Cluster label / number of points:
-1     32459
 11       45
 1        45
 3        39
 14       31
 0        29
 5        24
 7        20
 8        19
 6        17
 4        14
 10       13
 2        13
 9        11
 13       10
 12       10
dtype: int64


In [10]:
# Store the DBSCAN label of every row
df_sample['label'] = labels
# Bokeh legends require string labels: use the label's text form, with the noise label spelled out as 'noise'
df_sample['label_str'] = df_sample['label'].map(
    lambda cluster_id: 'noise' if cluster_id == NOISE_LABEL else str(cluster_id)
)

## t-SNE plot

In [11]:
def get_pca(df: pd.DataFrame, n_components: int = 50) -> pd.DataFrame:
    """Project the columns of ``df`` onto their leading principal components.

    Args:
        df: Numeric feature frame (rows = samples, columns = features).
        n_components: Desired number of components. It is capped at
            ``min(n_rows, n_columns)`` because PCA cannot extract more components
            than that; without the cap a small input (e.g. only a few clustered
            points) would raise instead of producing a result.

    Returns:
        A DataFrame with columns ``pca_0 .. pca_{k-1}`` (``k`` = effective number
        of components) and the same index as ``df``.
    """
    # Cap the request at what the data can support
    n_components = min(n_components, *df.shape)
    # Fixed random_state keeps randomized SVD solvers reproducible
    pca = PCA(n_components=n_components, random_state=42)
    pca_columns = ['pca_%i' % i for i in range(n_components)]
    df_pca = pd.DataFrame(pca.fit_transform(df), columns=pca_columns, index=df.index)
    return df_pca

# t-SNE is slow, so only rows that DBSCAN assigned to a cluster (label != noise) are kept
is_clustered = df_sample['label'].ne(NOISE_LABEL)
df_clustered = df_sample.loc[is_clustered].copy()

# Reduce the embedding columns with PCA before t-SNE (recommended practice for more than 50 features)
clustered_embeddings = df_clustered[dfe.columns]
df_pca = get_pca(clustered_embeddings)
df_pca.shape

(340, 50)

In [12]:
# Embed the PCA features with t-SNE; the fixed random_state keeps the layout reproducible
model = TSNE(learning_rate=300, random_state=42)
tsne_features = model.fit_transform(df_pca)
# First output column is used as x coordinate, second as y coordinate
xs, ys = tsne_features[:, 0], tsne_features[:, 1]

In [13]:
# Store the t-SNE coordinates next to each clustered row so the plot can read them
df_clustered = df_clustered.assign(x=xs, y=ys)

In [14]:
def get_cluster_medoids(df: pd.DataFrame, embedding_columns=None, noise_label: int = -1) -> dict:
    """Find the medoid of every cluster in ``df``.

    The medoid of a cluster is the member with the smallest summed cosine
    distance to all other members of that cluster.

    Args:
        df: Frame with a ``'label'`` column (cluster label per row), an ``'index'``
            column (row identifier) and the embedding columns.
        embedding_columns: Columns holding the embedding vector. Defaults to
            ``dfe.columns`` (the notebook-level embedding frame).
        noise_label: Label marking unclustered points; that group is skipped.
            Defaults to -1, the label DBSCAN uses for noise.

    Returns:
        Dict mapping each cluster label to the ``'index'`` value of its medoid row.
    """
    if embedding_columns is None:
        embedding_columns = dfe.columns

    medoids = {}
    # Iterate over the labels actually present in ``df`` (not a global label array),
    # so a filtered frame never produces an empty cluster.
    for label, df_cluster in df.groupby('label'):
        if label == noise_label:
            continue
        # Pairwise cosine distances between all members of this cluster
        distances = 1 - cosine_similarity(df_cluster[embedding_columns])
        # Column sums = total distance from each member to all others
        medoid_position = np.argmin(distances.sum(axis=0))
        medoids[label] = df_cluster.iloc[medoid_position]['index']

    return medoids
  
# Maps each cluster label to the 'index' value of that cluster's medoid row
medoids = get_cluster_medoids(df_clustered)

In [15]:
from bokeh.io import output_file, output_notebook, save
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6
from bokeh.layouts import row, gridplot
import bokeh.palettes

# Render Bokeh output inline in the notebook
output_notebook()

# Split the clustered points into three data sources: regular members, medoids and noise.
medoid_mask = df_clustered['index'].isin(list(medoids.values()))
# df_clustered was built without noise rows, so this mask (and source_noise) is empty here;
# it is kept for the optional noise layer that is commented out below.
noise_mask = df_clustered['label'] == NOISE_LABEL
source_clustered = ColumnDataSource(df_clustered[~noise_mask & ~medoid_mask])
source_medoid = ColumnDataSource(df_clustered[medoid_mask])
source_noise = ColumnDataSource(df_clustered[noise_mask])

hover = HoverTool(tooltips=[('title', '@title'), ('industry', '@industry_name'), ('cluster', '@label_str')])

# width/height replace plot_width/plot_height, which newer Bokeh releases no longer accept
p = figure(tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'], width=1000, height=600, toolbar_location='right')

# One shared colour mapping (cluster label -> palette colour) for both glyph layers.
# Factors are passed as a plain list of strings; Category20[20] provides 20 colours.
cluster_colors = factor_cmap(
    'label_str',
    palette=bokeh.palettes.Category20[20],
    factors=df_clustered['label_str'].unique().tolist(),
)

# Clustered points (scatter() draws circle markers by default; circle(size=...) is deprecated in newer Bokeh)
p.scatter(x='x', y='y',
          color=cluster_colors,
          source=source_clustered, alpha=.7, size=6
)
# Medoids: same colours, but larger and fully opaque
p.scatter(x='x', y='y',
          color=cluster_colors,
          source=source_medoid, alpha=1, size=10, #legend='title',
)
# Noise
#p.circle(x='x', y='y', color='grey', source=source_noise, alpha=.2, legend="label_str",size=5)

#p.legend.location = "top_left"
#p.legend.background_fill_alpha = 0.0
#p.legend.visible = False

#p.legend.orientation = "horizontal"
#p.legend.click_policy = "hide"

show(p)

In [2]:
# Direct Bokeh output to an HTML file and write the figure p to it
output_file("bokeh_jobtitle_clusters.html")
save(p)