In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Math
import seaborn as sns
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
from memoization import cached
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names. Try the current name first
# and fall back to the legacy one so the cell runs on old and new installs.
try:
    plt.style.use("seaborn-v0_8-darkgrid")
except OSError:
    plt.style.use("seaborn-darkgrid")

# Training split of the 20 Newsgroups corpus.
train_bunch = fetch_20newsgroups(subset="train")

# Text -> token counts -> TF-IDF weights -> 10-component truncated SVD.
# NOTE: the name `steps` is rebound to an integer in a later cell.
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=10, random_state=0))
]

# The pipeline is fitted on the whole training split; afterwards only every
# 5th row (and its matching label) is kept for the clustering experiments.
X_news = Pipeline(steps).fit_transform(train_bunch['data'])[::5]
y_news = train_bunch['target'][::5]

In [None]:
@cached
def get_rand_score(distance_threshold, X, y, eps=1e-3):
    """Score a single-linkage clustering of ``X`` against the labels ``y``.

    Parameters
    ----------
    distance_threshold : float
        Linkage distance at which the agglomerative merge tree is cut
        (``n_clusters`` is left as ``None``).
    X : array-like
        Feature matrix handed to ``AgglomerativeClustering.fit_predict``.
    y : array-like
        Reference labels the predicted clustering is compared with.
    eps : float, optional
        Constant added to the adjusted Rand score before returning it.

    Returns
    -------
    float
        ``eps + adjusted_rand_score(predicted_labels, y)``.

    Notes
    -----
    The function is wrapped in ``cached``, so the key is built from all
    arguments, including ``X`` and ``y``.
    NOTE(review): the adjusted Rand score can be negative, so the returned
    value is not guaranteed to be positive -- confirm this is acceptable for
    callers that treat it as an unnormalised density.
    """
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        linkage='single',
        distance_threshold=distance_threshold,
    )
    cluster_labels = clusterer.fit_predict(X)
    agreement = adjusted_rand_score(cluster_labels, y)
    return eps + agreement


# Bounds of the distance-threshold interval explored in this notebook.
left, right = 0, 2

# Regular grid over [left, right) with spacing 0.05, and the score of the
# single-linkage clustering obtained at each grid point.
distance_thresholds = np.arange(left, right, 0.05)
scores = [
    get_rand_score(threshold, X_news, y_news)
    for threshold in distance_thresholds
]

In [None]:
def mcmc(steps, left, right, burn_in=0, rng=None):
    """Sample distance thresholds with an accept/reject random walk.

    Every iteration records the current threshold, proposes a new one drawn
    uniformly from ``[left, right)`` and moves to it with probability
    ``min(score(proposal) / score(current), 1)``, where ``score`` is
    ``get_rand_score`` evaluated on ``X_news`` / ``y_news``.

    Parameters
    ----------
    steps : int
        Number of iterations, i.e. the length of the full chain.
    left, right : float
        Bounds of the uniform distribution used for the initial state and
        for every proposal.
    burn_in : int, optional
        Number of leading states to drop from the returned chain. Defaults
        to 0 (return the full chain) so the caller can discard burn-in
        exactly once. Previously the function read a global ``burn_in``
        that was defined only after the function itself.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When ``None`` the global ``np.random`` state
        is used, as before.

    Returns
    -------
    list of float
        The visited states with the first ``burn_in`` entries removed.

    Raises
    ------
    ValueError
        If ``burn_in`` is negative.

    Notes
    -----
    The acceptance ratio assumes ``get_rand_score`` returns positive values.
    NOTE(review): it returns ``eps`` plus an adjusted Rand score, which can
    be negative -- confirm scores stay positive on this data.
    """
    if burn_in < 0:
        raise ValueError("burn_in must be non-negative")

    draw = np.random.uniform if rng is None else rng.uniform

    states = []
    current = draw(left, right)
    # Carry the current state's score along instead of re-evaluating it on
    # every iteration; it only changes when a proposal is accepted.
    curr_prob = get_rand_score(distance_threshold=current, X=X_news, y=y_news)
    for _ in tqdm(range(steps)):
        states.append(current)
        movement = draw(left, right)
        move_prob = get_rand_score(distance_threshold=movement, X=X_news, y=y_news)

        # Flip a coin to determine whether or not to move
        if draw(0, 1) < min(move_prob / curr_prob, 1):
            current, curr_prob = movement, move_prob
    return states[burn_in:]

# Seed NumPy's global generator so the chain (and the histogram built from
# it) is identical after Restart Kernel -> Run All.
np.random.seed(0)

steps = 1000                  # number of sampler iterations
burn_in = int(steps * 0.2)    # 20% of the iterations are treated as burn-in
raw_samples = mcmc(steps, left=left, right=right)
# Drop the first `burn_in` entries of what the sampler returned.
samples = raw_samples[burn_in:]

In [None]:
%matplotlib inline

# One figure, two side-by-side panels: sampled thresholds on the left and
# the score curve over the threshold grid on the right.
fig, (hist_ax, curve_ax) = plt.subplots(1, 2, figsize=(25, 10))
mu = chr(956)  # Greek small letter mu
label_size = 20

# Left panel: density-normalised histogram of the retained samples.
hist_ax.hist(samples, density=True)
hist_ax.set_title("Samples from {} Fit Over 20 Newsgroups Dataset".format(mu), fontsize=label_size)
hist_ax.set_xlabel("Distance Threshold Used By Single Linkage", fontsize=label_size)
hist_ax.set_ylabel("Normalized Proportion of Samples From {}".format(mu), fontsize=label_size)

# Right panel: score obtained at every threshold of the grid.
curve_ax.plot(distance_thresholds, scores)
curve_ax.set_title("Distance Threshold and Adjusted Rand Score Over 20 Newsgroups Dataset", fontsize=label_size)
curve_ax.set_xlabel("Distance Threshold Used By Single Linkage", fontsize=label_size)
curve_ax.set_ylabel("Adjusted Rand Score Over 20 Newsgroups Dataset", fontsize=label_size)