In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.cm as cm
from sklearn import  preprocessing, metrics
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

### Reading the Text Documents

In [None]:
# Load the tab-separated Reddit politics sample.
# The path uses a forward slash on purpose: a backslash followed by "r" inside a
# normal string literal is the carriage-return escape and would corrupt the path.
df_sample = pd.read_csv("inputs/reddit_politics_subredditC_multiC.tsv", sep="\t")
df_sample.head()

In [None]:
# Keep only usable comment texts: infinities and empty strings are turned into
# NaN and then dropped.
# Non-in-place calls are deliberate: `docs` becomes a new Series, so df_sample is
# never mutated through it and re-running this cell always gives the same result.
# The surviving index labels still identify the originating rows of df_sample.
docs = df_sample["text"].replace([np.inf, -np.inf, ''], np.nan).dropna()
docs.shape

### Encoding Texts Using SBERT

In [None]:
# Load the pretrained sentence encoder and embed every cleaned document.
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

sentences = docs.to_list()
X = np.array(model.encode(sentences))
# Quick sanity check on the embedding values, then show the matrix dimensions.
print("mean: ", X.mean())
X.shape

### Dimensionality Reduction & Clustering

In [None]:
# Standardise each embedding dimension (zero mean, unit variance) before the
# dimensionality-reduction steps.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# The overall mean should be ~0 after standardisation.
X_scaled.mean()

In [None]:
# 2-D t-SNE projection of the standardised embeddings (used for plotting only).
# t-SNE with random initialisation is stochastic, so the seed is fixed to keep
# the figure reproducible across "Restart & Run All"; 10 matches the seed used
# for KMeans in this notebook.
X_reduced_tsne = TSNE(n_components=2, init='random', random_state=10).fit_transform(X_scaled)
tsne_comps = pd.DataFrame(X_reduced_tsne, columns=["C1", "C2"])
X_reduced_tsne.shape

In [None]:
# Project the standardised embeddings onto the first 10 principal components.
pca = PCA(n_components=10, svd_solver='full')
X_reduced = pca.fit_transform(X_scaled)

# Report the fitted components and how much variance each one explains.
for title, values in (
    ('pca.components: \n', pca.components_),
    ('pca.explained variance:\n', pca.explained_variance_),
    ('pca.explained variance ratio:\n', pca.explained_variance_ratio_),
):
    print(title, values)

# Cumulative explained variance in percent (ratios rounded to 4 decimals first).
variance_pct = np.round(pca.explained_variance_ratio_, decimals=4) * 100
print(np.cumsum(variance_pct))

In [None]:
# First two principal components as a small frame for 2-D scatter plots.
pca_comps = pd.DataFrame({"C1": X_reduced[:, 0], "C2": X_reduced[:, 1]})
X_reduced.shape

## K-Means Clustering

In [None]:
# Sweep k = 2..14 and record, for every k, the within-cluster SSE (elbow
# criterion) and the mean silhouette score on the PCA-reduced embeddings.
range_n_clusters = np.arange(2, 15)
elbow, ss = [], []
for k in range_n_clusters:
    # Fit K-Means with a fixed seed and read back the assignments.
    clusterer = KMeans(n_clusters=k, random_state=10).fit(X_reduced)
    cluster_labels = clusterer.labels_

    # Mean silhouette coefficient over all samples.
    silhouette_avg = silhouette_score(X_reduced, cluster_labels)
    print("For n_clusters =", k, "The average silhouette_score is :", silhouette_avg)
    ss.append(silhouette_avg)

    # Inertia: sum of squared distances of samples to their closest centre.
    elbow.append(clusterer.inertia_)

In [None]:
# Elbow (SSE) and silhouette curves for the K-Means sweep, one panel each.
fig = plt.figure(figsize=(18,10))

# Top-left panel: within-cluster sum of squared errors per k.
fig.add_subplot(221)
plt.plot(range_n_clusters, elbow,'b-',label='Sum of squared error')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.title("WSS Graph")
plt.legend()

# Bottom-left panel: average silhouette score per k.
fig.add_subplot(223)
plt.plot(range_n_clusters, ss,'b-',label='Silhouette Score')
plt.xlabel("Number of cluster")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Graph")
plt.legend()

# Forward slash on purpose: a backslash followed by "t" inside a normal string
# literal is the tab escape and would corrupt the file name.
plt.savefig("figures/trirony_kmeans_multic_wss.pdf", bbox_inches='tight')

## DBSCAN

In [None]:
# Grid search over DBSCAN's eps / min_samples on the PCA-reduced embeddings,
# using cosine distance. A configuration is reported when its clustering agrees
# with the reference labels better than chance (adjusted Rand index > 0).
epsilons = np.arange(0.1, 2, 0.1)
min_samples = np.arange(5, 15, 1)
# NOTE: this rebinds `ss`, which the K-Means diagnostics plot also reads; run
# that plot before this cell.
ss = []

# Reference labels for exactly the rows that were embedded. `docs` may contain
# fewer rows than df_sample (missing texts are dropped), so the labels are
# selected through docs.index to keep both label vectors the same length.
# Loop-invariant, hence computed once up front.
true_labels = df_sample.loc[docs.index, "label"]

for i in min_samples:
    for j in epsilons:
        db = DBSCAN(eps=j, min_samples=i, metric='cosine').fit(X_reduced)
        labels = db.labels_
        # DBSCAN marks noise with -1; it is not counted as a cluster.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        # The silhouette score needs several distinct labels to be defined.
        if len(set(labels)) > 2:
            silhouette_avg = silhouette_score(X_reduced, labels, metric="cosine")
            ss.append(silhouette_avg)

            adj_rand = metrics.adjusted_rand_score(true_labels, labels)
            if adj_rand > 0:
                print("params: ", (i,j))
                if silhouette_avg > 0:
                    print("Silhouette: ", silhouette_avg)
                    print("Estimated number of clusters: %d" % n_clusters_)
                    print("Adjusted Rand Index: %0.3f" % adj_rand)


## Visualizing the Clusters

In [None]:
plt.figure(figsize=(12,8))
# Final K-Means model with the number of clusters chosen from the sweep.
n_clusters = 3

# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X_reduced)

# Scatter the documents on the 2-D t-SNE projection, coloured by cluster.
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
plt.scatter(
    tsne_comps.C1, tsne_comps.C2, marker=".", s=50, lw=0, alpha=0.7, c=colors, edgecolor="k"
)

# The centres are expressed in the PCA space the model was fitted on, not in
# t-SNE coordinates, so they are kept for inspection but not drawn on this plot.
centers = clusterer.cluster_centers_

plt.title("The visualization of the K-Means clustered data.")
plt.xlabel("Feature space for the 1st feature")
plt.ylabel("Feature space for the 2nd feature")
# Saved next to the other figures via a relative path so the notebook does not
# depend on one user's local directory layout.
plt.savefig("figures/reddit_kmeans_multic_clusters_tsne.pdf", bbox_inches='tight')

In [None]:
plt.figure(figsize=(12,8))
# DBSCAN with a fixed parameter pair, plotted on the first two PCA components.
# NOTE(review): the model is fitted on the raw embeddings `X`, whereas the grid
# search above ran on `X_reduced` - confirm which input is intended.
db = DBSCAN(eps=0.5, min_samples=10, metric="cosine").fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Noise points carry the label -1 and are not counted as a cluster.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

# One colour per label; sorting keeps the label -> colour mapping stable.
unique_labels = sorted(set(labels))
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k
    # Wrapped in a list so matplotlib reads it as one RGBA colour for every
    # point rather than as a per-point value array.
    cluster_color = [tuple(col)]

    # Core samples: large markers in the cluster colour.
    xy = pca_comps[class_member_mask & core_samples_mask]
    plt.scatter(
        xy.C1,
        xy.C2,
        marker=".",
        color=cluster_color,
        s=150,
    )

    # Border / noise samples: same colour, smaller markers.
    xy = pca_comps[class_member_mask & ~core_samples_mask]
    plt.scatter(
        xy.C1,
        xy.C2,
        marker=".",
        color=cluster_color,
        s=40,
    )

plt.title("The visualization of the DBScan clustered data.")
plt.xlabel("Feature space for the 1st feature")
plt.ylabel("Feature space for the 2nd feature")


plt.show()

### Transform to CausalText Input Format

In [None]:
# Replace the old confounder column "C" with the K-Means cluster assignment.
# cluster_labels holds one entry per row of `docs`, which can be shorter than
# df_sample when rows with missing text were dropped; selecting through
# docs.index keeps every label attached to the document it was computed for.
df_merged = df_sample.loc[docs.index].drop(columns=['C', 'Unnamed: 0'])
df3 = pd.concat([df_merged.reset_index(drop=True), pd.DataFrame(cluster_labels, columns=["C"])], axis=1)
df3.head()

In [None]:
# Persist the re-labelled frame as a tab-separated file for the CausalText step.
output_path = "inputs/reddit_politics_kmeansC_multiC_v3.tsv"
df3.to_csv(output_path, sep="\t")