<a href="https://colab.research.google.com/github/ericodle/Semantic-Positioning-Model-for-Japanese-Adverbs/blob/main/tohoku_BERT_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install required packages

In [None]:
!pip install transformers
!pip install japanize_matplotlib

Import script dependencies

In [None]:
import csv

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist

Load model, load csv list of adverbs, and embed adverbs.

In [None]:
# Tohoku Japanese BERT tokenizer and encoder.
# Swap "bert-large-japanese" for "bert-base-japanese" as needed.
# NOTE(review): the Tohoku checkpoints appear to be published with a
# MeCab-based Japanese tokenizer; loading them through plain BertTokenizer
# may segment text differently -- confirm this is intended.
tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-large-japanese")
model = BertModel.from_pretrained("cl-tohoku/bert-large-japanese")

# Header-less CSV: column 0 holds the adverbs, column 1 their labels.
csv_file_path = "/content/unique_adverbs_column.csv"
data = pd.read_csv(csv_file_path, header=None, encoding="utf-8")

adverbs = data.iloc[:, 0].tolist()
true_labels = data.iloc[:, 1].tolist()


def embed_cls(text):
    """Return the last-layer hidden state at position 0 ([CLS]) for `text`.

    The result is a NumPy array with a leading batch axis of size 1.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        cls_vector = hidden_states[:, 0, :].numpy()
    return cls_vector


# One embedding per adverb, in CSV order.
embeddings = [embed_cls(adverb) for adverb in adverbs]

# Stack into a single 3-D array and record its dimensions for later cells.
embedding_array = np.array(embeddings)
num_adverbs, num_tokens, embedding_dim = embedding_array.shape

# Collapse the leading axes so each row is one (not yet normalized) embedding.
flattened_embedding_array = embedding_array.reshape(-1, embedding_dim)

Normalize the embedding values

In [None]:
# Min-max scale the flattened embeddings with scikit-learn's MinMaxScaler.
min_max_scaler = MinMaxScaler()
scaled_rows = min_max_scaler.fit_transform(flattened_embedding_array)

# Keep a copy in the original 3-D layout (adverb, token, feature) ...
min_max_normalized_embedding_array = scaled_rows.reshape(
    (num_adverbs, num_tokens, embedding_dim)
)

# ... and a 2-D view (one row per embedding) for PCA and clustering.
flattened_normalized_embedding_array = min_max_normalized_embedding_array.reshape(
    (-1, embedding_dim)
)

Reduce dimensionality by PCA, then cluster by K-means.

In [None]:
# Project the normalized embeddings onto their first three principal components.
# random_state is fixed because PCA's automatic solver selection may pick a
# randomized SVD, which would otherwise make the projection vary between runs.
num_dimensions = 3
pca = PCA(n_components=num_dimensions, random_state=0)
reduced_embedding_array = pca.fit_transform(flattened_normalized_embedding_array)

# Cluster the projected points with K-Means.
# n_init is pinned explicitly: scikit-learn has changed its default, so leaving
# it implicit makes the clustering depend on the installed library version.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(reduced_embedding_array)

# Derive the legend names from num_clusters ('Cluster A', 'Cluster B', ...) so
# that changing the cluster count cannot silently drop clusters from the plot.
cluster_names = [f'Cluster {chr(ord("A") + i)}' for i in range(num_clusters)]

# 3D scatter plot: one colour per cluster, centroids as black crosses.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
for i, cluster_name in enumerate(cluster_names):
    members = reduced_embedding_array[cluster_labels == i]
    ax.scatter(members[:, 0], members[:, 1], members[:, 2], label=cluster_name)

centroids = kmeans.cluster_centers_
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    centroids[:, 2],
    marker='X',
    color='black',
    label='Centroids'
)

# Annotate every point with its adverb and true label.
for (x, y, z), adverb, true_label in zip(reduced_embedding_array, adverbs, true_labels):
    ax.text(x, y, z, f'{adverb} ({true_label})', fontsize=8, ha='center')

ax.set_title('3D Cluster Plot with Centroids')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.legend()

# Save at 600 DPI before showing the figure.
fig.savefig('pre_analysis_3d_scatterplot.png', dpi=600)
plt.show()

# Pairwise distances between the cluster centroids.
centroid_distances = cdist(centroids, centroids)

print("Distance Matrix of Centroids:")
print(centroid_distances)

np.savetxt("pre_dist.csv", centroid_distances, delimiter=",")

Perform Silhouette analysis

In [None]:
# Silhouette analysis over a range of candidate cluster counts.
# Scores are computed on the full normalized embeddings (no PCA projection).
#
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# sweep is capped by the number of embeddings instead of always running to 49;
# with 50 or more embeddings the range is the original 2..49.
n_samples = flattened_normalized_embedding_array.shape[0]
candidate_cluster_counts = list(range(2, min(50, n_samples)))

sil_scores = []
for n_clusters in candidate_cluster_counts:
    # Loop-specific names keep this sweep from overwriting the notebook-level
    # `kmeans` / `cluster_labels` produced by the clustering cells.
    # n_init is pinned so results do not depend on the scikit-learn version.
    sil_kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    sil_labels = sil_kmeans.fit_predict(flattened_normalized_embedding_array)
    sil_score = silhouette_score(flattened_normalized_embedding_array, sil_labels)
    sil_scores.append(sil_score)
    print(f"Number of clusters: {n_clusters}, Silhouette Score: {sil_score}")

plt.figure()
plt.plot(candidate_cluster_counts, sil_scores)
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')

# Save at 600 DPI before showing the figure.
plt.savefig('silhouette_analysis.png', dpi=600)
plt.show()

Re-analyze the adverb embeddings using the optimized number of K-means clusters.

In [None]:
# Project the normalized embeddings onto their first two principal components.
# random_state is fixed because PCA's automatic solver selection may pick a
# randomized SVD, which would otherwise make the projection vary between runs.
num_dimensions = 2
pca = PCA(n_components=num_dimensions, random_state=0)
reduced_embedding_array = pca.fit_transform(flattened_normalized_embedding_array)

# Cluster the projected points with K-Means.
# n_init is pinned explicitly: scikit-learn has changed its default, so leaving
# it implicit makes the clustering depend on the installed library version.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(reduced_embedding_array)

# Lookup tables keyed by adverb. If the same adverb appears more than once in
# the input, the last occurrence wins.
adverb_label_dict = dict(zip(adverbs, true_labels))
adverb_cluster_dict = dict(zip(adverbs, cluster_labels))
adverb_true_label_to_cluster = {
    adverb: {
        "TrueLabel": adverb_label_dict[adverb],
        "ClusterLabel": adverb_cluster_dict[adverb],
    }
    for adverb in adverbs
}

# Write one row per adverb with its true label and assigned cluster.
output_csv_file_path = "post_analysis.csv"
with open(output_csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Adverb', 'TrueLabel', 'ClusterLabel']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for adverb, labels in adverb_true_label_to_cluster.items():
        writer.writerow({
            'Adverb': adverb,
            'TrueLabel': labels['TrueLabel'],
            'ClusterLabel': labels['ClusterLabel'],
        })

print(f"CSV file '{output_csv_file_path}' has been created.")

# Derive the legend names from num_clusters ('Cluster A', 'Cluster B', ...) so
# that changing the cluster count cannot silently drop clusters from the plot.
cluster_names = [f'Cluster {chr(ord("A") + i)}' for i in range(num_clusters)]

# 2D scatter plot: one colour per cluster, centroids as black crosses.
fig, ax = plt.subplots(figsize=(10, 8))
for i, cluster_name in enumerate(cluster_names):
    members = reduced_embedding_array[cluster_labels == i]
    ax.scatter(members[:, 0], members[:, 1], label=cluster_name)

centroids = kmeans.cluster_centers_
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='X',
    color='black',
    label='Centroids'
)

# Annotate every point with its adverb and true label.
for (x, y), adverb, true_label in zip(reduced_embedding_array, adverbs, true_labels):
    ax.annotate(f'{adverb} ({true_label})', (x, y))

ax.set_title('Cluster Plot with Centroids')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()

# Save at 600 DPI before showing the figure.
fig.savefig('post_analysis_scatterplot.png', dpi=600)
plt.show()

# Pairwise distances between the cluster centroids.
centroid_distances = cdist(centroids, centroids)

print("Distance Matrix of Centroids:")
print(centroid_distances)

np.savetxt("post_dist.csv", centroid_distances, delimiter=",")