# Afsøgning af mønstre i breve med clusteranalyse

Clusteranalyse med vektoriserede breve.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dacy
import plotly.express as px
nlp = dacy.load('large')
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the raw letters and sentences tables.
df_letters = pd.read_csv('data/letters.csv')
df_sentences = pd.read_csv('data/sentences.csv')

# Keep only the letters whose sender is Peter Mærsk.
sent_by_peter = df_letters['sender'] == 'Peter Mærsk'
df_letters = df_letters.loc[sent_by_peter]

# Keep only letters from the war period, i.e. Thursday 16 Oct 1913 onward.
# The 'date' column is compared against the string '1913-10-15';
# presumably it holds ISO-formatted date strings -- verify against the CSV.
dated_after_cutoff = df_letters['date'] > '1913-10-15'
df_letters = df_letters.loc[dated_after_cutoff]

# Keep only the sentences whose letter_id is among the remaining letters' ids.
kept_letter_ids = df_letters['id']
df_sentences = df_sentences.loc[df_sentences['letter_id'].isin(kept_letter_ids)]

In [None]:
def tokenize(text):
    """Return the text of every token that is neither punctuation nor whitespace.

    ``text`` is run through the module-level ``nlp`` pipeline; tokens flagged
    ``is_punct`` or ``is_space`` are skipped and the ``.text`` of each
    remaining token is collected, in order, into a list.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_punct or tok.is_space:
            continue
        kept.append(tok.text)
    return kept


# Letter vectors

## Tokenization

Vi opdeler brevene i tokens (ca. ord) og joiner dem igen for at kunne vektorisere.

Desuden trunkerer vi dem til 512 ord pga. begrænsninger i NLP-modellen.



In [None]:
# The caller is expected to pass only the letters it wants tokenized
# (in this notebook: the already-filtered letters from Peter Mærsk).
def tokenize_letters(ldf):
    """Tokenize the letters in ``ldf`` and cache the result as a CSV file.

    The work is done on a copy of ``ldf``: the notebook passes a filtered
    slice of a larger frame, and adding columns to such a slice in place
    triggers pandas' SettingWithCopyWarning and mutates the caller's frame.

    Columns added before writing:
      * ``text_processed``: ``text`` with '<PARA>' replaced by a space,
        cast to str, lower-cased and truncated to the first 512
        whitespace-separated words.
      * ``wordcount``: number of words in ``text_processed`` (so at most 512).
      * ``letter_tokenized``: the list returned by ``tokenize``.
      * ``letter_tokenized_joined``: those tokens joined with single spaces.

    The frame is written to 'data/letters_tokenized.csv' without the index.
    Nothing is returned; callers read the CSV back.
    """
    print('Tokenizing letters... wait 32 min')
    ldf = ldf.copy()

    # '<PARA>' is a literal marker, so match it as plain text, not as a regex.
    normalized = ldf['text'].str.replace('<PARA>', ' ', regex=False).astype(str).str.lower()

    # Truncate to 512 words (limit imposed by the NLP model, per the notebook text).
    words = normalized.str.split().str[:512]
    ldf['text_processed'] = words.str.join(' ')
    ldf['wordcount'] = words.str.len()

    # This is the slow step (about 32 minutes according to the original note).
    ldf['letter_tokenized'] = ldf['text_processed'].apply(tokenize)
    ldf['letter_tokenized_joined'] = ldf['letter_tokenized'].apply(' '.join)
    ldf.to_csv('data/letters_tokenized.csv', index=False)

In [None]:


# Reuse the cached tokenization when the file exists; otherwise build it
# (slow) and then load the freshly written cache.
try:
    df_letters = pd.read_csv('data/letters_tokenized.csv')
except FileNotFoundError:
    tokenize_letters(df_letters)
    df_letters = pd.read_csv('data/letters_tokenized.csv')

# 'letter_tokenized_joined' stores each letter as ONE space-joined string.
# With an identity tokenizer scikit-learn iterates that string character by
# character, which yields character-level TF-IDF. Splitting on whitespace
# restores the word tokens produced by the tokenization step.
# token_pattern=None silences the "token_pattern will not be used" warning.
# A letter with no tokens is written as an empty CSV field and read back as
# NaN, hence fillna('') before vectorizing.
# NOTE: the vocabulary is now word-level, so the elbow plot and the chosen
# number of clusters should be re-checked.
letter_docs = df_letters['letter_tokenized_joined'].fillna('').astype(str)
lettervectorizer = TfidfVectorizer(tokenizer=str.split, lowercase=False, preprocessor=None, token_pattern=None)
lettervectors = lettervectorizer.fit_transform(letter_docs)


## Bruger Elbow-metoden til at finde det optimale antal clusters. Vælg tallet i "albuen".

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit k-means for each candidate k on the letter TF-IDF matrix
# and record the inertia (sum of squared distances to the nearest centroid).
sum_of_squared_distances = []
K = range(1, 10)  # Adjust range as needed

for k in K:
    # Fixed seed and explicit n_init so the curve is reproducible between
    # runs and uses the same initialisation settings as the final KMeans
    # model fitted in this notebook.
    km = KMeans(n_clusters=k, n_init='auto', random_state=42)
    km = km.fit(lettervectors)
    sum_of_squared_distances.append(km.inertia_)

plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k - Letters')
plt.show()


In [None]:
# Number of letter clusters. This is a manual choice: read it off the elbow
# plot and adjust by experiment.
num_clusters = 3

# Fit k-means on the letter TF-IDF matrix; fit() returns the estimator itself.
letterkmeans = KMeans(
    random_state=42,
    max_iter=1000,
    n_init='auto',
    n_clusters=num_clusters,
).fit(lettervectors)

# Cluster label assigned to each row of `lettervectors`.
letterclusters = letterkmeans.labels_

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Inputs: `lettervectors` (letter TF-IDF matrix) and `letterclusters`
# (k-means label per letter).

# Dimensionality reduction to 2 components for visualization.
# random_state pins the projection when PCA picks its randomized SVD solver,
# so the plot is reproducible (same seed as the k-means fits).
# NOTE: .toarray() converts the sparse TF-IDF matrix to a dense array, so
# memory use grows with the vocabulary size.
pca = PCA(n_components=2, random_state=42)
reduced_vectors = pca.fit_transform(lettervectors.toarray())

# Visualization: one point per letter, colored by cluster label.
plt.figure(figsize=(10, 10))
scatter = plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1], c=letterclusters, cmap='viridis', s=10, alpha=0.5)
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.title('Cluster Visualization - Letters')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.show()

In [None]:

# Plot frame: the two PCA coordinates plus the cluster label and letter text.
df_plot = pd.DataFrame(reduced_vectors, columns=['PCA1', 'PCA2'])
df_plot['Cluster'] = letterclusters
# Plotly renders hover text as pseudo-HTML, where a literal '\n' is not shown
# as a line break; '<br>' is. Paragraph markers therefore become '<br>'.
df_plot['Text'] = df_letters['text'].str.replace('<PARA>', '<br>', regex=False).astype(str)

# Interactive scatter; hovering a point shows the letter text.
fig = px.scatter(df_plot, x='PCA1', y='PCA2', color='Cluster', title='Letter Cluster Visualization', hover_data=['Text'])
fig.show()

# Sentence Tokenization, Vectors and Clustering

In [None]:
# Used when the tokenized-sentences cache file does not exist yet.
def tokenize_sentences(df_sentences):
    """Tokenize the sentences in ``df_sentences`` and cache the result as CSV.

    The work is done on a copy: the notebook passes a filtered slice of a
    larger frame, and adding columns to such a slice in place triggers
    pandas' SettingWithCopyWarning and mutates the caller's frame.

    Columns added before writing:
      * ``sentence_processed``: ``sentence`` with '<PARA>' replaced by a
        space, cast to str and lower-cased.
      * ``sentence_tokenized``: the list returned by ``tokenize``.
      * ``sentence_tokenized_joined``: those tokens joined with single spaces.

    The frame is written to 'data/sentences_tokenized.csv' without the index.
    Nothing is returned; callers read the CSV back.
    """
    print('Tokenizing sentences... wait 45 min')
    sdf = df_sentences.copy()

    # '<PARA>' is a literal marker, so match it as plain text, not as a regex.
    sdf['sentence_processed'] = (
        sdf['sentence'].str.replace('<PARA>', ' ', regex=False).astype(str).str.lower()
    )
    sdf['sentence_tokenized'] = sdf['sentence_processed'].apply(tokenize)
    sdf['sentence_tokenized_joined'] = sdf['sentence_tokenized'].apply(' '.join)
    sdf.to_csv('data/sentences_tokenized.csv', index=False)

# Load the tokenized sentences from the cache file; when the file is missing,
# run the (slow) tokenization first and then load what it wrote.
sentences_cache = 'data/sentences_tokenized.csv'
try:
    cached_sentences = pd.read_csv(sentences_cache)
except FileNotFoundError:
    tokenize_sentences(df_sentences)
    cached_sentences = pd.read_csv(sentences_cache)
df_sentences = cached_sentences



In [None]:
# 'sentence_tokenized_joined' stores each sentence as ONE space-joined string.
# With an identity tokenizer scikit-learn iterates that string character by
# character, which yields character-level TF-IDF. Splitting on whitespace
# restores the word tokens produced by the tokenization step.
# token_pattern=None silences the "token_pattern will not be used" warning.
# A sentence with no tokens is written as an empty CSV field and read back as
# NaN, hence fillna('') before vectorizing.
# NOTE: the vocabulary is now word-level and larger; any later .toarray() on
# this matrix needs correspondingly more memory, and the elbow plot and
# number of clusters should be re-checked.
sentence_docs = df_sentences['sentence_tokenized_joined'].fillna('').astype(str)
vectorizer_sents = TfidfVectorizer(tokenizer=str.split, lowercase=False, preprocessor=None, token_pattern=None)
vectors_sents = vectorizer_sents.fit_transform(sentence_docs)


In [None]:
import matplotlib.pyplot as plt

# Elbow method: fit k-means for each candidate k on the sentence TF-IDF
# matrix and record the inertia (sum of squared distances to the nearest
# centroid).
sum_of_squared_distances = []
K = range(1, 10)  # Adjust range as needed

for k in K:
    # Fixed seed and explicit n_init so the curve is reproducible between
    # runs and uses the same initialisation settings as the final KMeans
    # model fitted in this notebook.
    km = KMeans(n_clusters=k, n_init='auto', random_state=42)
    km = km.fit(vectors_sents)
    sum_of_squared_distances.append(km.inertia_)

plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k - Sentences')
plt.show()

In [None]:
# Number of sentence clusters. This is a manual choice: read it off the elbow
# plot and adjust by experiment.
num_clusters = 4

# Fit k-means on the sentence TF-IDF matrix; fit() returns the estimator itself.
kmeans = KMeans(
    random_state=42,
    max_iter=1000,
    n_init='auto',
    n_clusters=num_clusters,
).fit(vectors_sents)

# Cluster label assigned to each row of `vectors_sents`.
sentence_clusters = kmeans.labels_

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Inputs: `vectors_sents` (sentence TF-IDF matrix) and `sentence_clusters`
# (k-means label per sentence).

# Dimensionality reduction to 2 components for visualization.
# random_state pins the projection when PCA picks its randomized SVD solver,
# so the plot is reproducible (same seed as the k-means fits).
# NOTE: .toarray() converts the sparse TF-IDF matrix to a dense array, so
# memory use grows with the number of sentences times the vocabulary size.
pca = PCA(n_components=2, random_state=42)
reduced_sents_vectors = pca.fit_transform(vectors_sents.toarray())


# Visualization: one point per sentence, colored by cluster label.
plt.figure(figsize=(10, 10))
scatter = plt.scatter(reduced_sents_vectors[:, 0], reduced_sents_vectors[:, 1], c=sentence_clusters, s=10, alpha=0.5, cmap='viridis')
plt.legend(*scatter.legend_elements(), title="Sentence Clusters")
plt.title('Sentence Cluster Visualization')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.show()

In [None]:
#pd.set_option('display.max_rows', 1000)
#df[clusters == 2]['sentence_tokenized_joined'].head(10)

In [None]:
import plotly.express as px

# Plot frame: the two PCA coordinates, then the cluster label and the
# original sentence text added as extra columns.
df_plot = pd.DataFrame(reduced_sents_vectors, columns=['PCA1', 'PCA2']).assign(
    Cluster=sentence_clusters,
    Sentence=df_sentences['sentence'],
)

# Interactive scatter; hovering a point shows the sentence.
fig = px.scatter(
    df_plot,
    x='PCA1',
    y='PCA2',
    color='Cluster',
    hover_data=['Sentence'],
    title='Sentence Cluster Visualization (Plotly)',
    width=1000,
    height=1000,
    opacity=0.5,
)
fig.show()


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Viridis4

# Prepare data. The cluster labels are converted to strings so they can be
# used as categorical factors for the legend and the color mapping.
cluster_labels_str = [str(label) for label in sentence_clusters]
source = ColumnDataSource(
    data=dict(x=reduced_sents_vectors[:, 0],
              y=reduced_sents_vectors[:, 1],
              cluster=cluster_labels_str,
              sentence=df_sentences['sentence']))

# NOTE(review): output_notebook is imported but never called -- confirm
# whether inline notebook output was intended before show(p).

# Create figure
p = figure(title="Cluster Visualization with Bokeh", tools="pan,wheel_zoom,box_zoom,reset")

# The leftover assignments to `df_plot` were removed from this cell: the
# Bokeh plot reads only from `source`, and they raised a length-mismatch
# error whenever `df_plot` still held the letters frame.

# One glyph per sentence, colored by cluster.
# NOTE(review): Viridis4 presumably supplies four colors, matching the
# num_clusters = 4 used for the sentence k-means -- revisit if that changes.
p.scatter(x='x', y='y', source=source, legend_field='cluster', fill_alpha=0.4, size=5, color=factor_cmap('cluster', palette=Viridis4, factors=sorted(set(cluster_labels_str))))

# Hover tooltip: cluster label, cursor coordinates and the sentence text.
hover = HoverTool(tooltips=[
    ("Cluster", "@cluster"),
    ("(x, y)", "($x, $y)"),
    ("Sentence", "@sentence")])
p.add_tools(hover)

show(p)