## Read translated data

In [1]:
import pandas as pd

In [None]:
# Load the preprocessed message dictionary (includes the translated messages)
# from parquet into `df`, the base frame for the rest of the notebook.
path = '/data/preprocessed_language_dic.parquet'
df = pd.read_parquet(path)

### Translated high-level commands

In [6]:
# Keep only the rows whose category is 'Tool' or 'Menu' and display them.
df_tool_menu = df[df['cat'].isin(['Tool', 'Menu'])]
df_tool_menu

Unnamed: 0,message,count,id,cat,translated_message,message_language,language_confidence
2,Tool: Reshape (-214),29945982,-214,Tool,Tool: Reshape (-214),en,1.000000
5,Tool: Move by Points (-352),21365110,-352,Tool,Tool: Move by Points (-352),en,1.000000
8,Tool: Line (-201),17893527,-201,Tool,Tool: Line (-201),en,1.000000
10,Menu: Save - (-5) (0),17072882,-50,Menu,Menu: Save - (-5) (0),en,0.966537
14,Menu: Copy - (-28) (0),13848027,-280,Menu,Menu: Copy - (-28) (0),en,0.744751
...,...,...,...,...,...,...,...
21103,Menu: XG Unit from Polygon - (393) (0),11,3930,Menu,Menu: XG Unit from Polygon - (393) (0),en,1.000000
21104,Menu: Window Arranging Items - Baldosa (-520) (1),11,-5201,Menu,Menu: Window Arranging Items - Baldosa (-520) (1),en,0.857517
21106,Menu: WorkspacesDialog - (202) (0),11,2020,Menu,Menu: WorkspacesDialog - (202) (0),en,0.902133
21107,Menu: CExtMenuDatabase - Externe Datenquelle a...,11,474,Menu,Menu: CExtMenuDatabase - Update external data ...,de,1.000000


### If an ID has only one command, use that command's translated message as the label for the ID

In [47]:
# Give every ID that maps to exactly one command that command's translated
# message as its label. IDs with several commands get a missing label (pd.NA)
# so they can be picked out and resolved in the later steps.
group_data = df.groupby('id')
processed_data = []
one_id_one_command = []  # IDs that have exactly one command
for session_id, group_df in group_data:
    if len(group_df) == 1:
        one_id_one_command.append(session_id)
        # assign() returns a new frame, so the groupby slice is never written
        # to in place (the in-place column write raises SettingWithCopyWarning).
        group_df = group_df.assign(label=group_df['translated_message'])
    else:
        group_df = group_df.assign(label=pd.NA)

    processed_data.append(group_df)
df_first = pd.concat(processed_data)

In [None]:
# Split the frame: rows that already carry a label are finished, the rest
# still need one.
mask = pd.isna(df_first['label'])
df_first_result = df_first[~mask]
# Explicit copy: an 'embedding' column is added to df_second afterwards, and
# writing to a boolean-indexed slice raises SettingWithCopyWarning.
df_second = df_first[mask].copy()
print(f"labelled rows: {len(df_first_result)}, rows still unlabelled: {len(df_second)}")

### Embeddings for the remaining commands

In [5]:
import voyageai
from getpass import getpass
from tqdm.auto import tqdm

In [6]:
# Read the API key interactively via getpass instead of hard-coding it.
api_key = getpass()

 ········


In [7]:
# Voyage AI client; get_embedding() sends its requests through this object.
vo = voyageai.Client(api_key)

In [8]:
def get_embedding(texts):
    """Embed ``texts`` with the module-level Voyage client ``vo``.

    The request uses the "voyage-large-2-instruct" model with
    ``input_type="document"``.

    Returns:
        The ``embeddings`` attribute of the response, unchanged.
    """
    response = vo.embed(
        texts,
        model="voyage-large-2-instruct",
        input_type="document",
    )
    return response.embeddings

In [None]:
# Enable the progress_apply() variant of apply() on pandas objects.
tqdm.pandas()
# NOTE: this embedding pass has already been run once and its result stored as
# parquet; the stored frame is loaded further down.
# One get_embedding() call per translated message.
df_second["embedding"] = df_second['translated_message'].progress_apply(get_embedding)

In [None]:
# Replace df_second with the stored frame that already has the 'embedding' column.
# NOTE(review): absolute local path — consider a configurable data directory.
df_second = pd.read_parquet('/Users/embedded.parquet')

In [102]:
# Inspect the embedding column; each entry is a nested list ([[...]]).
print(df_second['embedding'])

477      [[0.024435998871922493, 0.041921861469745636, ...
560      [[0.016089636832475662, 0.0488973930478096, -0...
631      [[0.01773480512201786, 0.023533957079052925, -...
722      [[0.014917957596480846, 0.040976788848638535, ...
742      [[0.014719028025865555, 0.027321459725499153, ...
                               ...                        
1022     [[0.00798321608453989, 0.023588592186570168, -...
3186     [[0.01366106141358614, -0.006066086236387491, ...
10156    [[-0.002803422976285219, -0.008399300277233124...
10733    [[-0.00612988555803895, 0.005952127277851105, ...
16237    [[-0.0038755624555051327, -0.00277991290204226...
Name: embedding, Length: 18895, dtype: object


In [None]:
import numpy as np
# Imported here so this cell also runs on a fresh kernel.
from sklearn.metrics.pairwise import cosine_similarity

# Similarity test: for every ID with at least two commands, summarise the
# pairwise cosine similarities between the embeddings of its commands.
# Group df_second (the frame that carries the 'embedding' column); `df` has no
# such column, so grouping it raises KeyError when the embeddings are read.
grouped_data = df_second.groupby('id')
processed_data = []
one_command_groups = []

for session_id, group_df in tqdm(grouped_data, desc="Processing similarity test"):

    if len(group_df) < 2:
        # Skip groups with fewer than two items, as we cannot compute pairwise similarities
        one_command_groups.append(session_id)
        continue

    # Each cell holds a nested [[...]] embedding. Unwrap the inner vectors
    # before stacking so the result is a numeric 2-D matrix; stacking the raw
    # column values may yield an object array instead, depending on how the
    # nested values are stored.
    matrix = np.vstack([
        np.asarray(vector, dtype=float)
        for nested in group_df['embedding']
        for vector in nested
    ])

    # Calculate cosine similarity matrix
    similarity_matrix = cosine_similarity(matrix)

    # Extract upper triangle indices, ignoring the diagonal, so every
    # unordered pair is counted exactly once
    upper_triangle_indices = np.triu_indices_from(similarity_matrix, k=1)
    pairwise_similarities = similarity_matrix[upper_triangle_indices]

    # Calculate statistics
    length = len(group_df)
    mean_similarity = np.mean(pairwise_similarities)
    median_similarity = np.median(pairwise_similarities)
    std_dev_similarity = np.std(pairwise_similarities)

    # One summary record per ID
    processed_data.append({
        'id': session_id,
        'length': length,
        'mean_similarity': mean_similarity,
        'median_similarity': median_similarity,
        'std_dev_similarity': std_dev_similarity
    })

# Convert processed data into a DataFrame for easier analysis and reporting
results_df = pd.DataFrame(processed_data)
print(results_df)

In [None]:
# Persist the per-ID similarity statistics as CSV.
# NOTE(review): absolute local path — consider a configurable output directory.
results_df.to_csv('/Users/similarity_test.csv')

### Next step: cluster the commands of each ID by their embeddings, then label every cluster with the command closest to its center

In [438]:
def to_matrix(df):
    """Build a 2-D numpy array from the 'embedding' column of ``df``.

    Every cell of the column is a nested sequence of vectors. The inner
    vectors of all rows are collected in row order, and the result is reshaped
    so that it has one row per row of ``df``.
    """
    column = df['embedding']

    # Unwrap one nesting level: gather the inner vectors of every cell.
    vectors = []
    for nested in column:
        vectors.extend(nested)

    return np.array(vectors).reshape(len(column), -1)

In [457]:
def optimize_dbscan(matrix):
    """Search for the DBSCAN ``eps`` with the best silhouette score on ``matrix``.

    ``min_samples`` is fixed to 1 and ``eps`` is searched in [0.05, 0.5] with
    ``gp_minimize`` (100 calls, 10 random starts, ``random_state=0``).

    Returns:
        dict with the keys 'eps' (best value found) and 'min_samples' (always 1).
    """
    fixed_min_samples = 1

    def objective(params):
        """Return the value to minimise for one candidate ``[eps]``."""
        labels = DBSCAN(eps=params[0], min_samples=fixed_min_samples).fit(matrix).labels_

        # Number of clusters, not counting the noise label (-1) if present.
        noise_adjustment = 1 if -1 in labels else 0
        num_labels = len(set(labels)) - noise_adjustment

        # A single cluster (or none), or one cluster per sample, is useless:
        # return 1, the worst value the negated silhouette score can take.
        degenerate = num_labels <= 1 or num_labels == len(matrix)
        if degenerate:
            return 1

        # gp_minimize minimises, so negate the silhouette score.
        return -silhouette_score(matrix, labels)

    res = gp_minimize(
        objective,
        [Real(0.05, 0.5, name='eps')],
        n_calls=100,
        random_state=0,
        n_random_starts=10,
    )

    return {'eps': res.x[0], 'min_samples': 1}

In [458]:
def calculate_similarity(matrix):
    """Return the median pairwise cosine similarity between the rows of ``matrix``.

    Only the strict upper triangle of the similarity matrix is used, so each
    unordered pair is counted once and self-similarities are ignored.
    """
    sims = cosine_similarity(matrix)
    rows, cols = np.triu_indices_from(sims, k=1)
    return np.median(sims[rows, cols])



In [459]:
def find_closest_to_center(matrix):
    """Return the index of the row of ``matrix`` nearest to the mean of all rows.

    Distance is the Euclidean norm; on ties the lowest index is returned.
    """
    centroid = np.mean(matrix, axis=0)
    offsets = matrix - centroid
    return np.argmin(np.linalg.norm(offsets, axis=1))

In [460]:
def set_label(df):
    """Write a single label into the 'label' column of ``df``, in place.

    If exactly one row has ``message_language == 'en'``, that row's 'message'
    becomes the label. Otherwise the label is the 'translated_message' of the
    row whose embedding lies closest to the mean embedding of ``df``.

    Returns nothing; ``df`` itself is modified.
    """
    is_english = df['message_language'] == 'en'

    if is_english.sum() != 1:
        # Zero or several English entries: use the most central command.
        central_row = find_closest_to_center(to_matrix(df))
        df['label'] = df.iloc[central_row]['translated_message']
        return

    # Exactly one English entry: its original message is used as is.
    df['label'] = df[is_english]['message'].iloc[0]

In [461]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.metrics import silhouette_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.metrics.pairwise import cosine_similarity
from skopt import gp_minimize

from itertools import product


import numpy as np

In [462]:
def cluster_dbscan(matrix, df):
    """Cluster the rows of ``matrix`` with DBSCAN and store the labels on ``df``.

    The DBSCAN parameters come from ``optimize_dbscan(matrix)``. The resulting
    labels are written in place to ``df['cluster_label']``; nothing is returned.
    """
    params = optimize_dbscan(matrix)
    model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    df['cluster_label'] = model.fit(matrix).labels_

In [463]:
# Alias, not a copy: df_second_results and df_second refer to the same DataFrame.
df_second_results = df_second

In [464]:
# Preview the first rows of the frame that still needs labels.
df_second_results.head()

Unnamed: 0,message,count,id,cat,translated_message,message_language,language_confidence,label,embedding
477,End Event: Export DXF/DWG... (-1),154786,-1,End Event,End Event: Export DXF/DWG... (-1),en,1.0,,"[[0.024435998871922493, 0.041921861469745636, ..."
560,End Event: Import DXF/DWG/DWF (-1),111143,-1,End Event,End Event: Import DXF/DWG/DWF (-1),en,0.715005,,"[[0.016089636832475662, 0.0488973930478096, -0..."
631,End Event: Temp Define Custom Object (-1),84684,-1,End Event,End Event: Temp Define Custom Object (-1),en,1.0,,"[[0.01773480512201786, 0.023533957079052925, -..."
722,End Event: Create Objects from Shapes... (-1),66378,-1,End Event,End Event: Create Objects from Shapes... (-1),en,1.0,,"[[0.014917957596480846, 0.040976788848638535, ..."
742,End Event: Label Legend Manager... (-1),62979,-1,End Event,End Event: Label Legend Manager... (-1),en,0.967216,,"[[0.014719028025865555, 0.027321459725499153, ..."


In [None]:
# Median pairwise cosine similarity above which all commands of an ID are
# treated as the same command and share one label.
SIMILARITY_THRESHOLD = 0.82

processed_data = []
group_data = df_second_results.groupby('id')
for session_id, group_df in tqdm(group_data, desc='Processing set label'):
    # set_label() and cluster_dbscan() add columns in place. Work on an
    # explicit copy so no groupby slice is mutated (SettingWithCopyWarning).
    group_df = group_df.copy()
    matrix = to_matrix(group_df)
    if calculate_similarity(matrix) > SIMILARITY_THRESHOLD:
        # Homogeneous group: one label for the whole ID.
        set_label(group_df)
    else:
        # Heterogeneous group: cluster the commands, then label each cluster.
        cluster_dbscan(matrix, group_df)
        group_sub_data = group_df.groupby('cluster_label')

        processed_sub_data = []
        for _, group_sub_df in group_sub_data:
            group_sub_df = group_sub_df.copy()
            set_label(group_sub_df)
            processed_sub_data.append(group_sub_df)

        group_df = pd.concat(processed_sub_data)  # Recombine processed subgroup dataframes

    processed_data.append(group_df)

df_final = pd.concat(processed_data)

In [None]:
# Write the labelled frame, without the embedding column, to a parquet file.
df_final_test = df_final.drop(columns = 'embedding')

path = '/data/test.parquet'
df_final_test.to_parquet(path)

### Merge df_final with df_first_result

In [None]:
# Align df_first_result to df_final's column layout before stacking the two;
# reindex adds any column df_first_result lacks, filled with missing values.
df_first_result = df_first_result.reindex(columns=df_final.columns)
# ignore_index=True discards the original row indices and renumbers from 0.
combined_df = pd.concat([df_first_result, df_final], ignore_index=True)

### Save the **command dictionary**

In [None]:
# Drop the embedding column before exporting. errors='ignore' keeps this cell
# re-runnable: combined_df is reassigned here, so on a second run 'embedding'
# is already gone and a plain drop would raise KeyError.
combined_df = combined_df.drop(columns='embedding', errors='ignore')
path = '/data/command_dictionary.csv'
combined_df.to_csv(path)