In [None]:
import os
import pandas as pd
from tqdm import tqdm
from google.cloud import translate_v3 as translate
from collections import Counter
from google.api_core.exceptions import GoogleAPICallError, RetryError

### Read the **Actual Modeling Logs** and count how often each unique command occurs

In [None]:
# Load the raw modeling logs; the 'message' column holds one logged command per row.
actual_modeling_logs_df = pd.read_parquet('data/actual_modeling_Logs.parquet')
# to_list must be *called*: without the parentheses this name was bound to the
# method object itself rather than to the list of messages.
message_list = actual_modeling_logs_df['message'].to_list()
# Tally how many times each distinct message occurs.
counts = Counter(message_list)
# One row per distinct message with its count, most frequent first.
combined_message_count = (
    pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
    .reset_index()
    .rename(columns={'index': 'message'})
    .sort_values(by='count', ascending=False)
)

### Drop commands that occur 10 times or fewer

In [None]:
# Show the table of unique commands together with the number of times each one occurs
print(combined_message_count)

                                      count
message                                    
End Event: Nudge (5)               67182026
End Event: Drag (75)               43121714
Tool: Reshape (-214)               29945982
End Event: Delete (58)             29943412
End Event: Shape Pane Edit (89)    21692112
...                                     ...
Menu: ExportArtlantis -  (69) (0)         1
Menu: Document Windows - 5EC6225E         1
Menu: Document Windows - 5EC8F21E         1
Menu: ExportArtlantis -  (49) (0)         1
Menu: Document Windows - 5691523F         1

[67768 rows x 1 columns]


In [None]:
# Keep only the commands that occur more than 10 times (counts of 10 or fewer are dropped).
# .copy() makes the result an independent frame, so columns added to it later are
# not written into a slice of combined_message_count (avoids SettingWithCopyWarning).
df_unique_commands = combined_message_count[combined_message_count['count'] > 10].copy()
print(df_unique_commands)

                                                 message     count
0                                   End Event: Nudge (5)  67182026
1                                   End Event: Drag (75)  43121714
2                                   Tool: Reshape (-214)  29945982
3                                 End Event: Delete (58)  29943412
4                        End Event: Shape Pane Edit (89)  21692112
...                                                  ...       ...
21107  Menu: CExtMenuDatabase - Externe Datenquelle a...        11
21108  Menu: XG GroupWithConnectedFittings Menu -  (3...        11
21109                  Menu: Document Windows - 98EE76C9        11
21110  End Event: Material „Stahl 11, gewellt“ wird i...        11
21111                  End Event: Rename Symbol Def (-1)        11

[21112 rows x 2 columns]


### Extract the commands' IDs and categories

In [31]:
import re
def extract_loc_id(df):
    """Add an 'id' column holding the numeric ids embedded in each message.

    Every integer written in parentheses (optionally negative) is collected from
    the 'message' text and the matches are joined with commas, e.g.
    'End Event: Nudge (5)' -> '5', 'Tool: Reshape (-214)' -> '-214',
    'Menu: WorkspacesDialog -  (202) (0)' -> '202,0'. Messages without such an
    id (or non-string messages such as NaN) get the empty string ''.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'message' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, now carrying the string column 'id'.
    """
    # Matches "(5)", "(-214)", ...; numbers outside parentheses are ignored.
    id_pattern = re.compile(r'\((-?\d+)\)')
    df['id'] = [
        ','.join(id_pattern.findall(message)) if isinstance(message, str) else ''
        for message in df['message']
    ]
    return df
def extract_cat(df):
    """Add a 'cat' column with the text that precedes the first colon of each message.

    The frame is modified in place and returned; messages without a colon get a
    missing value.
    """
    # Non-greedy capture from the start of the string up to the first ':'.
    category = df['message'].str.extract(r'^(.*?):', expand=False)
    df['cat'] = category
    return df

In [32]:
# Derive the 'id' column first, then the 'cat' column, and show the result.
df_unique_commands = extract_cat(extract_loc_id(df_unique_commands))
print(df_unique_commands)

                                                 message     count     id  \
0                                   End Event: Nudge (5)  67182026      5   
1                                   End Event: Drag (75)  43121714     75   
2                                   Tool: Reshape (-214)  29945982   -214   
3                                 End Event: Delete (58)  29943412     58   
4                        End Event: Shape Pane Edit (89)  21692112     89   
...                                                  ...       ...    ...   
21107  Menu: CExtMenuDatabase - Externe Datenquelle a...        11   47,4   
21108  Menu: XG GroupWithConnectedFittings Menu -  (3...        11  345,0   
21109                  Menu: Document Windows - 98EE76C9        11          
21110  End Event: Material „Stahl 11, gewellt“ wird i...        11    166   
21111                  End Event: Rename Symbol Def (-1)        11     -1   

             cat  
0      End Event  
1      End Event  
2           Tool  

### Remove the commands with an empty id `''` (meaningless commands)

In [41]:
# Rows whose 'id' is the empty string are treated as meaningless and removed.
mask = df_unique_commands['id'] == ''
# .copy() detaches the result from df_unique_commands, so the translation columns
# added to df_commands later are not written into a slice (SettingWithCopyWarning).
df_commands = df_unique_commands[~mask].copy()
print(df_commands)

                                                 message     count     id  \
0                                   End Event: Nudge (5)  67182026      5   
1                                   End Event: Drag (75)  43121714     75   
2                                   Tool: Reshape (-214)  29945982   -214   
3                                 End Event: Delete (58)  29943412     58   
4                        End Event: Shape Pane Edit (89)  21692112     89   
...                                                  ...       ...    ...   
21106                Menu: WorkspacesDialog -  (202) (0)        11  202,0   
21107  Menu: CExtMenuDatabase - Externe Datenquelle a...        11   47,4   
21108  Menu: XG GroupWithConnectedFittings Menu -  (3...        11  345,0   
21110  End Event: Material „Stahl 11, gewellt“ wird i...        11    166   
21111                  End Event: Rename Symbol Def (-1)        11     -1   

             cat  
0      End Event  
1      End Event  
2           Tool  

### Translate commands to English and detect their language using the Google Cloud Translation API

In [34]:
def translate_detect_text(commands, target_language='en', project_id='project_id',
                          location='global', client=None):
    """Translate each command and detect its source language.

    For every command two requests are sent to the Cloud Translation service:
    one to translate it into ``target_language`` and one to detect its language.

    Parameters
    ----------
    commands : iterable of str
        The command texts to process.
    target_language : str, default 'en'
        Language code the commands are translated into.
    project_id : str, default 'project_id'
        Google Cloud project used to build the request parent path.
    location : str, default 'global'
        Location used to build the request parent path.
    client : optional
        Object exposing ``translate_text`` and ``detect_language``; when omitted
        a ``translate.TranslationServiceClient()`` is created.

    Returns
    -------
    tuple (translated_commands, language_results)
        ``translated_commands`` is a list with one translated string per command
        ('Translation Failed' when the API call failed). ``language_results`` is
        a list with one ``(language_code, confidence)`` pair per command;
        ``('undetermined', 0.0)`` is used when nothing was detected or the API
        call failed. Both lists have the same length and order as ``commands``.
    """
    if client is None:
        client = translate.TranslationServiceClient()
    parent = f"projects/{project_id}/locations/{location}"

    translated_commands = []
    language_results = []

    for command in tqdm(commands, desc="Translating and detecting"):
        try:
            # Request the translation.
            response_translation = client.translate_text(
                contents=[command],
                target_language_code=target_language,
                parent=parent
            )

            # Request the language detection.
            response_detection = client.detect_language(
                content=command,
                parent=parent
            )
        except (GoogleAPICallError, RetryError) as e:
            print(f"An error occurred during translation and detection: {e}")
            translated_commands.append('Translation Failed')  # Placeholder for failed translations
            # Same (language, confidence) shape as the success path, so the results
            # can always be loaded into a two-column DataFrame.
            language_results.append(('undetermined', 0.0))
            continue

        # Both requests succeeded: record exactly one entry in each list so the
        # two outputs stay aligned with the input.
        translated_commands.append(response_translation.translations[0].translated_text)

        # The first detection is taken as the best guess.
        detections = response_detection.languages
        if detections:
            language_results.append((detections[0].language_code, detections[0].confidence))
        else:
            language_results.append(('undetermined', 0.0))

    return translated_commands, language_results

In [35]:
# Tell the Google Cloud client libraries where the service-account key file is.
credentials_path = 'Google Cloud service accound key file.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

In [36]:
# Plain Python list of the command texts that will be sent for translation.
commands_list = df_commands['message'].to_list()

In [9]:
# Translate every command and detect its language. This calls the remote API for
# each command; the recorded run below took about 54 minutes for 20047 commands.
translated_commands, language_detection_results = translate_detect_text(commands_list)

Translating and detecting: 100%|██████████| 20047/20047 [54:00<00:00,  6.19it/s]


In [None]:
# Unpack the per-command detection results into a two-column frame.
language_df = pd.DataFrame(
    language_detection_results,
    columns=['message_language', 'language_confidence'],
)
message_language, language_confidence = (
    language_df[column].to_list()
    for column in ('message_language', 'language_confidence')
)

# Attach the results to the commands; the lists are in the same order as the rows.
df_commands['translated_message'] = translated_commands
df_commands['message_language'] = message_language
df_commands['language_confidence'] = language_confidence

### Save the translated commands

In [None]:
# Save into the relative 'data/' directory that the input logs were read from,
# instead of a '/data' directory at the filesystem root.
path = 'data/preprocessed_language_dic.parquet'
df_commands.to_parquet(path)