# Masked Language Model task: predicting the missing part in an assembly

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tqdm import tqdm
from collections import Counter
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertModel, DataCollatorForLanguageModeling
import re

# Download NLTK's 'averaged_perceptron_tagger' resource.
# NOTE(review): none of the cells visible here use it — presumably it is
# needed for POS tagging further down; confirm or remove.
nltk.download('averaged_perceptron_tagger')

### Pre-process data for task

In [None]:
# Feather file holding the dataset; the path is relative to the working directory.
data_path = "../../data/data_02.feather"
# Load the whole file into a DataFrame.
data = pd.read_feather(data_path)

In [None]:
# Keep only the rows that have both an assembly name and a part list, then
# discard the identifier and description columns.
data = (
    data
    .dropna(subset=["assembly_name", "part_names"])
    .drop(columns=["assembly_id", 'assembly_description'])
)
data.head()

In [None]:
# Deduplicate: a row is dropped when its part list repeats that of an earlier
# row. The lists are converted to tuples so pandas can hash and compare them.
print(f"Tot: {len(data)}")
print(f"Unique: {data['assembly_name'].nunique(dropna=False)}")

part_tuples = data['part_names'].apply(tuple)
is_repeat = part_tuples.duplicated()
data = data[~is_repeat]

print(f"\nAfter dedup: {len(data)}")
print(f"Unique: {data['assembly_name'].nunique(dropna=False)}")

### Clean assembly names

In [None]:
# File-extension fragments stripped from assembly names (removed wherever they
# occur, in this order).
_ASSEMBLY_FILE_EXTENSIONS = ('.x_t', '.stp', '.step', '.zip')
# Lazily consumes text up to a lower->Upper boundary ("gearBox"), an acronym
# boundary ("HTTPServer" -> "HTTP" | "Server") or the end of the string.
_ASSEMBLY_CAMEL_CASE_RE = re.compile(
    r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
)
# A purely alphabetic word followed directly by digits ("Motor123"). The digits
# sit outside every capture group, so re.split() discards them.
_ASSEMBLY_TRAILING_NUMBER_RE = re.compile(r'(\s+|^)([A-Za-z]+)\d+(\s+|$)')
# Punctuation that is turned into a blank once the name is lower-cased.
_ASSEMBLY_SEPARATORS = str.maketrans({char: ' ' for char in '_-[]()?*'})


def process_assembly_names(string):
    """Normalise a raw assembly name into lower-case, space-separated words.

    Steps, in order: strip known file-extension fragments, split camelCase
    words, drop digits glued to the end of an alphabetic word, lower-case,
    replace separator punctuation with blanks, remove the phrases "copy of"
    and "copy", and finally collapse runs of whitespace.

    Args:
        string: The raw assembly name.

    Returns:
        The cleaned name; an empty string if nothing is left.
    """
    for extension in _ASSEMBLY_FILE_EXTENSIONS:
        string = string.replace(extension, '')

    # "gearBox" -> "gear Box"
    string = ' '.join(_ASSEMBLY_CAMEL_CASE_RE.findall(string))
    # "Motor123" -> "Motor" (surrounded by blanks that are collapsed below)
    string = ' '.join(_ASSEMBLY_TRAILING_NUMBER_RE.split(string))

    string = string.lower().translate(_ASSEMBLY_SEPARATORS)

    # "copy of" has to go first, otherwise a dangling "of" would be left over.
    # NOTE: "copy" is removed as a plain substring, so it is also cut out of
    # longer words.
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')

    return " ".join(string.split())


# The cleaner only needs the 'assembly_name' value, so map over that column
# directly instead of materialising a full row object for every record.
data['assembly_name_clean'] = data['assembly_name'].apply(process_assembly_names)

In [None]:
# Row count and number of distinct assembly names after cleaning.
row_count = len(data)
distinct_clean_names = data['assembly_name_clean'].nunique(dropna=False)
print(f"After dedup: {row_count}")
print(f"Unique: {distinct_clean_names}")

### Clean part names

In [None]:
# File-extension fragments stripped from part names (removed wherever they
# occur, in this order).
_PART_FILE_EXTENSIONS = ('.x_t', '.stp', '.step', '.dwg', '.zip')
# Lazily consumes text up to a lower->Upper boundary ("gearBox"), an acronym
# boundary ("HTTPServer" -> "HTTP" | "Server") or the end of the string.
_PART_CAMEL_CASE_RE = re.compile(
    r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
)
# A purely alphabetic word followed directly by digits ("Motor123"). The digits
# sit outside every capture group, so re.split() discards them.
_PART_TRAILING_NUMBER_RE = re.compile(r'(\s+|^)([A-Za-z]+)\d+(\s+|$)')
# Punctuation that is turned into a blank once the name is lower-cased.
_PART_SEPARATORS = str.maketrans({char: ' ' for char in '_-[]()?*'})


def _clean_part_name(string):
    """Normalise one raw part name into lower-case, space-separated words."""
    for extension in _PART_FILE_EXTENSIONS:
        string = string.replace(extension, '')

    # "gearBox" -> "gear Box"
    string = ' '.join(_PART_CAMEL_CASE_RE.findall(string))
    # "Motor123" -> "Motor" (surrounded by blanks that are collapsed below)
    string = ' '.join(_PART_TRAILING_NUMBER_RE.split(string))

    string = string.lower().translate(_PART_SEPARATORS)

    # "copy of" has to go first, otherwise a dangling "of" would be left over.
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')

    return " ".join(string.split())


def process_part_names(list):
    """Clean every part name of one assembly.

    Args:
        list: Iterable of raw part-name strings. The parameter name shadows
            the builtin; it is kept unchanged so existing callers keep working.

    Returns:
        A list with one cleaned name per input name, in the same order.
        ``np.nan`` is returned instead as soon as a name containing
        "MANIFOLD_SOLID_BREP" is met, so that the whole row can afterwards be
        removed with ``dropna``.
    """
    part_names = []
    for string in list:
        if "MANIFOLD_SOLID_BREP" in string:
            return np.nan
        part_names.append(_clean_part_name(string))

    return part_names


# The cleaner only needs the 'part_names' value, so map over that column
# directly instead of materialising a full row object for every record.
data['part_names_clean'] = data['part_names'].apply(process_part_names)
# process_part_names returns NaN for rejected assemblies; drop those rows.
# Reassigning (rather than inplace=True) avoids mutating a frame that was
# itself obtained by filtering another one.
data = data.dropna(subset=['part_names_clean'])


In [None]:
# Preview the first rows, which now include the cleaned name columns.
data.head()

## Create strings for fine-tuning

In [None]:
# Build one sentence per assembly for fine-tuning. Rows whose cleaned part
# list is empty are skipped, so `strings` can be shorter than `data`.
# NOTE(review): the sentence is built from the raw 'assembly_name' and
# 'part_names' columns; only the emptiness guard looks at 'part_names_clean'.
# Confirm that the uncleaned names are really what should be trained on.
strings = []
name_columns = zip(data['assembly_name'], data['part_names'], data['part_names_clean'])
for raw_assembly_name, raw_part_names, cleaned_part_names in tqdm(name_columns, total=len(data)):
    if len(cleaned_part_names) == 0:
        continue
    # join() puts the separator only between items, so no trailing ", " has
    # to be trimmed afterwards.
    part_listing = ", ".join(f"{part_name}" for part_name in raw_part_names)
    strings.append(
        f"An assembly named '{raw_assembly_name}' containing the following parts: {part_listing}."
    )

In [None]:
# Inspect the first ten generated sentences.
strings[:10]

### Train test split

In [None]:
# Split the positions of the sentences (not the sentences themselves) 80/20,
# so the same partition can be reused to slice other containers.
# The seed is fixed so that the exported train/test files describe the same
# partition every time the notebook is re-run.
ids = range(len(strings))
train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=42)
print(f"Length of train: {len(train_ids)}")
print(f"Length of test: {len(test_ids)}")

In [None]:
# train_ids / test_ids are positions in `strings`, and `strings` only has an
# entry for rows with a non-empty 'part_names_clean'. Restrict `data` to those
# rows first, otherwise positional indexing would pick rows that do not
# correspond to the selected sentences.
rows_with_strings = data[data['part_names_clean'].map(len) > 0]
if len(rows_with_strings) != len(strings):
    raise ValueError(
        f"{len(strings)} strings but {len(rows_with_strings)} matching rows; "
        "rebuild `strings` from the current `data` before exporting."
    )
rows_with_strings.iloc[train_ids].to_csv('../../data/train.csv')
rows_with_strings.iloc[test_ids].to_csv('../../data/test.csv')

In [None]:
# Write the training sentences to a text file, one per line, and report how
# many could not be encoded.
errors = 0
# Plain list indexing: converting `strings` to a NumPy array would allocate a
# fixed-width unicode array in which every entry is padded to the length of
# the longest sentence.
train_strings = [strings[i] for i in train_ids]
with open('../../data/fine_tune_train_strings.txt', 'w', encoding='utf-8') as f:
    for line in tqdm(train_strings):
        try:
            f.write(f"{line}\n")
        except UnicodeEncodeError:
            # With UTF-8 this only happens for code points that cannot be
            # encoded (e.g. lone surrogates); such lines are skipped and counted.
            errors += 1
print(errors)