# Masked Language Model task for predicting a missing part in an assembly

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tqdm import tqdm
from collections import Counter
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertModel, DataCollatorForLanguageModeling
import re

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\grandid\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Pre-process data for task

In [2]:
# Load the raw assembly table from a feather file (path is relative to the
# notebook's working directory).
data_path = "../../data/data_02.feather"
data = pd.read_feather(data_path)

In [3]:
# Keep only the rows that have both an assembly name and a part list, then
# discard the identifier and description columns.
required_columns = ["assembly_name", "part_names"]
discarded_columns = ["assembly_id", 'assembly_description']
data = data.dropna(subset=required_columns).drop(columns=discarded_columns)
data.head()

Unnamed: 0,assembly_name,part_names
0,Lava Lamp 2,"[Blob3, Blob2, Blob1, Glass, Cap]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge..."
4,Coffee Mug,"[Mug, Lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Rubber Grip, Gripper Rod..."


In [4]:
# Deduplicate on the part list only: two rows count as duplicates when their
# part_names sequences are identical, whatever the assembly is called.
print(f"Tot: {len(data)}")
print(f"Unique: {len(data['assembly_name'].unique())}")

# Convert each part list to a tuple so it is hashable and can be compared by
# `duplicated()`; the first occurrence of every part list is kept.
part_list_keys = data['part_names'].apply(tuple)
is_repeated_part_list = part_list_keys.duplicated()
data = data[~is_repeated_part_list]

print(f"\nAfter dedup: {len(data)}")
print(f"Unique: {len(data['assembly_name'].unique())}")

Tot: 88886
Unique: 67834

After dedup: 61725
Unique: 54034


### Clean assembly names

In [5]:
def process_assembly_names(string):
    """Normalize a raw assembly name into lowercase, space-separated words.

    The steps are applied in this order:

    1. Remove the substrings ``.x_t``, ``.stp``, ``.step`` and ``.zip``
       (case-sensitive, wherever they occur in the string).
    2. Insert a space at camelCase / acronym boundaries
       (``"camelCase"`` -> ``"camel Case"``, ``"HTTPServer"`` -> ``"HTTP Server"``).
    3. Drop the digits that directly follow a purely alphabetic,
       whitespace-delimited word (``"Blob3"`` -> ``"Blob"``). Stand-alone
       numbers such as the ``2`` in ``"Lava Lamp 2"`` are kept.
    4. Lowercase the string.
    5. Replace ``_ - [ ] ( ) ? *`` with spaces.
    6. Remove the substrings ``"copy of"`` and ``"copy"``.
    7. Collapse runs of whitespace and strip both ends.

    Args:
        string: The raw assembly name.

    Returns:
        The cleaned name as a ``str`` (possibly empty).
    """
    for extension in ('.x_t', '.stp', '.step', '.zip'):
        string = string.replace(extension, '')

    # Split camelCase into separate words: a chunk ends between a lowercase and
    # an uppercase letter, between an acronym and a capitalised word, or at the
    # end of the string.
    string = ' '.join(
        re.findall(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string)
    )

    # Remove the number at the end of alphabetic words. Lookarounds are used
    # instead of consuming the surrounding whitespace, so that every word in
    # "Blob3 Blob2 Blob1" is handled (a pattern that consumes the separator
    # cannot match the directly following word).
    string = re.sub(r'(?<!\S)([A-Za-z]+)\d+(?!\S)', r'\1', string)

    string = string.lower()
    for separator in ('_', '-', '[', ']', '(', ')', '?', '*'):
        string = string.replace(separator, ' ')

    # Plain substring removal: this also strips "copy" when it occurs inside a
    # longer word.
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')

    return " ".join(string.split())


# Clean every assembly name. The transformation only needs one column, so it is
# mapped element-wise over that column instead of building a row object per
# record with DataFrame.apply(axis=1).
data['assembly_name_clean'] = data['assembly_name'].map(process_assembly_names)

In [6]:
# Report the row count again together with the number of distinct *cleaned*
# assembly names, to show how many names collapse onto each other after cleaning.
print(f"After dedup: {len(data)}")
print(f"Unique: {len(data['assembly_name_clean'].unique())}")

After dedup: 61725
Unique: 49601


### Clean part names

In [7]:
def _clean_part_name(string):
    """Normalize one raw part name into lowercase, space-separated words."""
    for extension in ('.x_t', '.stp', '.step', '.dwg', '.zip'):
        string = string.replace(extension, '')

    # Split camelCase into separate words: a chunk ends between a lowercase and
    # an uppercase letter, between an acronym and a capitalised word, or at the
    # end of the string.
    string = ' '.join(
        re.findall(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string)
    )

    # Remove the number at the end of alphabetic words. Lookarounds are used
    # instead of consuming the surrounding whitespace, so that every word in
    # "Gear12 Gear13" is handled (a pattern that consumes the separator cannot
    # match the directly following word).
    string = re.sub(r'(?<!\S)([A-Za-z]+)\d+(?!\S)', r'\1', string)

    string = string.lower()
    for separator in ('_', '-', '[', ']', '(', ')', '?', '*'):
        string = string.replace(separator, ' ')

    # Plain substring removal: this also strips "copy" when it occurs inside a
    # longer word.
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')

    return " ".join(string.split())


def process_part_names(list):
    """Clean every part name of one assembly.

    Each name has the substrings ``.x_t``, ``.stp``, ``.step``, ``.dwg`` and
    ``.zip`` removed, is split at camelCase boundaries, loses the digits that
    directly follow a purely alphabetic whitespace-delimited word, is
    lowercased, has ``_ - [ ] ( ) ? *`` replaced by spaces, has ``"copy of"`` /
    ``"copy"`` removed, and finally has its whitespace collapsed.

    Args:
        list: Iterable of raw part-name strings. (The parameter name shadows
            the builtin; it is kept unchanged so existing callers still work.)

    Returns:
        A list with one cleaned string per input name, in the same order, or
        ``np.nan`` as soon as any name contains ``"MANIFOLD_SOLID_BREP"`` so
        that the caller can drop the whole assembly with ``dropna``.
    """
    part_names = []
    for string in list:
        # One such name rejects the whole assembly, not just the single part.
        if "MANIFOLD_SOLID_BREP" in string:
            return np.nan
        part_names.append(_clean_part_name(string))

    return part_names


# Clean the part list of every assembly. Only one column is needed, so the
# function is mapped element-wise over that column instead of building a row
# object per record with DataFrame.apply(axis=1).
data['part_names_clean'] = data['part_names'].map(process_part_names)

# process_part_names returns NaN for assemblies it rejects; drop those rows.
# Reassigning (rather than inplace=True) keeps the cell a plain expression of
# its inputs.
data = data.dropna(subset=['part_names_clean'])


In [8]:
# Preview the raw columns next to their cleaned counterparts.
data.head()

Unnamed: 0,assembly_name,part_names,assembly_name_clean,part_names_clean
0,Lava Lamp 2,"[Blob3, Blob2, Blob1, Glass, Cap]",lava lamp 2,"[blob, blob, blob, glass, cap]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge...",sample headphones,"[pivot hinge, headphone hinge, telescope hinge..."
4,Coffee Mug,"[Mug, Lid]",coffee mug,"[mug, lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]",dave's handsome mug,"[lid, seal, vessel]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Rubber Grip, Gripper Rod...",mechanical pencil,"[eraser, pencil lead, rubber grip, gripper rod..."


## Create strings for fine-tuning

In [9]:
# Drop assemblies whose cleaned part list is empty from `data` itself rather
# than skipping them while building `strings`. Later cells use the same
# positional ids for `strings` and for `data.iloc[...]`, so both must keep
# exactly one entry per row, in the same order.
data = data[data['part_names_clean'].map(len) > 0]

# One sentence per assembly.
# NOTE(review): the sentences are built from the raw `assembly_name` /
# `part_names` columns; the cleaned columns are only used for the emptiness
# filter above. Confirm that this is intended.
strings = [
    f"An assembly named '{assembly_name}' containing the following parts: "
    + ", ".join(f"{part_name}" for part_name in part_names)
    + "."
    for assembly_name, part_names in tqdm(
        zip(data['assembly_name'], data['part_names']), total=len(data)
    )
]

100%|██████████| 61601/61601 [00:04<00:00, 14612.92it/s]


In [10]:
# Show the first ten generated sentences as a sanity check.
strings[:10]

["An assembly named 'Lava Lamp 2' containing the following parts: Blob3, Blob2, Blob1, Glass, Cap.",
 "An assembly named 'Sample - Headphones' containing the following parts: Pivot hinge, Headphone hinge, Telescope hinge, Upper band, Headphone speaker.",
 "An assembly named 'Coffee Mug' containing the following parts: Mug, Lid.",
 "An assembly named 'Dave's Handsome Mug' containing the following parts: Lid, Seal, Vessel.",
 "An assembly named 'Mechanical Pencil' containing the following parts: Eraser, Pencil Lead, Rubber Grip, Gripper Rod, Button Release, Lead Gripper.",
 "An assembly named 'OS kinematics' containing the following parts: plate, peg, block, link, link, wheel.",
 "An assembly named 'Torch Light For Bike' containing the following parts: Torch Holder, B18.3.5M - 4 x 0.7 x 10 Socket FCHS  -- 10S, Torch Holder, Head, Default.",
 "An assembly named 'Bottle' containing the following parts: Cap, Bottle_Base.",
 "An assembly named 'Concept Vehicle' containing the following parts

### Train/test split

In [11]:
# Fixed seed so that the split, and every file derived from it, is the same on
# each run of the notebook.
SPLIT_SEED = 42

# The ids below are positions into `strings`; later cells reuse them with
# `data.iloc[...]`, which is only valid if there is exactly one string per row.
assert len(strings) == len(data), (
    f"strings ({len(strings)}) and data ({len(data)}) are out of sync"
)

ids = range(len(strings))
train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=SPLIT_SEED)
print(f"Length of train: {len(train_ids)}")
print(f"Length of test: {len(test_ids)}")

Length of train: 49280
Length of test: 12321


In [12]:
# Persist the rows of each split as CSV; the ids are positional, hence `iloc`.
split_targets = (
    (train_ids, '../../data/train.csv'),
    (test_ids, '../../data/test.csv'),
)
for split_ids, csv_path in split_targets:
    data.iloc[split_ids].to_csv(csv_path)

In [19]:
# Number of training sentences written to the fine-tuning file.
# Set to None to write the complete training split.
MAX_TRAIN_STRINGS = 1000

# Write one sentence per line. The sentences are looked up directly in the
# `strings` list, which avoids converting the whole list into a NumPy array
# just to select a subset of it.
errors = 0
with open('../../data/fine_tune_train_strings.txt', 'w', encoding='utf-8') as f:
    for string_id in tqdm(train_ids[:MAX_TRAIN_STRINGS]):
        try:
            f.write(f"{strings[string_id]}\n")
        except UnicodeEncodeError:
            # Best effort: skip sentences that cannot be encoded and report
            # how many were skipped below.
            errors += 1
print(errors)

100%|██████████| 1000/1000 [00:00<00:00, 1440.88it/s]

0



