# Predicting whether two sets of part names are from the same assembly

In [1]:
import ast
import os
import random
import re
import time
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras import layers
from transformers import AutoTokenizer, TFAutoModel
from sklearn.model_selection import train_test_split

In [2]:
# List every device TensorFlow can currently see (no device-type filter).
tf.config.get_visible_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]

In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to a single GPU: the second one (index 1) when the
    # host has several, otherwise the only GPU available. Indexing gpus[1]
    # unconditionally would raise IndexError on a single-GPU machine.
    selected_gpu = gpus[1] if len(gpus) > 1 else gpus[0]
    try:
        tf.config.set_visible_devices(selected_gpu, 'GPU')
    except RuntimeError as e:
        # Visible devices must be set at program startup
        print(e)

In [4]:
# Confirm which devices remain visible after the GPU restriction in the
# previous cell.
tf.config.get_visible_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Preprocess data

### Pre-process data for task

In [5]:
# Load the assembly table from a Feather file into a DataFrame.
data_path = "../data/data_02.feather"
data = pd.read_feather(data_path)

In [6]:
# Discard rows lacking an assembly name or a part list, then remove the id
# and description columns, which are not referenced in the cells below.
data = data.dropna(subset=["assembly_name", "part_names"])
data = data.drop(columns=["assembly_id", 'assembly_description'])
data.head()

Unnamed: 0,assembly_name,part_names
0,Lava Lamp 2,"[Blob3, Blob2, Blob1, Glass, Cap]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge..."
4,Coffee Mug,"[Mug, Lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Rubber Grip, Gripper Rod..."


In [7]:
# Deduplicate: keep only the first row for each distinct list of part names,
# reporting row and distinct-name counts before and after.
total_rows = len(data)
distinct_names = data['assembly_name'].nunique(dropna=False)
print(f"Tot: {total_rows}")
print(f"Unique: {distinct_names}")

# Lists are unhashable, so each part list is converted to a tuple before
# looking for repeats.
part_name_keys = data['part_names'].apply(tuple)
data = data.loc[~part_name_keys.duplicated()]

print(f"\nAfter dedup: {len(data)}")
print(f"Unique: {data['assembly_name'].nunique(dropna=False)}")

Tot: 88886
Unique: 67834

After dedup: 61725
Unique: 54034


### Clean assembly names

In [8]:
def process_assembly_names(string):
    """Normalize a raw assembly name into lower-case, space-separated words.

    Steps, in order:
      1. Remove the CAD/archive extensions ``.x_t``, ``.stp``, ``.step`` and
         ``.zip`` wherever they occur (case-sensitive).
      2. Insert spaces at camelCase / acronym boundaries.
      3. Drop the digits from whitespace-delimited tokens made of letters
         followed by digits (``"Blob3"`` -> ``"Blob"``).
      4. Lower-case, turn ``_ - [ ] ( ) ? *`` into spaces, remove the phrases
         ``"copy of"`` and ``"copy"``, and collapse runs of whitespace.

    Args:
        string: The raw assembly name.

    Returns:
        The cleaned name; may be an empty string.
    """
    for extension in ('.x_t', '.stp', '.step', '.zip'):
        string = string.replace(extension, '')

    # Raw strings keep ``\s`` / ``\d`` as regex escapes instead of (invalid)
    # Python string escapes.
    # splits camelCase into camel case
    string = ' '.join(
        re.findall(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string)
    )
    # Removes the numeric suffix of letter+digit tokens. Each match consumes
    # its trailing whitespace, so a token directly following a stripped one
    # is left as it is ("a1 b2" -> "a b2").
    string = ' '.join(re.split(r'(\s+|^)([A-Za-z]+)\d+(\s+|$)', string))

    string = string.lower()
    # One pass replaces every separator/bracket character with a space.
    string = string.translate(str.maketrans(dict.fromkeys('_-[]()?*', ' ')))
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')

    return ' '.join(string.split())


# Clean every assembly name. Applying the function to the single column
# avoids building a row object per record and also works on an empty frame.
data['assembly_name_clean'] = data['assembly_name'].apply(process_assembly_names)

In [9]:
# Cleaning only adds a column, so the row count is unchanged; the second
# line counts distinct *cleaned* assembly names.
print(f"After dedup: {len(data)}")
print(f"Unique: {len(data['assembly_name_clean'].unique())}")

After dedup: 61725
Unique: 49601


### Clean part names

In [10]:
def process_part_names(part_list):
    """Clean, de-duplicate and shuffle the part names of one assembly.

    Each name goes through the same normalization as assembly names
    (extension removal, camelCase splitting, numeric-suffix removal,
    lower-casing, punctuation and "copy" removal, whitespace collapsing),
    with ``.dwg`` additionally stripped.

    Args:
        part_list: Iterable of raw part-name strings.

    Returns:
        A list of the unique cleaned names in random order, or ``np.nan`` when
        any raw name contains ``"MANIFOLD_SOLID_BREP"`` (the whole assembly
        is then discarded by the caller's ``dropna``).
    """
    separators_to_space = str.maketrans(dict.fromkeys('_-[]()?*', ' '))

    part_names = []
    for string in part_list:
        if "MANIFOLD_SOLID_BREP" in string:
            return np.nan
        for extension in ('.x_t', '.stp', '.step', '.dwg', '.zip'):
            string = string.replace(extension, '')
        # splits camelCase into camel case
        string = ' '.join(
            re.findall(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string)
        )
        # removes the numeric suffix of letter+digit tokens
        string = ' '.join(re.split(r'(\s+|^)([A-Za-z]+)\d+(\s+|$)', string))
        string = string.lower()
        string = string.translate(separators_to_space)
        string = string.replace('copy of', ' ')
        string = string.replace('copy', ' ')
        part_names.append(' '.join(string.split()))

    # De-duplicate first (keeping first-seen order), then shuffle. Shuffling
    # before converting through a set would throw the shuffle away and leave
    # an order that depends on string hashing rather than on `random`.
    unique_names = list(dict.fromkeys(part_names))
    random.shuffle(unique_names)
    return unique_names


# Clean every part list. Applying the function to the single column avoids
# building a row object per record and also works on an empty frame.
data['part_names_clean'] = data['part_names'].apply(process_part_names)
# Assemblies flagged with NaN by process_part_names are removed here.
data.dropna(subset=['part_names_clean'], inplace=True)

In [11]:
# Preview the first rows, now including the cleaned columns.
data.head()

Unnamed: 0,assembly_name,part_names,assembly_name_clean,part_names_clean
0,Lava Lamp 2,"[Blob3, Blob2, Blob1, Glass, Cap]",lava lamp 2,"[blob, cap, glass]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge...",sample headphones,"[pivot hinge, headphone hinge, telescope hinge..."
4,Coffee Mug,"[Mug, Lid]",coffee mug,"[mug, lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]",dave's handsome mug,"[seal, vessel, lid]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Rubber Grip, Gripper Rod...",mechanical pencil,"[pencil lead, eraser, lead gripper, rubber gri..."


## Create sentences

In [12]:
# Build one [sentence_1, sentence_2] pair per assembly that has at least two
# parts: the first sentence names the assembly and lists the first half of
# its parts (every part followed by ", "), the second sentence lists the
# remaining parts and ends with a full stop.
sentence_pairs = []
name_and_parts = zip(data['assembly_name_clean'], data['part_names_clean'])
for assembly_name, parts in tqdm(name_and_parts, total=len(data)):
    if len(parts) <= 1:
        continue

    midpoint = len(parts) // 2
    listed_first = ''.join(f"{part_name}, " for part_name in parts[:midpoint])
    sentence_1 = f"An assembly named '{assembly_name}' containing the following parts: " + listed_first
    sentence_2 = ', '.join(f"{part_name}" for part_name in parts[midpoint:]) + "."

    sentence_pairs.append([sentence_1, sentence_2])


100%|██████████| 61601/61601 [00:03<00:00, 18380.88it/s]


In [13]:
# Shuffle the pairs in place (the list is later cut into two halves by
# position), then report how many pairs exist and show a sample.
random.shuffle(sentence_pairs)
print(len(sentence_pairs))
pprint(sentence_pairs[:10])

40240
[["An assembly named 'train set' containing the following parts: straight, ",
  'cross.'],
 ["An assembly named 'acc profile30' containing the following parts: motor, ",
  '3030.'],
 ["An assembly named 'press' containing the following parts: hsr20r130030 "
  'rail, ',
  'tgmx25 prt1.'],
 ["An assembly named 'foodprints 0.4' containing the following parts: magnet, ",
  'pcb.'],
 ["An assembly named 'antweight robot' containing the following parts: 37 x 22 "
  'offset hub, nano two electronics, ',
  'user library motor, pololu, 2s 260m ah lipo no wire, lego tyre 43 x 22.'],
 ["An assembly named 'pcb vice' containing the following parts: vice endstop, "
  'vice jaw, spring, ',
  'rod, bolt, nut cap, spring former.'],
 ["An assembly named 'sba sheet metal brake a' containing the following parts: "
  '3sba1 clamp, ',
  '2sba1 bending leaf, 1sba1 base.'],
 ["An assembly named 'robot v' containing the following parts: back left bat "
  'holder, main plate, front right bat holder, ',
  

### Train test split

In [14]:
# First half of the shuffled pairs stays intact (positives); the second half
# is the pool from which negatives are made.
split_point = len(sentence_pairs) // 2
positive_samples, negative_samples = (
    sentence_pairs[:split_point],
    sentence_pairs[split_point:],
)

In [15]:
# Scramble the negative samples while maintaining the sentence pair order
print(negative_samples[1])
negative_sentence_1 = [pair[0] for pair in negative_samples]
negative_sentence_2 = [pair[1] for pair in negative_samples]

# Draw a random re-assignment of the second sentences. A plain shuffle can
# leave an element at its original position, which would yield a genuine pair
# labelled as negative. Every such fixed point is therefore swapped with its
# right-hand neighbour (wrapping around); a single left-to-right pass removes
# all of them because a swapped-in value can never equal its new position.
source_index = list(range(len(negative_sentence_2)))
random.shuffle(source_index)
if len(source_index) > 1:
    for position in range(len(source_index)):
        if source_index[position] == position:
            neighbour = (position + 1) % len(source_index)
            source_index[position], source_index[neighbour] = (
                source_index[neighbour],
                source_index[position],
            )
negative_sentence_2 = [negative_sentence_2[i] for i in source_index]

negative_samples = [list(x) for x in zip(negative_sentence_1, negative_sentence_2)]
print(negative_samples[1])

["An assembly named 'hr carsoft comfort unit 284 connector' containing the following parts: comfort unit lock 284 v1, ", 'comfort unit connector 284 v1.']
["An assembly named 'hr carsoft comfort unit 284 connector' containing the following parts: comfort unit lock 284 v1, ", '6tsb150, twist drill, 2ksabody.']


In [16]:
# Create labels: 1 = positive, 0 = negative
# Each sample becomes a (sentence_pair, label) tuple.
positive_samples = [(sample, 1) for sample in positive_samples]
negative_samples = [(sample, 0) for sample in negative_samples]

# Recombine both classes into one list; all positives come first here.
sentence_pairs = positive_samples + negative_samples

In [17]:
# Hold out 20% of the labelled pairs for testing.
# NOTE(review): no random_state is passed, so the split differs between
# runs; confirm whether a reproducible split is wanted.
train, test = train_test_split(sentence_pairs, test_size=0.2)
print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")

Length of train: 32192
Length of test: 8048


In [18]:
# Inspect the first few training samples.
train[:5]

[(["An assembly named 'p802qa x axis carriage' containing the following parts: sensor mount, 2x lm8uu mount, mount, carriage, ",
   '8mm x 350mm rod, clamp, belt clamp, belt clip.'],
  1),
 (["An assembly named 'business cardholder' containing the following parts: box, ",
   'flat mounting plate, 41" cross member, 43" cross member.'],
  0),
 (["An assembly named 'crank slide syringe rev2 asm' containing the following parts: machine screw 10 32x 75, short short tube straight, machine screw 8 32x 375, check valve, 2974k353, base plate, 91735a199, nylon bushing, 91735a197, short tube bent, black perp connector, short tube striaght, yellow gear, metal square carriage, metal link 11 holes, sweep, ",
   'cs outer.'],
  0),
 (['An assembly named \'ot linear positioner\' containing the following parts: shcs m4x0.7x22mm, act side mnt plate, shaft bearing, fhscs m4x0.7x12mm, bearing rail, m6x1x22mm, shcs m8x22mm, clevis pin 3/8" x 2", shcs m3x14mm, dog shaft1/4in x 8in, bhscs m5x12mm, mls spacer

In [19]:
# Save out train data
train_data_path = "../data/entailment/train_entailment.csv"
# Two columns: the [sentence_1, sentence_2] list and the integer label.
train_df = pd.DataFrame(train)

# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists, so no separate existence check
# is needed.
Path(train_data_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the file to a header line plus the two data columns.
train_df.to_csv(train_data_path, index=False)

In [20]:
def preprocess_data(data, tokenizer, max_length=128):
    """Tokenize labelled sentence pairs into fixed-length model inputs.

    Args:
        data: Sequence of ``(sentence_pair, label)`` items; only positions 0
            and 1 of each item are read.
        tokenizer: Object exposing ``batch_encode_plus`` (the notebook passes
            a Hugging Face BERT tokenizer).
        max_length: Length every encoded sequence is padded or truncated to.

    Returns:
        A tuple ``(features, labels)``. ``features`` is the list
        ``[input_ids, attention_masks, token_type_ids]`` of int32 arrays and
        ``labels`` is a NumPy array built from the labels.
    """
    targets = [entry[1] for entry in data]
    pairs = [entry[0] for entry in data]

    # batch_encode_plus receives the pairs as-is; every sequence is padded or
    # truncated to exactly max_length tokens.
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    encoded = tokenizer.batch_encode_plus(pairs, **encoding_options)

    # Convert the three encoded fields to int32 arrays in the order the
    # model's inputs are declared.
    features = [
        np.array(encoded[field], dtype="int32")
        for field in ("input_ids", "attention_mask", "token_type_ids")
    ]

    return features, np.array(targets)

## Construct the model

In [21]:
def build_baseline_model(bert_model, max_length=128, hidden_dim=256, num_classes=3):
    """Build and compile a classifier head on top of a BERT-style encoder.

    The encoder's ``last_hidden_state`` is passed through a multi-head
    self-attention layer, max-pooled over the sequence, regularised with
    dropout and projected to ``num_classes`` softmax probabilities.

    Args:
        bert_model: Callable encoder accepting ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and returning an object
            with a ``last_hidden_state`` attribute.
        max_length: Number of token positions in every input sequence.
        hidden_dim: Currently unused; kept so existing callers keep working.
        num_classes: Width of the softmax output layer. The default of 3
            preserves the previous hard-coded value; the labels created in
            this notebook are 0/1, so 2 is sufficient for that task.

    Returns:
        A ``tf.keras`` model taking ``[input_ids, attention_masks,
        token_type_ids]``, compiled with Adam, sparse categorical
        cross-entropy and an accuracy metric.
    """
    # ``shape`` must be a tuple: ``(max_length)`` is just an int.
    input_shape = (max_length,)
    input_ids = layers.Input(shape=input_shape, dtype=tf.int32, name='input_ids')
    attention_masks = layers.Input(shape=input_shape, dtype=tf.int32, name='attention_masks')
    token_type_ids = layers.Input(shape=input_shape, dtype=tf.int32, name='token_type_ids')

    bert_output = bert_model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
    sequence_output = bert_output.last_hidden_state

    # Self-attention: the encoder output serves as both query and value.
    attn_output = layers.MultiHeadAttention(num_heads=4, key_dim=100)(sequence_output, sequence_output)
    max_pool = layers.GlobalMaxPooling1D()(attn_output)
    dropout_output = layers.Dropout(0.3)(max_pool)
    final_output = layers.Dense(num_classes, activation="softmax")(dropout_output)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks, token_type_ids],
                                  outputs=[final_output])
    # Sparse loss: labels are integer class ids, not one-hot vectors.
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [22]:
# Load the pretrained tokenizer and encoder for the chosen checkpoint.
bert_model_name='bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = TFAutoModel.from_pretrained(bert_model_name)
# Mark the encoder as non-trainable so only the layers added on top of it
# are fitted.
bert_model.trainable = False

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [23]:
# Build the classifier on top of the loaded encoder and print its layers.
model = build_baseline_model(bert_model, max_length=128, hidden_dim=256)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]',    

In [24]:
class SNLIDataGeneratorFromFile(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that streams labelled sentence pairs from a CSV file.

    Every batch is produced by re-reading ``data_filename`` and skipping all
    rows that do not belong to the batch, so only one batch of text is held
    in memory at a time. Tokenization is delegated to the module-level
    ``preprocess_data``.

    Args:
        tokenizer: Tokenizer handed to ``preprocess_data``.
        n_examples: Number of data rows in the CSV (header line excluded).
        data_filename: CSV with a header line whose first column holds the
            sentence pair and whose second column holds the label.
        max_length: Token length forwarded to ``preprocess_data``.
        batch_size: Number of rows per batch.
        shuffle: Whether to permute the row order at construction time and
            at the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=32,
                 shuffle=True):
        super().__init__()

        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # CSV line numbers of the data rows (line 0 is the header). Kept as
        # a NumPy array whether or not shuffling is enabled.
        self.row_order = np.arange(1, self.n_examples + 1)
        self.on_epoch_end()

    def __len__(self):
        # Number of full batches; a trailing partial batch is not served.
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for batch number ``idx``."""
        batch_start = idx * self.batch_size
        batch_end = batch_start + self.batch_size

        # Rows to skip are the ones in row_order before and after the chunk
        # used for this batch. np.concatenate is used because ``+`` on
        # arrays adds element-wise instead of joining them.
        rows_to_skip = np.concatenate(
            (self.row_order[:batch_start], self.row_order[batch_end:])
        ).tolist()
        batch_df = pd.read_csv(self.data_filename, skiprows=rows_to_skip)

        # Columns are addressed by position; their labels depend on how the
        # CSV header was written.
        batch = [
            (self._parse_sentence_pair(raw_pair), label)
            for raw_pair, label in zip(batch_df.iloc[:, 0], batch_df.iloc[:, 1])
        ]

        return preprocess_data(batch, self.tokenizer, self.max_length)

    @staticmethod
    def _parse_sentence_pair(raw_pair):
        """Turn a CSV cell back into a ``[sentence_1, sentence_2]`` list.

        A list column written by ``DataFrame.to_csv`` is read back as the
        string form of that list. Passing the string on unchanged would make
        the tokenizer encode it as a single sentence rather than as a pair.
        Cells that are not a list/tuple literal are returned untouched.
        """
        if not isinstance(raw_pair, str):
            return raw_pair
        try:
            parsed = ast.literal_eval(raw_pair)
        except (ValueError, SyntaxError):
            return raw_pair
        return list(parsed) if isinstance(parsed, (list, tuple)) else raw_pair

    def on_epoch_end(self):
        """Draw a new random row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = np.random.permutation(self.row_order)

In [25]:
# Batch generator over the training CSV written earlier, 16 rows per batch;
# max_length and shuffle keep their defaults.
train_data_generator = SNLIDataGeneratorFromFile(
    tokenizer=bert_tokenizer,
    n_examples=len(train),
    data_filename=train_data_path,
    batch_size=16
)

In [26]:
# Tokenize the whole test split up front; it is passed to model.fit as
# validation data.
test_data = preprocess_data(
    test, tokenizer=bert_tokenizer, max_length=128
)

In [27]:
# Weight checkpoints go next to the training data; the file name embeds the
# epoch number and the validation accuracy.
checkpoint_dir = '../data/entailment/'
checkpoint_filepath = os.path.join(
    checkpoint_dir, 'weights.{epoch:02d}-{val_accuracy:.2f}.hdf5'
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    save_weights_only=True,
)

## Train model

In [28]:
# Run tf.function-wrapped code eagerly. The non-experimental name replaces
# the deprecated `experimental_run_functions_eagerly` alias.
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [None]:
# Train for 5 epochs from the CSV-backed generator, validating on the
# pre-tokenized test split, with the checkpoint callback attached.
model.fit(train_data_generator, validation_data=test_data, epochs=5,
          callbacks=[model_checkpoint_callback])

Epoch 2/5