# Masked language modeling task for predicting a missing part in an assembly

In [92]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tqdm import tqdm
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from transformers import BertTokenizer, TFBertModel, DataCollatorForLanguageModeling

### Pre-process data for task

In [28]:
# Load the raw assembly table from a Feather file (path is relative to the working directory).
data_path = "data/data.feather"
data = pd.read_feather(data_path)

In [29]:
# Keep only rows that have both an assembly name and a list of part names,
# then discard the columns that are not used to build the model input.
# errors="ignore" makes this cell safe to re-run: `data` is reassigned here,
# so on a second execution the columns are already gone and a plain drop()
# would raise KeyError.
data = (
    data
    .dropna(subset=["assembly_name", "part_names"])
    .drop(columns=["assembly_id", "assembly_description"], errors="ignore")
)
data.head()

Unnamed: 0,assembly_name,part_names
0,Lava Lamp 2,"[Cap, Blob2, Glass, Blob3, Blob1]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge..."
4,Coffee Mug,"[Mug, Lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Button Release, Rubber G..."


In [76]:
# Number of parts in each assembly, read directly from the part_names column.
lengths = [len(part_names) for part_names in data['part_names']]

# Frequency of each assembly size, most common first.
Counter(lengths).most_common()

[(1, 33589),
 (2, 9522),
 (5, 6686),
 (3, 5960),
 (4, 4765),
 (6, 2926),
 (10, 2664),
 (8, 1992),
 (7, 1989),
 (9, 1403),
 (11, 1060),
 (12, 1016),
 (13, 795),
 (14, 708),
 (15, 701),
 (30, 587),
 (17, 506),
 (16, 479),
 (25, 459),
 (18, 449),
 (41, 419),
 (20, 418),
 (26, 415),
 (22, 395),
 (19, 385),
 (21, 325),
 (23, 278),
 (24, 250),
 (27, 236),
 (31, 234),
 (35, 231),
 (77, 218),
 (32, 216),
 (73, 205),
 (29, 197),
 (84, 186),
 (28, 173),
 (34, 170),
 (202, 149),
 (33, 148),
 (36, 147),
 (40, 142),
 (37, 139),
 (134, 125),
 (44, 124),
 (46, 119),
 (48, 118),
 (39, 111),
 (126, 103),
 (38, 101),
 (45, 96),
 (727, 95),
 (75, 92),
 (43, 89),
 (94, 86),
 (42, 84),
 (64, 80),
 (66, 79),
 (61, 79),
 (50, 78),
 (47, 76),
 (59, 74),
 (49, 72),
 (62, 71),
 (122, 66),
 (63, 66),
 (52, 65),
 (80, 63),
 (60, 63),
 (65, 60),
 (72, 58),
 (54, 57),
 (51, 56),
 (58, 55),
 (55, 55),
 (165, 54),
 (57, 51),
 (101, 48),
 (76, 47),
 (133, 44),
 (53, 44),
 (106, 39),
 (74, 38),
 (78, 38),
 (70, 37),
 (

From the raw data, keep only assemblies that have more than one part, build a sentence for each assembly that can be fed into the model, replace unnecessary punctuation characters with spaces, and convert the text to lowercase.

In [43]:

def process_string(string):
    """Normalise a sentence before tokenization.

    The text is lowercased, and each of the characters ``_ - [ ] ( ) ?`` is
    replaced by a single space. Runs of spaces are not collapsed, so
    ``" - "`` becomes three spaces.

    Args:
        string: The raw sentence.

    Returns:
        The lowercased sentence with the listed characters turned into spaces.
    """
    # One translation table covers all seven single-character substitutions.
    punctuation_to_space = str.maketrans('_-[]()?', ' ' * 7)
    return string.lower().translate(punctuation_to_space)

# Build one sentence per assembly: the assembly name followed by its part
# names as a comma-separated list, passed through process_string.
strings = []
for index, row in tqdm(data.iterrows(), total=len(data)):
    part_names = row['part_names']
    # Keep only assemblies with more than one part.
    if len(part_names) <= 1:
        continue
    part_list = ", ".join(f"{part_name}" for part_name in part_names)
    string = process_string(
        f"In an assembly called '{row['assembly_name']}' there are the following parts: {part_list}."
    )
    strings.append(string)

100%|██████████| 88886/88886 [00:05<00:00, 16850.43it/s]


In [45]:
# Number of sentences kept after filtering, followed by a preview of the first ten.
print(len(strings))
strings[:10]

55297


["in an assembly called 'lava lamp 2' there are the following parts: cap, blob2, glass, blob3, blob1.",
 "in an assembly called 'sample   headphones' there are the following parts: pivot hinge, headphone hinge, telescope hinge, upper band, headphone speaker.",
 "in an assembly called 'coffee mug' there are the following parts: mug, lid.",
 "in an assembly called 'dave's handsome mug' there are the following parts: lid, seal, vessel.",
 "in an assembly called 'mechanical pencil' there are the following parts: eraser, pencil lead, button release, rubber grip, lead gripper, gripper rod.",
 "in an assembly called 'os kinematics' there are the following parts: plate, block, peg, link, wheel.",
 "in an assembly called 'torch light for bike' there are the following parts: torch holder, b18.3.5m   4 x 0.7 x 10 socket fchs     10s, head, torch holder, default.",
 "in an assembly called 'bottle' there are the following parts: cap, bottle base.",
 "in an assembly called 'concept vehicle' there ar

In [49]:
# Hold out 20% of the sentences for evaluation.
# train_test_split shuffles before splitting; fixing random_state makes the
# partition identical on every run, so results stay comparable across
# kernel restarts.
SPLIT_SEED = 42
train_strings, test_strings = train_test_split(strings, test_size=0.2, random_state=SPLIT_SEED)
print(f"Length of train: {len(train_strings)}")
print(f"Length of test: {len(test_strings)}")

Length of train: 44237
Length of test: 11060


In [51]:
# Scan for the longest sentence (in characters). Each time a new running
# maximum is found its index is printed; the final maximum is printed last.
length = 0
for i, string in enumerate(strings):
    current = len(string)
    if current <= length:
        continue
    length = current
    print(i)
print(length)

0
1
6
8
73
74
106
112
594
886
2230
36475
37169
138858


In [61]:
# Inspect one of the running-maximum sentences (index 594 was printed by the length scan above).
strings[594]

"in an assembly called 'redback spyder' there are the following parts: filter, gudeon pin, floor pan side, straight pipe, nut : standard   0.3125 x 6.7469 16, nut : standard   0.5 x 7.9375, nut : standard   m14.0 x 201.6125, rear hinge support top plate, bolt : hex   0.3125 x 1.25 29, cr 09, co 06, part34, muffler, part33, gear shift housing, part10, bolt : hex   0.3125 x 1.25 27, part61 3, wheel front inner rim, nut : standard   0.3125 x 6.7469 27, part21, wheel spinner, gear box mount plate, part38 3, swivel, floor pan, part32, co 21, part61, spring, nut : standard   0.3125 x 6.7469 19, air conditioning unit, cc 01, dash support joiner 19, sheet metal part27 1, anti rollbar blade, nut : self locking   m12.0 x 268.859, drive pin 2, nut : standard   0.3125 x 6.7469 1, cr 30, part17 2, cr 20, nut : standard   0.3125 x 6.7469 11, dash support joiner 3, steering wheel gear paddle microswitch, panel light, fan, bolt : hex   0.3125 x 1.25 23, cc 23, nut : standard   m6.0 x 77.47, floor pan 

Tokenize the data.

In [81]:
# Maximum number of tokens per encoded sentence; passed to the tokenizer as max_length.
max_length = 128

# Pretrained checkpoint used for both the tokenizer and the model weights.
model_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): the load log reports that the checkpoint's 'mlm___cls' layer is
# not used by TFBertModel, i.e. the pretrained masked-LM head is dropped here.
# Confirm whether a masked-LM model class is intended for this task.
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [90]:
# Tokenize both splits with exactly the same settings; sharing one kwargs
# dict keeps the two calls from drifting apart.
encode_kwargs = dict(
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf',
    return_special_tokens_mask=True,
)
train_encodings = bert_tokenizer(train_strings, **encode_kwargs)
test_encodings = bert_tokenizer(test_strings, **encode_kwargs)

In [85]:
# Peek at the token ids of the first two encoded training sentences.
train_encodings.input_ids[:2]

<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[  101,  1999,  2019,  3320,  2170,  1005, 15333,  2226,  2139,
         2810,  4241,  6948,  2475,  2072,  6100,  1005,  2045,  2024,
         1996,  2206,  3033,  1024, 23189,  2015,  1010, 23189,  1010,
         2490,  1016,  5013,  1032,  1060,  2475,  1032,  4002,  2063,
         2683,  1032,  1060,  2692,  1032, 23245, 11236,  1010, 23189,
         1009, 12528,  1010, 23189,  1009, 12528,  1010, 23189,  1010,
        23189,  1010, 23189,  1009, 12528,  1010, 23189,  1010, 23189,
         2015,  1010, 23189,  1009, 12528,  1010, 23189,  1010,  1043,
         6856,  5332,  2128,  1009, 12528,  1010,  1016, 23189,  2015,
         1009, 12528,  1010, 23189,  1009, 12528,  1010, 23189,  1009,
        12528,  1010, 12946,  9587,  2618,  3126,  1010, 12528,  1010,
        23189,  1010, 12528,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [100]:
# Decode the second encoded training sentence back to text to check the tokenization.
bert_tokenizer.decode(train_encodings['input_ids'][1])

"[CLS] in an assembly called'skidwaterair. stp'there are the following parts : 150, 8, 120, 60, 201, 76, 173, 27, 43, 154, 73, 32, 159, 214, 193, 171, 110, 156, 114, 164, 84, 36, 162, 45, 123, 23, 115, 139, 46, 205, 81, 192, 79, 212, 204, 62, 101, 122, 206, 163, 24, 65, 44, 183, 215, 140, 75, 94, 100, 180, 107, 25, 144, 3 [SEP]"

Next, we will mask the data randomly.

In [129]:
# Collator that applies random masking for masked language modeling;
# mlm_probability=0.15 is the masking probability passed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=bert_tokenizer, mlm_probability=0.15)
# TODO: modify the data collator so that it does not mask the template words
# that are identical in every input sentence.

In [132]:
# Use the first two encoded training sentences as a small demo batch.
samples = [train_encodings['input_ids'][row_idx] for row_idx in range(2)]

# Run the collator once on the batch, then decode every masked sequence so the
# effect of the random masking can be inspected.
masked_batch = data_collator(samples, return_tensors='tf')
for chunk in masked_batch["input_ids"]:
    print(f"\n'>>> {bert_tokenizer.decode(chunk)}'")


'>>> [CLS] in an assembly called'je [MASK] de construction du lp2i edo'there are the [MASK] parts : barres, barre, support 2 motor \ [MASK]2 \ 00e [MASK] \ x0 \ [MASK]eur [MASK] barre + clip, barre + clip, barre, barre [MASK] barre + [MASK], barre, barres [MASK] barre + clip, barre [MASK] glissi re + clip, 2 barres + clip, barre + clip, [MASK] + clip, axe moteur, clip [MASK] barre, clip [MASK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] in an assembly called [MASK] skidwaterair. stp'[MASK] are the following parts chatting 150, 8 [MASK] 120, 60, 201, [MASK] [MASK] 173, 27, 43 [MASK] 154, 73, 32, 159, 214, 193, [MASK], 110, [MASK], 114, 164 [MASK] 84, 36, 162, 45, [MASK], 23, 115, 139 [MASK] 46, 205, 81, [MASK], 79, 212 [MASK] 204, [MASK], 101, 122, 206 [MASK] 163, 24, 65, 44, [MASK], 215, [MASK], 75, 94, 100, 180, 107, 25, 144, 3 [SEP]'
