# Clean assembly names in an attempt to reduce unique names

In [174]:
import pandas as pd
from collections import Counter
import re
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel
from pprint import pprint

### Pre-process data for task

In [175]:
data_path = "data/data_02.feather"
data = pd.read_feather(data_path)

In [176]:
data = data.dropna(subset=["assembly_name", "part_names"])
data = data.drop(columns=["assembly_id", 'assembly_description']) #, 'part_names'])
data.head()

Unnamed: 0,assembly_name,part_names
0,Lava Lamp 2,"[Blob3, Blob2, Blob1, Glass, Cap]"
1,Sample - Headphones,"[Pivot hinge, Headphone hinge, Telescope hinge..."
4,Coffee Mug,"[Mug, Lid]"
5,Dave's Handsome Mug,"[Lid, Seal, Vessel]"
9,Mechanical Pencil,"[Eraser, Pencil Lead, Rubber Grip, Gripper Rod..."


In [177]:
print(f"Tot: {len(data)}")
print(f"Unique: {len(data['assembly_name'].unique())}")

Tot: 88886
Unique: 67834


In [178]:
# Deduplicate
data = data[~data['part_names'].apply(tuple).duplicated()]
print(f"After dedup: {len(data)}")
print(f"Unique: {len(data['assembly_name'].unique())}")

After dedup: 61725
Unique: 54034


In [179]:
counts = Counter(data['assembly_name'])
counts.most_common()[:100]

[('Untitled document', 1052),
 ('FarmBot Genesis - Copy', 256),
 ('Onshape Instructor Kit - 4.2.2 - Creating and Adding Subassemblies - Copy',
  174),
 ('EEZYbotARM_MK2 - Copy', 158),
 ('Test', 126),
 ('test', 119),
 ('PERRINN F1 aero - Copy', 114),
 ('Tutorial 1 - Onshape Tour - Copy', 82),
 ('SmartrapCoreBASE - Smartfriendz - Copy', 82),
 ('PERRINN LMP1 - Copy', 61),
 ('Box', 40),
 ('College - Week 4 - Copy', 39),
 ('Violin - Copy', 38),
 ('Poppy humanoid v1.0.1 - Copy', 36),
 ('EVEZOR Robotic Arm - Copy', 35),
 ('Aerofoil Script - Copy', 34),
 ('Prusa i3 MK2 by TOTO-G - Copy', 34),
 ('Robot', 31),
 ('My Mechanical Project', 31),
 ('Desk', 30),
 ('CNC', 28),
 ('3D Printer', 28),
 ('Wheel Grinder - Copy', 28),
 ('DrawingMachine - Copy', 28),
 ('Table', 27),
 ('Spool Holder', 25),
 ('Bullet Feeder - Copy', 25),
 ('Copy of Tutorial 1 - Onshape Tour', 23),
 ('Lulzbot Taz 6 Stock - Copy', 23),
 ('House', 22),
 ('Concept Vehicle - Copy', 22),
 ('spinner', 22),
 ('Workbench', 21),
 ('G2 Hel

In [184]:
def process_string(string):

    string = ' '.join(re.findall('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string))  # splits camelCase into camel case
    string = ' '.join(re.split('(\s+|^)([A-Za-z]+)\d+(\s+|$)', string))  # removes number at the end
    string = string.lower()
    string = string.replace('.x_t', '')
    string = string.replace('.stp', '')
    string = string.replace('.step', '')
    string = string.replace('.zip', '')
    string = string.replace('.dwg', '')
    string = string.replace('_', ' ')
    string = string.replace('-', ' ')
    string = string.replace('[', ' ')
    string = string.replace(']', ' ')
    string = string.replace('(', ' ')
    string = string.replace(')', ' ')
    string = string.replace('?', ' ')
    string = string.replace('*', ' ')
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')
    string = " ".join(string.split())

    return string


data['assembly_name_clean'] = data.apply(lambda row: process_string(row.assembly_name), axis=1)

In [185]:
print(f"After dedup: {len(data)}")
print(f"Unique: {len(data['assembly_name_clean'].unique())}")

counts = Counter(data['assembly_name_clean'])
counts.most_common()[:100]

After dedup: 61725
Unique: 49644


[('untitled document', 1094),
 ('test', 353),
 ('farm bot genesis', 277),
 ('onshape instructor kit 4.2.2 creating and adding subassemblies', 197),
 ('eez ybot arm mk2', 169),
 ('perrinn f aero', 135),
 ('tutorial 1 onshape tour', 110),
 ('smartrap core base smartfriendz', 87),
 ('perrinn lmp', 82),
 ('box', 78),
 ('robot', 66),
 ('3d printer', 57),
 ('p', 50),
 ('cnc', 49),
 ('violin', 48),
 ('table', 45),
 ('college week 4', 45),
 ('spinner', 45),
 ('house', 44),
 ('spool holder', 42),
 ('desk', 41),
 ('poppy humanoid v1.0.1', 41),
 ('wheel grinder', 39),
 ('prusa i mk2 by toto g', 39),
 ('aerofoil script', 39),
 ('my mechanical project', 39),
 ('concept vehicle', 38),
 ('frame', 37),
 ('evezor robotic arm', 36),
 ('case', 34),
 ('drawing machine', 32),
 ('workbench', 31),
 ('clock', 29),
 ('scooter', 29),
 ('perrinn f 2016', 28),
 ('perrinn f', 28),
 ('m', 27),
 ('gear', 27),
 ('official smartrap core smartfriendz', 27),
 ('bullet feeder', 27),
 ('sample engine', 25),
 ('assembly', 

# Clean Part Names

In [117]:
part_names = data.explode('part_names')

In [119]:
print(f"Tot: {len(part_names)}")
print(f"Unique: {len(part_names['part_names'].unique())}")

Tot: 1598870
Unique: 405136


In [120]:
counts = Counter(part_names['part_names'])
counts.most_common()[:100]

[('MANIFOLD_SOLID_BREP #15', 30019),
 ('P-08-001-A Fuel tank', 23920),
 ('Document', 7943),
 ('1', 6592),
 ('MANIFOLD_SOLID_BREP #323757', 5956),
 ('Extruded', 5822),
 ('Open-CR TP2.0', 4867),
 ('MANIFOLD_SOLID_BREP #268734', 4516),
 ('P-09-001-A Engine', 4515),
 ('MANIFOLD_SOLID_BREP #199421', 4272),
 ('PCB', 3677),
 ('MANIFOLD_SOLID_BREP #18492', 3540),
 ('MANIFOLD_SOLID_BREP #525570', 3444),
 ('1P65fa', 3393),
 ('MANIFOLD_SOLID_BREP_202_1_5_1', 3384),
 ('53.8169.00_1_1\\53.1368.00_1_1\\KT.53.1368_03.00_1\\MANIFOLD_SOLID_BREP #125345',
  3214),
 ('53.8169.00_1_1\\53.1368.00_1_1\\KT.53.1368_03.00_2\\MANIFOLD_SOLID_BREP #125345',
  3214),
 ('PDP_rev_1.0', 3135),
 ('MANIFOLD_SOLID_BREP #73620', 2830),
 ('ramps-1.4', 2804),
 ('None', 2759),
 ('    MSBR', 2692),
 ('Shikari', 2481),
 ('PartBody', 2472),
 ('MANIFOLD_SOLID_BREP #9766', 2466),
 ('MANIFOLD_SOLID_BREP #219', 2448),
 ('MANIFOLD_SOLID_BREP #221', 2366),
 ('MANIFOLD_SOLID_BREP #34654', 2088),
 ('MANIFOLD_SOLID_BREP #149055', 2088)

In [124]:
def process_string(string):
    if "MANIFOLD_SOLID_BREP" in string:
        return "MANIFOLD_SOLID_BREP"
    string = string.replace('.x_t', '')
    string = string.replace('.stp', '')
    string = string.replace('.step', '')
    string = string.replace('.zip', '')
    string = ' '.join(re.findall('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', string))  # splits camelCase into camel case
    string = ' '.join(re.split('(\s+|^)([A-Za-z]+)\d+(\s+|$)', string))  # removes number at the end
    string = string.lower()
    string = string.replace('_', ' ')
    string = string.replace('-', ' ')
    string = string.replace('[', ' ')
    string = string.replace(']', ' ')
    string = string.replace('(', ' ')
    string = string.replace(')', ' ')
    string = string.replace('?', ' ')
    string = string.replace('*', ' ')
    string = string.replace('copy of', ' ')
    string = string.replace('copy', ' ')
    string = " ".join(string.split())

    return string


part_names['part_names_clean'] = part_names.apply(lambda row: process_string(row.part_names), axis=1)

In [143]:
print(f"Tot: {len(part_names)}")
print(f"Unique: {len(part_names['part_names_clean'].unique())}")

counts = Counter(part_names['part_names_clean'])
most_common_part_names = counts.most_common()[:100]
pprint(most_common_part_names)

Tot: 1598870
Unique: 365088
[('MANIFOLD_SOLID_BREP', 179109),
 ('p 08 001 a fuel tank', 23920),
 ('document', 7943),
 ('1', 6699),
 ('extruded', 5867),
 ('open cr tp2.0', 4867),
 ('solid', 4606),
 ('p 09 001 a engine', 4515),
 ('pcb', 3840),
 ('1p65fa', 3663),
 ('pdp rev 1.0', 3135),
 ('part', 3055),
 ('ramps 1.4', 2805),
 ('none', 2759),
 ('msbr', 2719),
 ('pin', 2690),
 ('shikari', 2481),
 ('part body', 2472),
 ('you08747e893bn9', 2022),
 ('4duino picaso rev1.2 pcb sub part', 1984),
 ('shell', 1968),
 ('extruded 1', 1936),
 ('raspberry pi 3', 1902),
 ('p 09 006 a exhaust left', 1890),
 ('p 09 007 a exhaust right', 1890),
 ('c', 1717),
 ('4dcape 70 rev1.00 outline', 1688),
 ('tire marking', 1608),
 ('tire rear', 1606),
 ('302 1260 wiper stp curved scaled', 1605),
 ('nucleo f411re pcb top designators varsay\\x2\\008d\\x0\\lan', 1596),
 ('treppenstufe', 1542),
 ('shaft', 1509),
 ('open cascade step translator 6.8 1.4', 1432),
 ('vane', 1428),
 ('5972k222', 1392),
 ('cx5p all cat part', 

### BERT Tokenizer Stats

In [134]:
vocab = []
for index, row in tqdm(part_names.iterrows(), total=len(part_names)):
    vocab.append(row['part_names_clean'])

1598870it [01:03, 25325.52it/s]


In [128]:
model_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
part_names_encoded = bert_tokenizer(vocab)

In [148]:
counts = Counter([item for sublist in part_names_encoded.input_ids for item in sublist])
print(f"vocab size: {len(vocab)}")
print(f"tot number of unique tokens: {len(counts)}")
print(f"number of [CLS]: {counts[101]}")
print(f"number of [SEP]: {counts[102]}")
print(f"number of [PAD]: {counts[0]}")
print(f"number of [UNK]: {counts[100]}")

vocab size: 1598870
tot number of unique tokens: 11466
number of [CLS]: 1598870
number of [SEP]: 1598870
number of [PAD]: 0
number of [UNK]: 1


In [149]:
counts.most_common()

[(101, 1598870),
 (102, 1598870),
 (1035, 358218),
 (2692, 296572),
 (1032, 290185),
 (1060, 199377),
 (5024, 188959),
 (7987, 181799),
 (13699, 180983),
 (19726, 179661),
 (2475, 177633),
 (1037, 161834),
 (8889, 159446),
 (1012, 152652),
 (1015, 149923),
 (1052, 146870),
 (2595, 116468),
 (2063, 107025),
 (4002, 103071),
 (2620, 95072),
 (2487, 94270),
 (1016, 84785),
 (2575, 78576),
 (2509, 77260),
 (2629, 74507),
 (2683, 72736),
 (2546, 67722),
 (2581, 66519),
 (2549, 63903),
 (5890, 62980),
 (2361, 59082),
 (1049, 58515),
 (2497, 54092),
 (2094, 53290),
 (1017, 53129),
 (1018, 48058),
 (25604, 47576),
 (2099, 46486),
 (2050, 45927),
 (6185, 43669),
 (1019, 43406),
 (2278, 42307),
 (7382, 39391),
 (1022, 39049),
 (4654, 38687),
 (1020, 38147),
 (16344, 37396),
 (2213, 34670),
 (1014, 34335),
 (1054, 33331),
 (2121, 32496),
 (2340, 32259),
 (5511, 32079),
 (14142, 31141),
 (13936, 30009),
 (11224, 29737),
 (6021, 28904),
 (2102, 28700),
 (1038, 28223),
 (4762, 25946),
 (17914, 25593

In [150]:
for token_id in [i[0] for i in counts.most_common()[:100]]:
    print(bert_tokenizer.decode(token_id))

[ C L S ]
[ S E P ]
_
# # 0
\
x
s o l i d
b r
# # e p
m a n i f o l d
# # 2
a
# # 0 0
.
1
p
# # x
# # e
0 0
# # 8
# # 1
2
# # 6
# # 3
# # 5
# # 9
# # f
# # 7
# # 4
0 1
# # p
m
# # b
# # d
3
4
0 0 1
# # r
# # a
0 2
5
# # c
# # m m
8
e x
6
# # t r
# # m
0
r
# # e r
1 1
0 8
# # 3 0
# # u d e d
s c r e w
0 3
# # t
b
f u e l
# # 8 0
t a n k
v
r e v
s u b
p a r t
0 6
p l a t e
c
p i n
l
# # s
0 4
# # l
0 0 0
l e f t
# # n
1 0
# # 6 0
p i
z
o p e n
r e a r
r i g h t
# # k
s t e p
d
n u t
f
2 0
# # 1 0
# # 1 0 0
# # 2 0
p c
,
t
# # e d
h
# # 1 6
7


In [158]:
most_common_part_names_encoded = bert_tokenizer([item[0] for item in most_common_part_names])

In [160]:
bert_tokenizer.decode(most_common_part_names_encoded['input_ids'])

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

In [172]:
for i, part_name_encoded in enumerate(most_common_part_names_encoded['input_ids']):
    print(f"original: {most_common_part_names[i]} | encoded: {most_common_part_names_encoded['input_ids'][i][1:-1]} | decoded: '{bert_tokenizer.decode(part_name_encoded)[6:-6]}'")

original: ('MANIFOLD_SOLID_BREP', 179109) | encoded: [19726, 1035, 5024, 1035, 7987, 13699] | decoded: 'manifold _ solid _ brep'
original: ('p 08 001 a fuel tank', 23920) | encoded: [1052, 5511, 25604, 1037, 4762, 4951] | decoded: 'p 08 001 a fuel tank'
original: ('document', 7943) | encoded: [6254] | decoded: 'document'
original: ('1', 6699) | encoded: [1015] | decoded: '1'
original: ('extruded', 5867) | encoded: [4654, 16344, 13936] | decoded: 'extruded'
original: ('open cr tp2.0', 4867) | encoded: [2330, 13675, 1056, 2361, 2475, 1012, 1014] | decoded: 'open cr tp2. 0'
original: ('solid', 4606) | encoded: [5024] | decoded: 'solid'
original: ('p 09 001 a engine', 4515) | encoded: [1052, 5641, 25604, 1037, 3194] | decoded: 'p 09 001 a engine'
original: ('pcb', 3840) | encoded: [7473, 2497] | decoded: 'pcb'
original: ('1p65fa', 3663) | encoded: [1015, 2361, 26187, 7011] | decoded: '1p65fa'
original: ('pdp rev 1.0', 3135) | encoded: [22851, 2361, 7065, 1015, 1012, 1014] | decoded: 'pdp r

In [151]:
counts = Counter([item for sublist in most_common_part_names_encoded.input_ids for item in sublist])
print(f"vocab size: {len(vocab)}")
print(f"tot number of unique tokens: {len(counts)}")
print(f"number of [CLS]: {counts[101]}")
print(f"number of [SEP]: {counts[102]}")
print(f"number of [PAD]: {counts[0]}")
print(f"number of [UNK]: {counts[100]}")

vocab size: 1598870
tot number of unique tokens: 285
number of [CLS]: 100
number of [SEP]: 100
number of [PAD]: 0
number of [UNK]: 0


In [173]:
# TODO: Look at word frequency, not just part names
# TODO: see whether MINFORLD_BREP_OBDY shows up with custom names, otherwise drop.

bert_tokenizer.tokenize('This tree is 1253 years old.')


['this', 'tree', 'is', '125', '##3', 'years', 'old', '.']