In [57]:
import pandas as pd

# Location of the raw export and the number of rows used for this experiment.
DATA_PATH = 'data/pandas_df2.csv'
N_ROWS = 1000

# Read every column as text and stop after N_ROWS rows; `nrows` avoids parsing
# the whole file only to discard everything after the first N_ROWS rows.
df = pd.read_csv(DATA_PATH, dtype='str', nrows=N_ROWS)
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Product_ID,Product_UserTypeID,AT_MaraBshmCax3D,AT_MaraNtgew,AT_MaraVolum,AT_SAPID,AT_MaraMatkl,AT_MaraNumtp,...,AT_MaraLabor,AT_MaraExtwg,AT_MaraBismt,AT_MaraMeins,AT_MaraMtart,AT_MaraGroes,AT_LeadingSystem,AT_MaraNtgew_UnitID,AT_MaraVolum_UnitID,AT_MaraBrgew_UnitID
0,1,9001422279 - Lens Head Screw M4x8,MAT_928319,Material,60100004395930.0,1.3,0.0,9001422279,H002 - Customized screws,,...,"H10 - FCGB Inserra, Bortolo",,5750172759.0,piece(s),Materials which are externally procured,,SAP,unece.unit.GRM,unece.unit.DMQ,unece.unit.GRM
1,2,9001730689 - CP body laser printed WT45HV14FR,MAT_928320,Material,,0.318,0.0,9001730689,R239 - xxx - do not use anymore,prefabricated parts,...,WD0 - FLCD Default (FDW),,9001452855.0,piece(s),Materials which can be potentially produced in...,,SAP,unece.unit.KGM,unece.unit.CMQ,unece.unit.KGM
2,3,"9001730699 - USER MA. SE (WT45HV14FR), FR",MAT_928323,Material,,0.02,0.0,9001730699,X810 - Printed material (general),,...,WD0 - FLCD Default (FDW),,9001452871.0,piece(s),Materials which can be potentially produced in...,,SAP,unece.unit.KGM,unece.unit.CMQ,unece.unit.KGM
3,4,9001731054 - fascia panel GV650A.UC Int White...,MAT_928324,Material,60100007114130.0,0.0,0.0,9001731054,R250 - Injected parts (general),,...,"G8A - FDG Neukirchner,Daniel",,8001186041.0,piece(s),Materials which can be potentially produced in...,,SAP,unece.unit.GRM,unece.unit.CMQ,unece.unit.GRM
4,5,9001762412 - Cable Harness operat. EE 590 SP....,MAT_928325,Material,60100008886908.0,26.8,0.0,9001762412,L592 - IDC+IDC jumpers,,...,"VBI - FCGB Schuhmacher, Jochen",,,piece(s),Materials which are externally procured,L 590MM,SAP,unece.unit.GRM,unece.unit.DMQ,unece.unit.GRM


In [58]:
import re

from nltk.stem import PorterStemmer

# Columns whose text is merged into one document per row.
TEXT_COLUMNS = ['Name', 'AT_MaraMatkl', 'AT_MaraMaktx', 'AT_MaraBrgew', 'AT_MaraMtart', 'AT_MaraLabor']

stemmer = PorterStemmer()


def clean_text(text):
    """Normalise one merged document into a space-separated string of stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space
         (this removes digits and punctuation, not only punctuation);
      2. drop single-character tokens;
      3. lower-case and Porter-stem each remaining token;
      4. drop repeated tokens, keeping the first occurrence of each.

    NOTE(review): because digits are stripped *before* tokenising, a code such
    as "WT45HV14FR" is split into the fragments "wt hv fr" instead of being
    dropped as a whole. If the intent was to discard tokens containing digits,
    that filter has to run before step 1 -- confirm the desired behaviour.

    Parameters
    ----------
    text : str
        Raw text of one row (all selected columns joined by spaces).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be an empty string.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    stems = [stemmer.stem(token.lower()) for token in letters_only.split() if len(token) > 1]
    # dict.fromkeys keeps first-occurrence order while removing duplicates.
    return ' '.join(dict.fromkeys(stems))


# Keep only the columns we deem necessary, merge them into one string per row
# and clean that string. Missing values are replaced by '' first so they do
# not end up in the documents as the literal token "nan".
# NOTE: `df` is rebound from the raw DataFrame to a Series of cleaned strings
# (later cells rely on that name), so this cell must not be run twice without
# re-running the loading cell first.
df = (
    df[TEXT_COLUMNS]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
    .map(clean_text)
)

In [59]:
# Preview the first rows of `df`, which at this point holds one cleaned text string per row.
df.head()


0    len head screw custom materi which are extern ...
1    cp bodi laser print wt hv fr xxx do not use an...
2    user ma se wt hv fr print materi gener which c...
3    fascia panel gv uc int white inject part gener...
4    cabl har operat ee sp idc jumper materi which ...
dtype: object

In [60]:
# Turn the cleaned documents into a plain Python list (the form passed to
# BERTopic below) and show the first ten entries as a sanity check.
df_list = df.tolist()
df_list[:10]

['len head screw custom materi which are extern procur fcgb inserra bortolo',
 'cp bodi laser print wt hv fr xxx do not use anymor materi which can be potenti produc intern wd flcd default fdw',
 'user ma se wt hv fr print materi gener which can be potenti produc intern wd flcd default fdw',
 'fascia panel gv uc int white inject part gener materi which can be potenti produc intern fdg neukirchn daniel',
 'cabl har operat ee sp idc jumper materi which are extern procur vbi fcgb schuhmach jochen',
 'cabl har operat ee mm sp idc jumper materi which are extern procur vbi fcgb schuhmach jochen',
 'wire yellow green dummi materi which can be potenti produc intern fcgl schmitt benoit',
 'wire yellow green dummi materi which can be potenti produc intern fcgl schmitt benoit',
 'wire yellow green dummi materi which can be potenti produc intern fcgl schmitt benoit',
 'dough hook mum compl spare part wire good materi which can be potenti produc intern cs fne default']

In [61]:
from bertopic import BERTopic

# Build a topic model configured for multilingual input and fit it on the
# cleaned documents in a single step.
# NOTE(review): no random seed is set anywhere, so topic assignments may
# differ between runs -- confirm whether reproducibility is required.
topic_model = BERTopic(
    language="multilingual",
)
# `topics` / `probs` are the two values returned by fit_transform.
topics, probs = topic_model.fit_transform(df_list)

In [62]:
# Show the per-topic summary table (Topic id, document Count and generated Name).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,153,-1_dummi_which_materi_fne
1,0,44,0_steel_wire_stainless_made
2,1,44,1_sotlar_saso_mmr_basic
3,2,41,2_hing_dnak_kj_top
4,3,35,3_print_flcn_wax_klosn
5,4,32,4_box_corrug_fold_board
6,5,31,5_slider_anton_kriznik_gener
7,6,28,6_paint_coil_sheet_coat
8,7,28,7_electron_unit_incl_em
9,8,26,8_chopper_subcontract_assembl_saso


In [11]:
# topic_model.get_document_info(df_list)[:10]

In [10]:
# If the plot fails with "No renderer could be found for mimetype application/vnd.plotly.v1+json", install the "Jupyter Notebook Renderers" extension.
# topic_model.visualize_topics()

In [63]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
def linkage_function(x):
    """Cluster `x` with SciPy single linkage, reordering leaves optimally.

    `x` is whatever BERTopic hands to its linkage callback (presumably the
    topic distance matrix -- verify against the BERTopic documentation).
    """
    return sch.linkage(x, method='single', optimal_ordering=True)


# Compute the topic hierarchy over the same documents the model was fitted on,
# using the single-linkage callback defined above.
hierarchical_topics = topic_model.hierarchical_topics(
    df_list,
    linkage_function=linkage_function,
)

100%|██████████| 38/38 [00:00<00:00, 173.58it/s]


In [64]:
# Plot the topic hierarchy, reusing the `hierarchical_topics` result computed in the previous cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)