In [1]:
import os, os.path
import pandas as pd
import json

In [2]:
def get_names(path):
    return [name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))]

In [3]:
def process_df(df, clean_histograms):
    clean_df = df[df.model.isin(clean_histograms)] 
    serie = pd.DataFrame(clean_df.type.apply(lambda x: x.split('.')[0]))
    clean_df.pop('type')
    return pd.concat([clean_df, serie], axis=1)

In [4]:
models_path = 'models'
histograms_path = 'histograms'

In [5]:
models_names = get_names(models_path)
models_names[:10]

['6f284edb-aad7-4177-8b34-47558216c76f.stl',
 'a4aa7936-4c4e-4a56-bcaa-242ec4c876c8.stl',
 'b8cc7127-26c1-48a6-a980-ffd31b8c6442.stl',
 '39ecf69c-0139-4367-99d3-9d52d104236a.stl',
 '7ddfa22c-b938-46ef-9e22-f55b72c680ac.stl',
 '703d1136-08d1-4adf-bae1-6d47f1b86e1b.stl',
 '88433f18-380d-4631-b349-7f4a55e3e1eb.stl',
 '0e5c131d-0824-408b-b93a-4dcaa40c2c0a.stl',
 '23d777c5-0efd-4303-be04-d83761550eee.stl',
 'f8a74137-feee-437f-bae6-5d4831111eb2.stl']

In [6]:
histograms_names = get_names(histograms_path)
histograms_names[:10]

['118f6321-4e9c-42d6-837a-9e713a3a4c88.json',
 '1c345b16-d820-4b3d-b68b-98b6d6a314c0.json',
 '1fc8b261-6415-40a2-8559-3b92ed284036.json',
 '2b63d944-6efb-4907-9245-0dd6147f3338.json',
 '170c33c2-946f-4b1b-ac60-abecf524203a.json',
 '09469744-9517-46b0-bc5b-b0e4f1b0a3be.json',
 '071c7c20-4ab2-441d-bc67-c07e6b1ccf94.json',
 '11f99e6d-4219-4068-a76c-925a6d3ee8c4.json',
 '2c38df05-05d5-42a6-b5b5-2469b038b270.json',
 '09470f5e-b76f-4c10-bc81-1683e1b5134e.json']

In [7]:
clean_histograms = [name.split('.')[0] for name in histograms_names]
clean_histograms[:10]

['118f6321-4e9c-42d6-837a-9e713a3a4c88',
 '1c345b16-d820-4b3d-b68b-98b6d6a314c0',
 '1fc8b261-6415-40a2-8559-3b92ed284036',
 '2b63d944-6efb-4907-9245-0dd6147f3338',
 '170c33c2-946f-4b1b-ac60-abecf524203a',
 '09469744-9517-46b0-bc5b-b0e4f1b0a3be',
 '071c7c20-4ab2-441d-bc67-c07e6b1ccf94',
 '11f99e6d-4219-4068-a76c-925a6d3ee8c4',
 '2c38df05-05d5-42a6-b5b5-2469b038b270',
 '09470f5e-b76f-4c10-bc81-1683e1b5134e']

In [8]:
choosen_models = [name for name in models_names if name.split('.')[0] in clean_histograms]
choosen_models[:10]

['0e5c131d-0824-408b-b93a-4dcaa40c2c0a.stl',
 '23d777c5-0efd-4303-be04-d83761550eee.stl',
 '319a8dd3-dc5d-4516-b3f0-06da9ddd4908.stl',
 '0b72c5a6-58e4-48ed-aafc-082186b23a8e.stl',
 '2039b446-a596-4bcd-80c9-f7a2b34dedb0.stl',
 '10e288de-0298-4e75-97a3-5ab5ae53d949.stl',
 '09470f5e-b76f-4c10-bc81-1683e1b5134e.stl',
 '1616f21b-03c4-403c-8523-c6fa0cc669b5.stl',
 '06b82c04-f6f4-4e86-af01-6c0783cf34e6.stl',
 '038658d9-dd86-4737-bb36-6d3c94044333.stl']

In [9]:
len(choosen_models)

2137

#### clean 

In [12]:
!pwd

/home/galdmitry/CODE_REPOS/HSE_prod_stories/HW_04/D


In [10]:
def remove_models(path, choosen_models):
    files =  os.listdir(path)
    print(f"files before removing: {len(files)}")
    
    for name in files:
        if name not in choosen_models:
            os.remove(f"{path}/{name}")
            
    print(f"files after removing: {len(os.listdir(path))}")

In [13]:
remove_models("models_clean", choosen_models)

files before removing: 2137
files after removing: 2137


In [14]:
df = pd.read_csv("labels.csv")
clean_df = process_df(df, clean_histograms)
clean_df.head()

Unnamed: 0,model,type
1,30a730ca-c466-4e8c-b00b-bf7b317495e2,Cone
7,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere
12,347130e7-e4ad-4714-a322-dadd515433f7,Torus
17,1346fe51-b6f1-4036-a2c4-cdf84abf6c85,Cone
19,1715abb8-f4e4-4eb3-a030-6c5b0083f1f0,Cylinder


In [15]:
clean_df

Unnamed: 0,model,type
1,30a730ca-c466-4e8c-b00b-bf7b317495e2,Cone
7,1b220e0b-d7fa-40a0-8cfd-930a3228c668,Sphere
12,347130e7-e4ad-4714-a322-dadd515433f7,Torus
17,1346fe51-b6f1-4036-a2c4-cdf84abf6c85,Cone
19,1715abb8-f4e4-4eb3-a030-6c5b0083f1f0,Cylinder
...,...,...
9972,136a3a19-9b09-4e4b-8841-facf47b9abc5,Torus
9989,1e220704-e5d5-4596-864a-2fd3ab48fcbe,Sphere
9991,14ae3c6a-867e-4814-821b-1993979f4a23,Sphere
9992,222bdb7d-32e4-4499-988f-b91d0c7018c3,Cone


In [16]:
clean_df.type.unique()

array(['Cone', 'Sphere', 'Torus', 'Cylinder', 'Cube'], dtype=object)

In [19]:
clean_df.to_csv("labels_clean.csv", index=False)