In [90]:
import os
import pandas as pd
import shutil

from tqdm import tqdm
from shutil import copyfile as copy

# Load the stock-image metadata CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Final Stock Images.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,Image Link,Tags,Likes,Comments,Path
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55,Images/clouds-7050884__480.jpg
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20,Images/bird-7117346__340.jpg
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106,Images/sea-7039471__340.jpg
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11,Images/cherry-blossoms-7110279__340.jpg
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15,Images/cape-marguerite-7121992__340.jpg


## Removing Unwanted Columns

In [21]:
# Keep only the columns needed to build the dataset (Tags and Path).
# drop(..., errors='ignore') returns a new frame and tolerates columns that
# are already gone, so this cell can be re-run without raising KeyError
# (the original `del df[...]` statements failed on a second execution).
df = df.drop(columns=['Image Link', 'Comments', 'Likes'], errors='ignore')

In [22]:
# Confirm which columns remain after the removal step.
df.head(5)

Unnamed: 0,Tags,Path
0,"Clouds, Sky, Atmosphere, Blue Sky",Images/clouds-7050884__480.jpg
1,"Bird, Ornithology, Hummingbird",Images/bird-7117346__340.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Images/sea-7039471__340.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Images/cherry-blossoms-7110279__340.jpg
4,"Cape Marguerite, Flower, Plant",Images/cape-marguerite-7121992__340.jpg


## Finding all the tags

In [23]:
# Flatten every comma-separated tag string into a single list of trimmed
# tag names (duplicates kept, row order preserved).
tags = [piece.strip() for entry in df['Tags'] for piece in entry.split(',')]

# Collapse duplicates; the resulting order is whatever the set yields.
unique_tags = list(set(tags))

## Creating Folders for each Tag

In [24]:
# Create one folder per tag under Dataset/.
# The parent is created first: without it every os.mkdir below would fail,
# and the original bare `except: pass` hid that completely.
os.makedirs('Dataset', exist_ok=True)

# (tag, exception) pairs for tags that could not be turned into a folder.
# Inspect this list after the loop instead of losing the failures silently.
failed_tags = []

for tag in tqdm(unique_tags):
    try:
        os.mkdir('Dataset/' + tag)
    except FileExistsError:
        # Folder is already there (e.g. the cell was run before); nothing to do.
        pass
    except (OSError, ValueError) as exc:
        # The tag is not usable as a folder name on this filesystem.
        failed_tags.append((tag, exc))

100%|███████████████████████████████████████████████████████████████████████████| 8015/8015 [00:00<00:00, 24220.81it/s]


## Copying each image into its tag folders to create the dataset

In [25]:
# Copy every image into the folder of each tag it carries.
# `error` counts the (image, tag) copies that failed, e.g. because the source
# image is missing or the tag folder could not be created earlier.
error = 0

# Columns are read by name rather than by position, so the loop no longer
# depends on the column-removal cell having been run exactly once, and the
# global `tags` list built earlier is no longer overwritten here.
for tag_string, src in tqdm(zip(df['Tags'], df['Path']), total=len(df)):

    # File name is the last '/'-separated component of the stored path.
    filename = src.split('/')[-1]

    for tag in tag_string.split(','):
        dst = 'Dataset/' + tag.strip() + '/' + filename

        try:
            copy(src, dst)
        except OSError:
            # Best effort: count the failure and continue with the next copy.
            # Unlike a bare `except`, this lets KeyboardInterrupt and genuine
            # programming errors surface.
            error += 1

100%|██████████████████████████████████████████████████████████████████████████████| 9104/9104 [03:23<00:00, 44.70it/s]


## Checking the folders

In [28]:
# Snapshot the names of all entries currently present under Dataset/.
folders = os.listdir(path='Dataset')

# How many entries were found.
len(folders)

7964

In [44]:
# Count how many entries each tag folder contains.
# `folder_` and `freq` are parallel lists: freq[k] is the entry count of folder_[k].
folder_ = []
freq = []

for folder in folders:

    folder_path = 'Dataset/' + folder

    # Entries of Dataset/ that are not directories cannot be listed; skip them
    # explicitly instead of relying on a bare `except`.
    if not os.path.isdir(folder_path):
        continue

    try:
        n_items = len(os.listdir(folder_path))
    except OSError:
        # Unreadable folder (permissions, removed meanwhile): leave it out.
        continue

    # Append to both lists only after the listing succeeded so they stay aligned.
    folder_.append(folder)
    freq.append(n_items)

In [48]:
# Tabulate folder name against the number of entries it holds.
df_2 = pd.DataFrame({
    'Folders': folder_,
    'Frequency': freq,
})

In [49]:
# Preview the first five folder/frequency rows.
df_2.head(5)

Unnamed: 0,Folders,Frequency
0,&quot;All we are saying is give peace a chance...,1
1,&quot;Gray wagtail&quot; found in rivers and m...,1
2,1891. In 1906 she was en route from Peru to Ge...,1
3,1925,1
4,1950S,1


## Top 10 Folders with the Most Images

In [56]:
# Rank folders by entry count, largest first, and show the first ten.
(
    df_2
    .sort_values('Frequency', ascending=False)
    .head(10)
)

Unnamed: 0,Folders,Frequency
4744,nature,838
224,Animal,608
2742,Flower,548
725,Bird,517
2758,Flowers,505
6094,Sea,305
5346,Plant,283
6775,Sunset,275
2809,Forest,261
7222,Trees,242


## Top 10 Folders with the Fewest Images

In [57]:
# Rank folders by entry count, smallest first (ascending is the default),
# and show the first ten.
(
    df_2
    .sort_values('Frequency')
    .head(10)
)

Unnamed: 0,Folders,Frequency
2385,emergiendo,0
6471,South Australia. The light housekeeper sold ve...,0
2046,Der Sonne entgegen,0
6977,The Camellia japonica is a very common tree sp...,0
2002,Das versteckte Haus,0
2557,Fantasmagòrica vista de los Reyes Magos de Ori...,0
7037,This shot was taken at Kamo Aquarium in Yamaga...,0
423,Australia The Windorah solar dishes supply 100...,0
2280,Each candle is a soul,0
3433,High in the French alps. Climbers walk the ste...,0


## Folders with at least 10 Images

In [70]:
# Keep only the folders whose entry count is 10 or more.
df_2.loc[df_2['Frequency'].ge(10)]

Unnamed: 0,Folders,Frequency
22,A Book,12
44,Abstract,38
74,Add a description (optional),14
80,Adult,13
83,Advent,11
...,...,...
7877,Yellow,21
7886,Yellow Flower,37
7887,Yellow Flowers,21
7914,Young,23


## Moving folders with fewer than 10 Images out of the dataset (into Temp/)

In [91]:
# Move every folder holding fewer than 10 entries out of Dataset/ into Temp/.
# The folders are relocated, not deleted, so the step can be undone by hand.

# Make sure the destination parent exists so the move does not depend on
# Temp/ having been created manually beforehand.
os.makedirs('Temp', exist_ok=True)

for folder in tqdm(df_2.loc[df_2['Frequency'] < 10, 'Folders']):

    src = 'Dataset/' + folder
    dst = 'Temp/' + folder

    # df_2 is a snapshot taken earlier: on a re-run the folder may already
    # have been moved, in which case shutil.move would raise. Skip it.
    if not os.path.isdir(src):
        continue

    shutil.move(src, dst)

100%|█████████████████████████████████████████████████████████████████████████████| 7353/7353 [00:10<00:00, 731.69it/s]
