### Data Preparation for DeepWeeds

This notebook prepares the DeepWeeds data on a local machine. It assumes that the DeepWeeds data has already been downloaded and saved in the "data" folder. The download is available at https://github.com/geraldmc/torch-draft-final_project/archive/refs/heads/main.zip. After unzipping, the data folder must contain the following:

- labels (folder containing all the labels for the raw images)
- images (folder containing all the raw images)



In [1]:
#imports
import os
import pandas as pd
import glob 

In [2]:
#import the config files

# Import the project-local configuration module. Later cells read path
# settings from it (params.DATA_PATH, params.IMAGE_PATH, params.IMG_DIRECTORY,
# params.IMG_TRAIN_PATH, params.IMG_VAL_PATH, params.IMG_TEST_PATH).
try:
    import conf.params as params
except ImportError as exc:
    # Do not swallow the failure silently: without `params` every later cell
    # dies with an unexplained NameError. Warn here so the root cause is
    # visible at the point where it actually happened.
    import warnings

    warnings.warn(
        "Could not import conf.params (%s). Cells that reference `params` "
        "will fail; make sure the `conf` package is importable from the "
        "notebook's working directory." % exc
    )

In [5]:
# Load the master label table (labels.csv) from the "labels" folder that
# lives under params.DATA_PATH, then preview its first rows.
LABEL_PATH = os.path.join(params.DATA_PATH, 'labels')
labels_csv = os.path.join(LABEL_PATH, 'labels.csv')
label_df = pd.read_csv(labels_csv)
label_df.head()

Unnamed: 0,Filename,Label,Species
0,20160928-140314-0.jpg,0,Chinee apple
1,20160928-140337-0.jpg,0,Chinee apple
2,20160928-140731-0.jpg,0,Chinee apple
3,20160928-140747-0.jpg,0,Chinee apple
4,20160928-141107-0.jpg,0,Chinee apple


In [6]:
# Combine the per-fold train / val / test label CSVs into one dataframe per
# split. The files are looked up in LABEL_PATH (defined in the previous cell)
# so that this cell and the labels.csv cell agree on where the labels live.

def _load_split(prefix):
    """Load and concatenate every ``<prefix>*.csv`` file found in LABEL_PATH.

    Parameters
    ----------
    prefix : str
        File-name prefix of the split, e.g. ``"train"``, ``"val"``, ``"test"``.

    Returns
    -------
    tuple
        ``(pattern, paths, frame)``: the glob pattern that was used, the sorted
        list of matching files, and the concatenated dataframe.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern (pandas would otherwise raise an
        opaque "No objects to concatenate" error).
    """
    pattern = os.path.join(LABEL_PATH, prefix + "*.csv")
    # glob returns matches in arbitrary order; sort so that the row order of
    # the combined dataframe is reproducible from run to run.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError("No label files match %r" % pattern)
    frame = pd.concat(map(pd.read_csv, paths), ignore_index=True)
    return pattern, paths, frame


joined_val, val_files, val_df = _load_split("val")
joined_train, train_files, train_df = _load_split("train")
joined_test, test_files, test_df = _load_split("test")

# In the paper, each fold contains 10,505 samples from the total.
# Random sub-sampling is currently disabled, so the sample_* names are plain
# aliases of the full dataframes. To re-enable it, replace the right-hand
# sides with e.g. train_df.sample(n=10505), val_df.sample(n=3502) and
# test_df.sample(n=3502).
sample_train_df = train_df
sample_val_df = val_df
sample_test_df = test_df
sample_train_df.head()

Unnamed: 0,Filename,Label
0,20171109-175921-2.jpg,5
1,20170714-142019-3.jpg,1
2,20170718-101402-2.jpg,0
3,20170126-095456-0.jpg,1
4,20170913-110647-1.jpg,3


In [12]:
# Collect the bare file names (no directory part) of everything found under
# params.IMAGE_PATH, including any sub-folders that os.walk descends into.
import shutil

files = [
    name
    for _root, _subdirs, names in os.walk(params.IMAGE_PATH)
    for name in names
]

def copy_files(df, filepath, test=False, filenames=None, src_dir=None):
    """Copy the images listed in ``df`` from the raw image folder to ``filepath``.

    Only files that appear in ``df.Filename`` are copied; every other file in
    ``filenames`` is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Split table with a ``Filename`` and a ``Label`` column.
    filepath : str
        Destination root folder.
    test : bool, default False
        If False, each image is written to ``<filepath>/<label>/<name>``.
        If True, each image is written directly to ``<filepath>/<name>``.
    filenames : iterable of str, optional
        Candidate file names to consider. Defaults to the notebook-level
        ``files`` list.
    src_dir : str, optional
        Folder the images are read from. Defaults to ``params.IMG_DIRECTORY``.

    Returns
    -------
    None
    """
    if filenames is None:
        filenames = files
    if src_dir is None:
        src_dir = params.IMG_DIRECTORY

    labels = dict(zip(df.Filename, df.Label))
    for name in filenames:
        # Skip images that do not belong to this split. This check must also
        # apply in test mode; otherwise every image in the source folder
        # ends up in the test folder.
        if name not in labels:
            continue
        dst_dir = filepath if test else os.path.join(filepath, str(labels[name]))
        # copyfile does not create missing parent folders.
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copyfile(os.path.join(src_dir, name), os.path.join(dst_dir, name))

# Populate the three image folders. The train and validation calls use the
# label-aware mode (test=False); the test call uses test=True.
copy_files(df=sample_train_df, filepath=params.IMG_TRAIN_PATH, test=False)
copy_files(df=sample_val_df, filepath=params.IMG_VAL_PATH, test=False)
copy_files(df=sample_test_df, filepath=params.IMG_TEST_PATH, test=True)

In [14]:
# Number of rows per label in the combined test dataframe
test_df.Label.value_counts()

8    9106
0    1126
6    1074
1    1063
4    1062
2    1031
3    1022
7    1016
5    1009
Name: Label, dtype: int64