In [57]:
import glob
import os
import pickle
import sys

# data prep tools
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedKFold

In [103]:
# Root folder under which every path below is resolved.
DATA_DIR = '/home/saa/saa-workspaces/dogbreed-kaggle/data/'


def _under_data_dir(relative_path):
    """Return `relative_path` joined onto DATA_DIR."""
    return os.path.join(DATA_DIR, relative_path)


# Label tables: the raw one, the one rewritten with file paths, and the
# numerically encoded one.
DOGBREED_LABELPATH = _under_data_dir('dogbreed/raw/labels.csv')
DOGBREED_LABELPAIRPATH = _under_data_dir('dogbreed/raw/labelpairs.csv')
DOGBREED_ENCLABELPATH = _under_data_dir('dogbreed/raw/labels_enc.csv')

# Output directory for the k-fold split listings.
DOGBREED_SPLITDIR = _under_data_dir('dogbreed/splits/')

# Directory joined onto every id to build the data paths.
DOGBREED_TRAINDIR = _under_data_dir('dogbreed/raw/train/')

In [123]:
# Load the raw labels table (comma is already read_csv's default separator)
# and preview its first five rows.
labels_df = pd.read_csv(DOGBREED_LABELPATH)
labels_df.head(n=5)

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [115]:
def id_to_path(datadir, labels_file, id_colname, outpath):
    """Turn the id column of a labels CSV into file paths and save the result.

    Every value of column `id_colname` is joined onto `datadir` with
    `os.path.join`, the column is renamed to 'paths', and the table is written
    to `outpath` without the index column.

    Arguments:
        - datadir (str): directory prepended to every id
        - labels_file (str): CSV file to read
        - id_colname (str): name of the column holding the ids
        - outpath (str): CSV file to write
    """
    table = pd.read_csv(labels_file)

    # replace each id by "<datadir>/<id>"
    table[id_colname] = [os.path.join(datadir, name) for name in table[id_colname]]

    # the column now holds paths, so name it accordingly before saving
    renamed = table.rename(columns={id_colname: 'paths'})
    renamed.to_csv(outpath, index=False)

In [116]:
# Rewrite the 'id' column of the raw labels file as paths under the train
# directory and save the result as the label-pair file.
id_to_path(
    datadir=DOGBREED_TRAINDIR,
    labels_file=DOGBREED_LABELPATH,
    id_colname='id',
    outpath=DOGBREED_LABELPAIRPATH,
)

In [117]:
from sklearn.preprocessing import LabelEncoder

def num_encoding_labels_file(labels_file, label_name, outpath):
    """Add an integer-encoded "labels" column to a labels CSV.

    The column `label_name` of `labels_file` is encoded with a LabelEncoder;
    the codes are stored in a new "labels" column and the table is written to
    `outpath` without the index column. The class -> code mapping is pickled
    next to it, in a file named after `outpath` with its extension replaced by
    "_map.pickle"; that pickle path is printed.

    Arguments:
        - labels_file (str): CSV file to read
        - label_name (str): column holding the labels to encode
        - outpath (str): CSV file to write

    Returns:
        (DataFrame, dict): the table including the "labels" column, and the
        class -> code mapping.
    """
    table = pd.read_csv(labels_file)
    raw_labels = table[label_name]

    # learn the classes, then translate classes and labels into numeric codes
    encoder = LabelEncoder()
    encoder.fit(raw_labels)
    class_codes = encoder.transform(encoder.classes_)
    mapping = dict(zip(encoder.classes_, class_codes))
    table["labels"] = encoder.transform(raw_labels)

    # save the encoded table
    table.to_csv(outpath, index=False)

    # save the mapping alongside it: <outpath without extension>_map.pickle
    stem = os.path.splitext(outpath)[0]
    picklefile = "%s_map.pickle" % stem
    print(picklefile)
    with open(picklefile, 'wb') as fp:
        pickle.dump(mapping, fp, protocol=pickle.HIGHEST_PROTOCOL)

    return table, mapping

In [118]:
# Encode the "breed" column of the label-pair file; keep both the encoded
# table and the class -> code mapping.
enclbl_df, mapping = num_encoding_labels_file(
    labels_file=DOGBREED_LABELPAIRPATH,
    label_name="breed",
    outpath=DOGBREED_ENCLABELPATH,
)

/home/saa/saa-workspaces/dogbreed-kaggle/data/dogbreed/raw/labels_enc_map.pickle


In [124]:
def _skf_path_labels(all_paths, all_labels, outdir, out_prefix="", n_splits=5):
    """Write stratified k-fold train/test listings of (path, label) pairs.

    For every fold i produced by `StratifiedKFold(n_splits=n_splits)`, two text
    files are written into `outdir`:
    "<out_prefix>train_split_<i>.txt" and "<out_prefix>test_split_<i>.txt".
    Each line of a file is "<path> <label>".

    Arguments:
        - all_paths: data paths; must support indexing with an array of
          integer positions (e.g. a numpy array)
        - all_labels: labels aligned with `all_paths`, used for stratification;
          same indexing requirement
        - outdir (str): directory receiving the split files; created when it
          does not exist yet
        - out_prefix (str): prefix prepended to every split file name
        - n_splits (int): number of folds (k)
    """
    def _write_pairs(filename, indices):
        # one "<path> <label>" line per sample selected by `indices`
        with open(os.path.join(outdir, filename), 'w') as fp:
            for filepath, label in zip(all_paths[indices], all_labels[indices]):
                fp.write("{} {}\n".format(filepath, label))

    # stratify dataset
    skf = StratifiedKFold(n_splits=n_splits)

    # An empty outdir means "current directory": nothing to create then.
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    for i, (train_idx, test_idx) in enumerate(skf.split(all_paths, all_labels)):
        # train part of fold i, then its held-out test part
        _write_pairs("{}train_split_{}.txt".format(out_prefix, i), train_idx)
        _write_pairs("{}test_split_{}.txt".format(out_prefix, i), test_idx)

In [120]:
def split_path_label_pairs(labels_file, path_colname, label_name, outdir, out_prefix="", separator=",", has_header=True, n_splits=5):
    """Split path-label pairs into k stratified folds of train and test sets.

    Reads the labels file, takes the path column and the label column, and
    hands them to `_skf_path_labels`, which writes one train and one test text
    file per fold into `outdir`.

    Arguments:
        - labels_file (str): filename/path of the labels file
        - path_colname: column holding the data paths
        - label_name: column holding the labels used for stratification
        - outdir (str): directory where the split text files are saved
        - out_prefix (str): if provided, prefixes every split file name
        - separator (char): separator used in labels_file
        - has_header (bool): whether the 1st row in labels_file is a header.
          When False, pandas numbers the columns 0, 1, ..., so `path_colname`
          and `label_name` must be those integer positions.
        - n_splits (int): number of splits (k)
    """
    # header=0 is what read_csv infers by default; None disables header parsing
    labels_df = pd.read_csv(
        labels_file,
        sep=separator,
        header=0 if has_header else None,
    )
    all_paths = labels_df[path_colname].to_numpy()
    all_labels = labels_df[label_name].to_numpy()

    # forward the caller's options instead of silently using the defaults
    _skf_path_labels(all_paths, all_labels, outdir, out_prefix=out_prefix, n_splits=n_splits)

In [125]:
# Write the train/test split listings for the encoded labels file.
split_path_label_pairs(
    labels_file=DOGBREED_ENCLABELPATH,
    path_colname='paths',
    label_name='labels',
    outdir=DOGBREED_SPLITDIR,
)