# Create Scalar Feature Dataset + Send to Pickle

The following script builds a scalar feature dataset for all of the training and testing data, using the feature functions defined in the feature_utils file. We chose this approach so that the processing and featurization could be run once, on the most powerful laptop on the team. Once the result is saved as a pickle file, which is a serialized binary representation of a pandas dataframe, other team members can simply import the pickle and run inference without having to reprocess the data every time. This allowed us to explore models in parallel with confidence, knowing that we were all operating on the same processed training and testing datasets.

In [4]:
# base data sci libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# image processing libraries
from PIL import Image

# Seed NumPy's global random number generator so repeated runs draw the same numbers.
np.random.seed(23)

# Default every matplotlib figure to 4x4 inches rendered at 150 DPI.
plt.rcParams.update({'figure.figsize': (4, 4), 'figure.dpi': 150})

from utils.feature_utils import create_visual_vocab, extract_bovw_features, extract_hog_features, extract_wavelet_features, extract_log_features, extract_normals_features, extract_gabor_features


Run the parser below, which parses the raw image data and generates several arrays. These arrays are then turned into a DataFrame and written to a pickle file further down.

### Parse Data to DF + Add In Features

In [13]:
def parse_all(folder_path, pretrained_kmeans=None):
    """
    Build one feature vector per ``.png`` image found in ``folder_path``.

    File names are split on ``_``: the first field is used as the image's
    label, and the second field is concatenated with the fourth field (minus
    everything from its first ``.``) to form the image's unique ID. A ``.png``
    name with fewer than four fields raises ``IndexError``.

    Each image is loaded as grayscale. The bag-of-visual-words vector is
    computed on the raw pixel array; every other feature is computed on the
    standardized (zero-mean, unit-variance) array.

    Args:
        folder_path (str): Directory containing the ``.png`` images.
        pretrained_kmeans: Visual vocabulary to reuse, e.g. the fourth value
            returned by an earlier call on the training folder. When ``None``
            a new one is built with ``create_visual_vocab(folder_path)``.

    Returns:
        X (np.array): One row per image. Each row holds, in order, the log,
            normals and gabor features, five HOG statistics (mean, sum,
            variance, skew, kurtosis), four wavelet statistics (mean/variance
            of cA, mean/variance of cD) and then the bag-of-visual-words
            vector. (Assumes the log/normals/gabor helpers each return a
            scalar -- confirm in feature_utils.)
        Y (np.array): Categorical label for each image.
        unique_ids (np.array): Image IDs.
        kmeans_fitted: The visual vocabulary that was used (either
            ``pretrained_kmeans`` or the newly created one), so it can be
            passed to a later call.

    Note:
        Rows follow the order returned by ``os.listdir``, which is arbitrary.
    """
    image_vectors = []  # one feature vector per image
    labels = []  # one label per image
    ids = []  # one unique ID per image

    # Compare against None explicitly: truth-testing the vocabulary object is
    # not guaranteed to work (a NumPy array, for instance, raises on bool()).
    if pretrained_kmeans is not None:
        kmeans_fitted = pretrained_kmeans
    else:
        kmeans_fitted = create_visual_vocab(folder_path)

    for filename in os.listdir(folder_path):
        if not filename.endswith(".png"):
            continue

        parts = filename.split('_')
        fab_type = parts[0]
        id1 = parts[1]
        id2 = parts[3].split('.')[0]  # Remove .png extension
        unique_id = id1 + id2

        # Image.open keeps the file open until the image is closed, so use a
        # context manager; np.array copies the pixels out before it closes.
        with Image.open(os.path.join(folder_path, filename)) as image:
            img_array = np.array(image.convert('L'))

        # sift expects a particular type of image so this needs to be done
        # before the image is standardized for the other features. The output
        # vector is between 0 and 1 and should not impact PCA
        bovw_feature_vector = extract_bovw_features(img_array, kmeans_fitted)

        # Standardize the image to zero mean and unit variance (z-score).
        # A constant image has zero spread; dividing by 1 instead of 0 turns
        # it into all zeros rather than NaNs.
        pixel_std = np.std(img_array)
        if pixel_std == 0:
            pixel_std = 1.0
        img_array_std = (img_array - np.mean(img_array)) / pixel_std

        ### SCALAR FEATURES ###
        hog_stats = extract_hog_features(img_array_std, 4, 20)
        hog_mean, hog_sum, hog_var, hog_skew, hog_kurt = hog_stats[:5]
        wavelet_stats = extract_wavelet_features(img_array_std)
        wave_mean_cA, wave_var_cA, wave_mean_cD, wave_var_cD = wavelet_stats[:4]

        scalar_features = [
            extract_log_features(img_array_std),
            extract_normals_features(img_array_std),
            extract_gabor_features(img_array_std),
            hog_mean, hog_sum, hog_var, hog_skew, hog_kurt,
            wave_mean_cA, wave_var_cA, wave_mean_cD, wave_var_cD,
            *bovw_feature_vector,
        ]

        image_vectors.append(np.array(scalar_features))
        labels.append(fab_type)
        ids.append(unique_id)

    X = np.array(image_vectors)
    Y = np.array(labels)
    unique_ids = np.array(ids)
    return X, Y, unique_ids, kmeans_fitted

# Create the training dataframe and pickle it for easy future access

In [14]:
# Driver for PARSE ALL (training set)
# Builds the visual vocabulary from the training images and keeps it in
# kmeans_fitted_train so it can be reused when featurizing other data.
folder_path = './Subsamples/train'
X, Y, unique_ids, kmeans_fitted_train = parse_all(folder_path)

df = pd.DataFrame(X)
df['category'] = pd.Categorical(Y)

# Fixed category -> integer code table. pd.factorize numbers categories in
# order of first appearance, which follows the arbitrary os.listdir order and
# can therefore differ between runs and machines; an explicit table keeps the
# codes stable.
category_to_label = {'Blended': 0, 'Denim': 1, 'Polyester': 2, 'Cotton': 3, 'Wool': 4}

# Fail loudly instead of producing NaN labels for a category with no code.
unmapped = sorted(set(Y.tolist()) - set(category_to_label))
if unmapped:
    raise ValueError(f"No label code defined for categories: {unmapped}")

df['label'] = df['category'].map(category_to_label).astype('int64')
df['uid'] = unique_ids
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,category,label,uid
0,3441.793288,20285310.0,1844.05677,0.150226,346.120911,0.00521,9.3e-05,6.3e-05,2.273737e-16,3.959789,...,0.039062,0.03125,0.027344,0.046875,0.0625,0.050781,0.050781,Blended,0,8821c
1,7211.992783,7747671.0,2271.840993,0.151475,348.99826,0.004833,-6.7e-05,3.8e-05,2.046363e-16,3.86456,...,0.024324,0.013514,0.021622,0.024324,0.043243,0.016216,0.07027,Denim,1,1503c
2,8856.756862,5854463.0,1967.259618,0.160454,369.684998,0.002032,-2e-06,8e-06,9.094947000000001e-17,3.812086,...,0.25,0.12963,0.194444,0.018519,0.00463,0.027778,0.00463,Polyester,2,16132c
3,7018.112788,7817569.0,1953.124972,0.1523,350.8992,0.004582,1.1e-05,3.4e-05,-6.821210000000001e-17,3.879019,...,0.007463,0.022388,0.044776,0.085821,0.074627,0.033582,0.029851,Blended,0,3621d
4,7932.263905,6971318.0,2053.412469,0.157971,363.965454,0.002823,-3e-06,1.7e-05,-4.547474e-16,3.838197,...,0.047619,0.05102,0.081633,0.013605,0.044218,0.047619,0.054422,Cotton,3,2333a


### 🥒 PICKLE🚰📈

In [None]:
## Pickle the dataframe
# Work on a copy so the `df` built by the driver cell is left untouched.
train_df = df.copy()
# Feature columns are headed by integers; make every header a string.
train_df.columns = train_df.columns.astype(str)

LABEL_COLS = ['label', 'category']  # list of labels header
# Feature headers: every column except the labels and the image uid, selected
# by name so the result does not depend on column position. (The name
# PIXEL_COLS is historical; these are the extracted features.)
PIXEL_COLS = [col for col in train_df.columns if col not in LABEL_COLS + ['uid']]

# Labels first, then features; the uid column is not written to the pickle.
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]

curr_date = '04dd' # Replace dd with date
# Write into ./pkls, the directory the notebook reads its pickles from.
filename = f'./pkls/train_{curr_date}.pkl'
os.makedirs(os.path.dirname(filename), exist_ok=True)
train_df.to_pickle(filename)

In [None]:
# Open the pickled df
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (e.g. a teammate), never from an unknown origin.
filename = './pkls/train_0406.pkl'
train_df = pd.read_pickle(filename)
# Last expression of the cell: displays the (rows, columns) shape as a sanity check.
train_df.shape

# Create the test dataframe and pickle it for easy future access

In [15]:
# Driver for PARSE ALL (test set)
# Reuse the visual vocabulary built from the training images so the test
# bag-of-visual-words features are computed against the same vocabulary.
folder_path = './Subsamples/test'
X, Y, unique_ids, _ = parse_all(folder_path, pretrained_kmeans=kmeans_fitted_train)

df = pd.DataFrame(X)
df['category'] = pd.Categorical(Y)

# mapping dictionary
category_to_label = {'Blended': 0, 'Denim': 1, 'Polyester': 2, 'Cotton': 3, 'Wool': 4}

# A category missing from the dictionary would silently map to NaN; stop
# instead so a mislabeled or unexpected file is noticed immediately.
unmapped = sorted(set(Y.tolist()) - set(category_to_label))
if unmapped:
    raise ValueError(f"No label code defined for categories: {unmapped}")

# map the 'category' to 'label' using the dictionary defined by train_0406_scalar_non-aug.pkl
# Mapping a categorical column yields a categorical result; cast to plain
# integers so 'label' holds integer codes.
df['label'] = df['category'].map(category_to_label).astype('int64')

df['uid'] = unique_ids

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,category,label,uid
0,8401.258729,9171677.0,2193.809984,0.159283,366.988983,0.002407,-3e-05,1.1e-05,1.080025e-16,3.784362,...,0.078704,0.083333,0.069444,0.074074,0.083333,0.023148,0.013889,Polyester,2,6293d
1,11702.34047,4712472.0,2154.897678,0.16169,372.533752,0.001634,-1.4e-05,5e-06,1.284661e-15,3.697774,...,0.269565,0.07971,0.175362,0.024638,0.010145,0.044928,0.0,Cotton,3,1804b
2,7195.413672,12099860.0,1934.615054,0.151225,348.423218,0.004909,2.1e-05,4.3e-05,-2.046363e-16,3.843481,...,0.012821,0.00641,0.012821,0.064103,0.089744,0.070513,0.070513,Blended,0,8933d
3,4753.699494,10038200.0,1746.407531,0.152653,351.713531,0.004475,0.000101,3.8e-05,-1.136868e-16,3.930521,...,0.019108,0.019108,0.0,0.063694,0.050955,0.101911,0.10828,Cotton,3,12133c
4,9354.039506,6956389.0,2133.356656,0.161588,372.299194,0.001667,-2.7e-05,6e-06,3.524292e-16,3.825473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cotton,3,17161a


### 🥒 PICKLE🚰📈

In [16]:
## Pickle the dataframe
# Copy the driver's dataframe and turn every column header into a string.
test_df = df.copy()
test_df.columns = [str(header) for header in test_df.columns]

# Everything except the trailing three columns (category, label, uid) is a feature.
LABEL_COLS = ['label', 'category']  # label headers, written first
PIXEL_COLS = list(test_df.columns[:-3])  # feature headers
cols_reorder = LABEL_COLS + PIXEL_COLS
# Labels first, then features; uid is left out of the pickle.
test_df = test_df.loc[:, cols_reorder]

curr_date = '0415'  # date tag (mmdd) used in the pickle's file name
filename = f'./pkls/test_{curr_date}.pkl'
test_df.to_pickle(filename)

In [17]:
# Reload the pickled test dataframe as a sanity check on what was written.
# It is bound to test_df (not train_df) so the training dataframe is not
# overwritten with test data.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
filename = './pkls/test_0415.pkl'
test_df = pd.read_pickle(filename)
# Last expression of the cell: displays the (rows, columns) shape.
test_df.shape

(4729, 34)