In [1]:
# base data sci libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# image processing libraries
from PIL import Image

# Seed NumPy's global random generator so code drawing from it repeats across runs
np.random.seed(23)
# Notebook-wide figure defaults: 4x4 figure size at 150 dpi
plt.rcParams.update({'figure.figsize': (4, 4), 'figure.dpi': 150})

from utils.feature_utils import create_visual_vocab, extract_bovw_features, extract_hog_features, extract_wavelet_features, extract_log_features, extract_normals_features, extract_gabor_features


### To-Dos
* Finalize feature selection and tune the features
* Once the features are settled, tweak the grid-search CV to get some baseline results for our model
* t-SNE
* Update the pickle once the features are tuned?


### Done
* Add feature functions to the cell containing extract_log_features(), mimicking its format and following the instructions there
* Update parse_data() to include any features generated from the functions defined in the featurize block
* Add PCA and any other data exploration we want to complete for feature selection
* Decide on a strategy for flattening images versus using scalar features -- based on data exploration, what is worth spending computational oomph on? - No?
* Run PCA on existing features 
* Pickle the feature df


Run the parser below to produce the pickle file. It parses the raw image data and generates several arrays, which are then turned into a DataFrame and written to a pickle file further down.

### Parse Data to DF + Add In Features

In [None]:
def parse_all(folder_path):
    """
    Build one flat feature row per PNG image found in ``folder_path``.

    Each ``*.png`` file is opened as a grayscale image, passed through every
    feature extractor, and reduced to a single 1-D feature array. File names
    must contain at least four underscore-separated tokens: the first token is
    used as the label, and the second and fourth tokens (extension stripped)
    are concatenated to form the image ID.

    Files are processed in sorted name order so the row order of the returned
    arrays is reproducible.

    Args:
        folder_path (str): Directory containing the PNG images.

    Returns:
        X (np.array): Feature values, one row per image, laid out as
            [log, normals, gabor,
             hog_mean, hog_sum, hog_var, hog_skew, hog_kurt,
             wave_mean_cA, wave_var_cA, wave_mean_cD, wave_var_cD,
             *bovw_feature_vector]
        Y (np.array): Categorical label for each image
        unique_ids (np.array): image IDs

    Raises:
        IndexError: If a ``.png`` file name has fewer than four
            underscore-separated tokens.
    """
    feature_rows = []  # one feature array per image
    labels = []  # label parsed from each file name
    ids = []  # unique ID parsed from each file name

    kmeans_fitted = create_visual_vocab(folder_path)

    # os.listdir() yields entries in arbitrary order; sorting keeps the rows of
    # X / Y / unique_ids in the same order on every run and every machine.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".png"):
            continue

        parts = filename.split('_')
        fabType = parts[0]
        id1 = parts[1]
        id2 = parts[3].split('.')[0]  # Remove .png extension
        unique_id = id1 + id2

        # The context manager releases the file handle once the pixels are read.
        with Image.open(os.path.join(folder_path, filename)) as image:
            img_array = np.array(image.convert('L'))

        # Per the original author's note: SIFT expects the raw (un-standardized)
        # image, so the bag-of-visual-words vector is computed before scaling.
        # NOTE(review): the note also says this vector lies in [0, 1] — confirm
        # against extract_bovw_features.
        bovw_feature_vector = extract_bovw_features(img_array, kmeans_fitted)

        # Standardize to zero mean / unit variance (z-score, not a 0-1 rescale).
        # A constant image has zero standard deviation; skip the division in
        # that case so the result is all zeros instead of NaN.
        centered = img_array - np.mean(img_array)
        spread = np.std(img_array)
        img_array_std = centered / spread if spread > 0 else centered

        ### SCALAR FEATURES ###
        hog_stats = extract_hog_features(img_array_std, 4, 20)
        hog_mean, hog_sum, hog_var, hog_skew, hog_kurt = hog_stats[:5]
        wavelet_stats = extract_wavelet_features(img_array_std)
        wave_mean_cA, wave_var_cA, wave_mean_cD, wave_var_cD = wavelet_stats[:4]

        # assumes the log / normals / gabor extractors each return one scalar
        # in their default mode — TODO confirm in utils.feature_utils
        scalar_features = [
            extract_log_features(img_array_std),
            extract_normals_features(img_array_std),
            extract_gabor_features(img_array_std),
            hog_mean, hog_sum, hog_var, hog_skew, hog_kurt,
            wave_mean_cA, wave_var_cA, wave_mean_cD, wave_var_cD,
            *bovw_feature_vector,
        ]

        feature_rows.append(np.array(scalar_features))
        labels.append(fabType)
        ids.append(unique_id)

    X = np.array(feature_rows)
    Y = np.array(labels)
    unique_ids = np.array(ids)
    return X, Y, unique_ids

In [None]:
# Driver for PARSE ALL: featurize the training images and tabulate the result
folder_path = './Subsamples/train'
X, Y, unique_ids = parse_all(folder_path)

# One row per image: feature columns first, then the fabric category,
# its integer code (numbered in order of first appearance), and the image ID.
df = pd.DataFrame(X).assign(
    category=pd.Categorical(Y),
    label=lambda frame: pd.factorize(frame['category'])[0],
    uid=unique_ids,
)
df.head()

### 🥒 PICKLE🚰📈

In [4]:
## Pickle the dataframe
train_df = df.copy()
train_df.columns = train_df.columns.astype(str)  # make every column header a string

# The last three columns of df are 'category', 'label' and 'uid'; everything
# before them is a feature column. (The PIXEL_COLS name is kept for
# compatibility even though the columns hold features, not pixels.)
PIXEL_COLS = train_df.columns.tolist()[:-3]  # list of feature headers
LABEL_COLS = ['label', 'category']  # list of label headers
# NOTE: 'uid' is in neither list, so it is dropped from the pickled frame.
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]

# Stamp the file with today's date (mmdd) instead of a hand-edited placeholder,
# which wrote a literal 'train_04dd.pkl' whenever it was not replaced.
curr_date = pd.Timestamp.today().strftime('%m%d')
filename = f'./Subsamples/train_{curr_date}.pkl'
train_df.to_pickle(filename)

In [5]:
# Open the pickled df
# NOTE(review): this reads from ./pkls/, while the pickling cell writes to
# ./Subsamples/ — confirm the file is moved there by hand, or align the paths.
# NOTE: unpickling can execute arbitrary code; only load pickle files that you
# created yourself or otherwise trust.
filename = './pkls/train_0406.pkl'
train_df = pd.read_pickle(filename)
train_df.shape

(22128, 34)