In [1]:
# base data sci libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# image processing libraries
from PIL import Image

np.random.seed(23)
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150

from utils.feature_utils import create_visual_vocab, extract_bovw_features, extract_hog_features, extract_wavelet_features, extract_log_features, extract_normals_features, extract_gabor_features

Run the below parser to produce the pickle file, which parses the raw image data and generates several arrays. These arrays are then turned into a DF and sent to a pickle file down below.

### Parse Data to DF + Add In Features

In [2]:
def parse_all(folder_path):
    """ 
    Run all feature functions and create df of all feature representations. Scalar or Vector 
    
    Returns: 
        X (np.array): Feature values 
        Y (np.array): Categorical label for each image  
        unique_ids (np.array): image IDs  
    """

    image_vectors = []  # image data
    labels = []  # labels
    ids = []  # unique IDs
    features = []

    kmeans_fitted = create_visual_vocab(folder_path)

    for filename in os.listdir(folder_path):
        if filename.endswith(".png"):
            parts = filename.split('_')
            fabType = parts[0]
            id1 = parts[1]
            id2 = parts[3].split('.')[0]  # Remove .png extension
            
            unique_id = id1 + id2

            image = Image.open(os.path.join(folder_path, filename)).convert('L')
            img_array = np.array(image)

            # sift expects a particular type of image so this needs to be done before 
            # the image is normalized for the other features. The output vector is between
            # 0 and 1 and should not impact PCA
            bovw_feature_vector = extract_bovw_features(img_array, kmeans_fitted)
            
            # normalize the image vector to be between 0 and 1 
            img_array_std = (img_array - np.mean(img_array)) / np.std(img_array)
            hog_feature_vector = extract_hog_features(img_array_std, 4, 20, scalar=False)
            normal_feature_vector = extract_normals_features(img_array_std, scalar=False)
            
            scalar_features = []
            scalar_features.extend([

                *hog_feature_vector,
                *normal_feature_vector,
                *bovw_feature_vector])
        
            scalar_features_array = np.array(scalar_features)

            image_vectors.append(scalar_features_array)
            labels.append(fabType)
            ids.append(unique_id)

    X = np.array(image_vectors)
    Y = np.array(labels)
    unique_ids = np.array(ids)
    return X, Y, unique_ids

In [3]:
# Driver for PARSE ALL 
folder_path = './Subsamples/train'

X, Y, unique_ids = parse_all(folder_path)

In [9]:
df = pd.DataFrame(X)
df['category'] = pd.Categorical(Y)
df['label'], _ = pd.factorize(df['category'])
df['uid'] = unique_ids
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30337,30338,30339,30340,30341,30342,30343,category,label,uid
0,0.215534,0.126553,0.142203,0.178407,0.215534,0.125333,0.155869,0.215534,0.12371,0.093083,...,0.06962,0.044304,0.075949,0.012658,0.050633,0.006329,0.0,Polyester,0,6293d
1,0.153295,0.165601,0.188585,0.166423,0.170281,0.196318,0.172008,0.131015,0.170395,0.18394,...,0.258182,0.243636,0.001818,0.02,0.050909,0.021818,0.0,Cotton,1,1804b
2,0.123603,0.145845,0.231658,0.163993,0.231658,0.140786,0.116232,0.216713,0.125419,0.086949,...,0.012658,0.0,0.151899,0.050633,0.0,0.0,0.101266,Blended,2,8933d
3,0.241904,0.141568,0.105752,0.103117,0.241904,0.122371,0.14557,0.106634,0.241904,0.119919,...,0.0,0.0,0.132867,0.034965,0.0,0.006993,0.216783,Cotton,1,12133c
4,0.176546,0.098107,0.195045,0.139884,0.124234,0.107477,0.15872,0.125503,0.209508,0.115671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cotton,1,17161a


In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA without specifying the number of components to retain all components
pca_full = PCA()
pca_full.fit(X_scaled)

# Calculate the cumulative explained variance ratio
cumulative_explained_variance_ratio = np.cumsum(pca_full.explained_variance_ratio_)

# Determine the number of components needed to explain at least 90% of the variance
n_components_90 = np.where(cumulative_explained_variance_ratio >= 0.9)[0][0] + 1

# Now apply PCA with the determined number of components
pca_reduced = PCA(n_components=n_components_90)
X_reduced = pca_reduced.fit_transform(X_scaled)

X_reduced.shape, n_components_90, cumulative_explained_variance_ratio


((14399, 5658),
 5658,
 array([0.0349456 , 0.04022113, 0.04489398, ..., 1.        , 1.        ,
        1.        ]))

In [5]:
df = pd.DataFrame(X_reduced)
df['category'] = pd.Categorical(Y)
df['label'], _ = pd.factorize(df['category'])
df['uid'] = unique_ids
df.head()

Unnamed: 0,label,category,0,1,2,3,4,5,6,7,...,30334,30335,30336,30337,30338,30339,30340,30341,30342,30343
0,0,Polyester,0.215534,0.126553,0.142203,0.178407,0.215534,0.125333,0.155869,0.215534,...,0.006329,0.101266,0.082278,0.06962,0.044304,0.075949,0.012658,0.050633,0.006329,0.0
1,2,Cotton,0.153295,0.165601,0.188585,0.166423,0.170281,0.196318,0.172008,0.131015,...,0.003636,0.007273,0.007273,0.258182,0.243636,0.001818,0.02,0.050909,0.021818,0.0
2,1,Blended,0.123603,0.145845,0.231658,0.163993,0.231658,0.140786,0.116232,0.216713,...,0.139241,0.063291,0.177215,0.012658,0.0,0.151899,0.050633,0.0,0.0,0.101266
3,2,Cotton,0.241904,0.141568,0.105752,0.103117,0.241904,0.122371,0.14557,0.106634,...,0.118881,0.0,0.041958,0.0,0.0,0.132867,0.034965,0.0,0.006993,0.216783
4,2,Cotton,0.176546,0.098107,0.195045,0.139884,0.124234,0.107477,0.15872,0.125503,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,Blended,0.12879,0.172703,0.160168,0.130952,0.157729,0.168377,0.174795,0.110678,...,0.01087,0.163043,0.021739,0.130435,0.076087,0.054348,0.043478,0.108696,0.021739,0.0
6,1,Blended,0.065967,0.063435,0.237091,0.204297,0.06326,0.082666,0.237091,0.195856,...,0.059072,0.0,0.059072,0.067511,0.063291,0.042194,0.029536,0.088608,0.054852,0.016878
7,1,Blended,0.111826,0.208331,0.208331,0.137737,0.10716,0.188954,0.208331,0.192684,...,0.017937,0.040359,0.026906,0.170404,0.035874,0.035874,0.040359,0.035874,0.067265,0.004484
8,0,Polyester,0.157643,0.215369,0.215369,0.108415,0.117228,0.21067,0.160834,0.096522,...,0.015748,0.023622,0.0,0.011811,0.027559,0.003937,0.007874,0.137795,0.007874,0.0
9,2,Cotton,0.243865,0.219233,0.066652,0.081323,0.243865,0.203103,0.047681,0.100346,...,0.027907,0.004651,0.060465,0.009302,0.004651,0.134884,0.018605,0.060465,0.074419,0.013953


### 🥒 PICKLE🚰📈

In [30]:
## Pickle the dataframe
train_df = df.copy()
train_df.columns = train_df.columns.astype(str)
PIXEL_COLS = train_df.columns.tolist()[:-3] # list of pixel header
LABEL_COLS = ['label', 'category'] # list of labels header
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]
train_df.head()

curr_date = '0410_vectorized_non-aug' # Replace dd with date 
filename = f'./pkls/train_{curr_date}.pkl'
train_df.to_pickle(filename)