In [1]:
# base data sci libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# image processing libraries
from PIL import Image

np.random.seed(23)
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150

from utils.feature_utils import create_visual_vocab, extract_bovw_features, extract_hog_features, extract_wavelet_features, extract_log_features, extract_normals_features, extract_gabor_features


### To-Dos
* Finalize feature selections & tune features
* Once we have features, tweak and play with the grid-search CV to get some baseline results for our model
* tSNE
* Update Pickle once tune features ? 


### Done
* Add feature functions into cell with extract_log_features() mimicking the format and the instructions there
* Update parse_data() to include any features generated from the functions defined in the featurize block
* Add PCA and any other data exploration we want to complete for feature selection
* Decide on strategy in terms of flattening images or using scalar features -- what is worth spending computational umph on based on data exploration? - NO ?
* Run PCA on existing features 
* Pickle the feature df


Run the below parser to produce the pickle file, which parses the raw image data and generates several arrays. These arrays are then turned into a DF and sent to a pickle file down below.

### Parse Data to DF + Add In Features

In [2]:
def parse_all(folder_path):
    """ 
    Run all feature functions and create df of all feature representations. Scalar or Vector 
    
    Returns: 
        X (np.array): Feature values 
        Y (np.array): Categorical label for each image  
        unique_ids (np.array): image IDs  
    """

    image_vectors = []  # image data
    labels = []  # labels
    ids = []  # unique IDs
    features = []

    kmeans_fitted = create_visual_vocab(folder_path)

    for filename in os.listdir(folder_path):
        if filename.endswith(".png"):
            parts = filename.split('_')
            fabType = parts[0]
            id1 = parts[1]
            id2 = parts[3].split('.')[0]  # Remove .png extension
            
            unique_id = id1 + id2

            image = Image.open(os.path.join(folder_path, filename)).convert('L')
            img_array = np.array(image)

            # sift expects a particular type of image so this needs to be done before 
            # the image is normalized for the other features. The output vector is between
            # 0 and 1 and should not impact PCA
            bovw_feature_vector = extract_bovw_features(img_array, kmeans_fitted)
            
            # normalize the image vector to be between 0 and 1 
            img_array_std = (img_array - np.mean(img_array)) / np.std(img_array)
            hog_feature_vector = extract_hog_features(img_array_std, 4, 20, scalar=False)
            normal_feature_vector = extract_normals_features(img_array_std, scalar=False)
            
            scalar_features = []
            scalar_features.extend([

                *hog_feature_vector,
                *normal_feature_vector,
                *bovw_feature_vector])
        
            scalar_features_array = np.array(scalar_features)

            image_vectors.append(scalar_features_array)
            labels.append(fabType)
            ids.append(unique_id)

    X = np.array(image_vectors)
    Y = np.array(labels)
    unique_ids = np.array(ids)
    return X, Y, unique_ids

In [3]:
# Driver for PARSE ALL 
folder_path = './Subsamples/train'

X, Y, unique_ids = parse_all(folder_path)

In [4]:
df = pd.DataFrame(X)
df['category'] = pd.Categorical(Y)
df['label'], _ = pd.factorize(df['category'])
df['uid'] = unique_ids
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32317,32318,32319,32320,32321,32322,32323,category,label,uid
0,0.081365,0.129032,0.068879,0.02936,0.081483,0.182388,0.23357,0.056316,0.097077,0.121916,...,0.039062,0.03125,0.027344,0.046875,0.0625,0.050781,0.050781,Blended,0,8821c
1,0.232769,0.141162,0.130744,0.156622,0.161056,0.011751,0.021573,0.232769,0.232769,0.108755,...,0.024324,0.013514,0.021622,0.024324,0.043243,0.016216,0.07027,Denim,1,1503c
2,0.10064,0.221566,0.188653,0.113977,0.184139,0.221566,0.176351,0.147671,0.14715,0.221566,...,0.25,0.12963,0.194444,0.018519,0.00463,0.027778,0.00463,Polyester,2,16132c
3,0.178292,0.122044,0.151578,0.135832,0.111774,0.062205,0.173567,0.221095,0.116077,0.084465,...,0.007463,0.022388,0.044776,0.085821,0.074627,0.033582,0.029851,Blended,0,3621d
4,0.170737,0.112553,0.192354,0.185196,0.230044,0.112194,0.141223,0.150197,0.146034,0.09,...,0.047619,0.05102,0.081633,0.013605,0.044218,0.047619,0.054422,Cotton,3,2333a


In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA without specifying the number of components to retain all components
pca_full = PCA()
pca_full.fit(X_scaled)

# Calculate the cumulative explained variance ratio
cumulative_explained_variance_ratio = np.cumsum(pca_full.explained_variance_ratio_)

# Determine the number of components needed to explain at least 90% of the variance
n_components_90 = np.where(cumulative_explained_variance_ratio >= 0.9)[0][0] + 1

# Now apply PCA with the determined number of components
pca_reduced = PCA(n_components=n_components_90)
X_reduced = pca_reduced.fit_transform(X_scaled)

X_reduced.shape, n_components_90, cumulative_explained_variance_ratio


((22128, 6832),
 6832,
 array([0.03421738, 0.05567688, 0.07128493, ..., 1.        , 1.        ,
        1.        ]))

In [6]:
df = pd.DataFrame(X_reduced)
df['category'] = pd.Categorical(Y)
df['label'], _ = pd.factorize(df['category'])
df['uid'] = unique_ids
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6825,6826,6827,6828,6829,6830,6831,category,label,uid
0,216.875773,-21.220328,31.355705,113.803698,67.469517,51.739112,32.134906,-34.037195,-33.578267,8.550824,...,-0.616874,-0.858821,-0.282316,1.121574,0.521872,0.946164,-0.098814,Blended,0,8821c
1,-6.478218,-44.845392,9.334029,0.559372,-5.535237,2.069803,1.66805,-2.671548,4.473247,-2.290007,...,1.078271,0.301808,-1.118499,-0.406229,0.275195,-0.698774,0.043567,Denim,1,1503c
2,-21.765612,15.511845,0.440996,7.523796,-23.242091,-0.555019,-0.387093,2.955491,-0.883584,-5.710299,...,0.273425,-0.274362,0.532576,0.547632,-0.846333,0.320085,-1.747008,Polyester,2,16132c
3,1.031767,-13.747633,-35.65789,10.977205,4.696071,1.991713,0.73496,4.531688,-3.821167,1.52015,...,1.0455,0.740585,-0.987832,-1.463705,1.176062,0.128055,0.676635,Blended,0,3621d
4,-9.98767,-17.112746,-15.962529,-15.739822,-1.963229,-4.441544,0.110439,-1.643463,9.650931,-2.391829,...,-0.69228,-0.230699,1.451385,-1.054964,-0.210731,-0.705949,-0.167371,Cotton,3,2333a


### ðŸ¥’ PICKLEðŸš°ðŸ“ˆ

In [7]:
## Pickle the dataframe
train_df = df.copy()
train_df.columns = train_df.columns.astype(str)
PIXEL_COLS = train_df.columns.tolist()[:-3] # list of pixel header
LABEL_COLS = ['label', 'category'] # list of labels header
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]
train_df.head()

curr_date = '0410_vectorized_non-aug' # Replace dd with date 
filename = f'./pkls/train_{curr_date}.pkl'
train_df.to_pickle(filename)