# In [93]:
# run_grid.py
#
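# Description:
# Performs the complete process of cleaning the data, prepping all necessary
# data, and running the grid search.
# NOTE(review): the code visible in this file only runs PCA on the CNN feature
# columns and exports the merged result; confirm this header is still accurate.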

# https://towardsdatascience.com/extract-features-visualize-filters-and-feature-maps-in-vgg16-and-vgg19-cnn-models-d2da6333edd0


import os
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA

## User Defined
import config as cf

# In [97]:
def perform_pca_n(df, n):
    '''
    Performs PCA with n components on all columns in df.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; every column is passed to PCA as an input feature.
    n : int
        Number of principal components to keep (forwarded to PCA as
        n_components).

    Returns
    -------
    df_features_pca : pandas.DataFrame
        The transformed data, one column per component, named 'cnn_pc_0',
        'cnn_pc_1', ... The rows carry the same index as df.
    explained_variance_ratio : array
        The fitted PCA's explained_variance_ratio_ attribute.
    '''
    pca = PCA(n_components=n)
    pca.fit(df)
    features_pca = pca.transform(df)

    # Name the columns after the width PCA actually produced so the labels can
    # never disagree with the data (for an integer n this is simply n).
    column_names = ['cnn_pc_%d' % i for i in range(features_pca.shape[1])]

    # Keep the caller's row labels: without this the result gets a fresh
    # RangeIndex and any later index-aligned assignment/concat against the
    # source frame silently misaligns when df has a non-default index.
    # getattr keeps plain array input (which has no index) working as before.
    row_index = getattr(df, 'index', None)
    df_features_pca = pd.DataFrame(data=features_pca, index=row_index, columns=column_names)
    return df_features_pca, pca.explained_variance_ratio_

def perform_pca_expln(df, prop):
    '''
    Performs PCA on all columns in df, letting PCA choose the number of
    components from prop instead of fixing it up front.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; every column is passed to PCA as an input feature.
    prop : float
        Forwarded to PCA as n_components. The caller in this file passes 0.9;
        scikit-learn documents a value between 0 and 1 as the share of
        variance the kept components must explain.

    Returns
    -------
    df_features_pca : pandas.DataFrame
        The transformed data, one column per kept component, named
        'cnn_pc_0', 'cnn_pc_1', ... The rows carry the same index as df.
    explained_variance_ratio : array
        The fitted PCA's explained_variance_ratio_ attribute.
    '''
    pca = PCA(n_components=prop)
    pca.fit(df)
    features_pca = pca.transform(df)

    # The number of components is only known after fitting, so read it off
    # the transformed array.
    n = features_pca.shape[1]
    column_names = ['cnn_pc_%d' % i for i in range(n)]

    # Keep the caller's row labels: without this the result gets a fresh
    # RangeIndex and any later index-aligned assignment/concat against the
    # source frame silently misaligns when df has a non-default index.
    # getattr keeps plain array input (which has no index) working as before.
    row_index = getattr(df, 'index', None)
    df_features_pca = pd.DataFrame(data=features_pca, index=row_index, columns=column_names)

    return df_features_pca, pca.explained_variance_ratio_

def extract_pca_features(df):
    '''
    Runs PCA separately on each band-specific group of columns in df, adds
    back the non-CNN columns and exports the combined table.

    For each suffix in "_Nbands3", "_Band1", "_Band5", "_Band6", "_Band7",
    the columns of df whose name matches the suffix are passed to
    perform_pca_expln with prop=0.9, and the suffix is appended to the
    resulting component column names. Every column of df whose name does not
    contain 'cnn_feat' is then copied into the result unchanged.

    The result is written as cnn_merge_pca.pkl and cnn_merge_pca.csv under
    's3://worldbank-pakistan-data/OPM/FinalData/Merged Datasets'.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged dataset containing the band-suffixed columns and the other
        (non 'cnn_feat') columns.

    Returns
    -------
    None
        The function is called for its export side effect only.
    '''
    suffixes = ["_Nbands3", "_Band1", "_Band5", "_Band6", "_Band7"]

    # Columns that are not raw CNN features are carried over as they are.
    column_names = pd.Series(df.columns)
    opm_cols = list(column_names[~column_names.str.contains('cnn_feat', regex=False)])

    pca_dict = {}
    for suffix in suffixes:
        # NOTE(review): the suffix is applied as an unanchored regex, so
        # "_Band1" would also select a column such as "..._Band10" and any
        # non-CNN column containing the suffix; confirm none exist.
        band_df = df.filter(regex=suffix)
        pca_df, _ = perform_pca_expln(band_df, 0.9)
        pca_dict[suffix] = pca_df.add_suffix(suffix)

    # Concatenating a dict uses its keys as an outer column level, so df_pca
    # has two-level column labels (suffix, component name).
    # NOTE(review): the names already carry the suffix, so the outer level may
    # be unintended; left as is because it defines the exported file layout.
    df_pca = pd.concat(pca_dict, axis=1)
    df_pca[opm_cols] = df[opm_cols]

    # Export.
    # The S3 URI is joined with "/" explicitly: os.path.join would insert the
    # host OS separator (a backslash on Windows) and corrupt the object key.
    out_dir = '/'.join(['s3://worldbank-pakistan-data', 'OPM', 'FinalData', 'Merged Datasets'])
    df_pca.to_pickle(out_dir + '/' + 'cnn_merge_pca.pkl')
    df_pca.to_csv(out_dir + '/' + 'cnn_merge_pca.csv')

# In [98]:
# 1. Load Data
# The S3 URI is joined with "/" explicitly: os.path.join would insert the host
# OS separator (a backslash on Windows) and corrupt the object key.
df = pd.read_csv('/'.join(['s3://worldbank-pakistan-data', 'OPM', 'FinalData', 'Merged Datasets', 'cnn_merge.csv']))

# 2. Reduce the band-specific columns with PCA and export the merged result.
extract_pca_features(df)