## Extract Features from CNN to OPM Data

Uses a trained CNN to extract features for the OPM survey data.
* __Inputs:__ (1) DTL data around OPM survey points (previous script pre-processes as numpy arrays) and (2) CNN model 
* __Output:__ Dataframe with OPM survey ID and CNN features

Notes:
You might encounter an error that says "SystemError: <class 'int'> returned a result with an error set" when running this notebook.

## Setup

In [2]:
import os, math, pickle, datetime, json
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#from rasterio.plot import show
import pickle

import re

#from geopandas import GeoDataFrame
#from shapely.geometry import Point

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report)

from keras.models import load_model
from keras.models import Sequential, Model

from tensorflow.keras.applications.vgg16 import preprocess_input

import warnings
import random
import tensorflow as tf
warnings.filterwarnings('ignore')

## User Defined
import config as cf
import feature_extraction as fe

## Functions

In [3]:
def normalize(x_train, x_test):
    '''
    Standardize train and test data with a scaler fitted on the training data.

    The scaler is fitted on ``x_train`` only and then applied to both inputs,
    so the test data is scaled with the training statistics.

    Parameters
    ----------
    x_train : array-like accepted by ``StandardScaler.fit``
    x_test : array-like with the same features as ``x_train``

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)`` as returned by
        ``StandardScaler.transform``.
    '''
    x_scaler = StandardScaler().fit(x_train)
    # transform() hands back a scaled copy instead of modifying its argument,
    # so the results have to be returned; discarding them made this a no-op.
    return x_scaler.transform(x_train), x_scaler.transform(x_test)

def extract_features_to_pd(param_name, bands_type, cnn_filename, year, suffix, bucket):
    '''
    Extract 'fc1' CNN features for the DTL arrays stored on S3 and export them.

    Loads the DTL numpy array and the matching uid dataframe from the bucket,
    runs them through the saved CNN up to layer 'fc1', writes the features
    (one ``cnn_feat_*`` column per feature plus ``uid``) to a local csv and
    uploads that csv to the bucket.

    NOTE(review): relies on ``s3``, ``boto3``, ``LOCAL_DIR`` and (for
    ``bands_type == "Single"``) ``band_numbers`` being defined at module
    level; none of them is visible in this part of the notebook.

    Parameters
    ----------
    param_name : str
        Sub-folder of 'CNN' holding the model; also used in the output name.
    bands_type : str
        Either "RGB" or "Single".
    cnn_filename : str
        File name of the saved model (h5) inside 'CNN/<param_name>'.
    year : value convertible with ``str``
        Year used in the input and output file names.
    suffix : str
        Appended to the output file name before the extension.
    bucket : str
        S3 bucket name.

    Returns
    -------
    pandas.DataFrame
        The exported feature dataframe.

    Raises
    ------
    ValueError
        If ``bands_type`` is neither "RGB" nor "Single".
    '''
    import posixpath  # S3 keys always use '/', whatever the local OS separator is
    import h5py #workaround to work with AWS S3 bucket

    data_prefix = posixpath.join('OPM', 'FinalData', 'Individual Datasets')

    # 1. Load Data
    if bands_type == "RGB":
        file_tag = 'bands' + 'RGB'
    elif bands_type == "Single":
        file_tag = 'band' + str(band_numbers)
    else:
        # Fail before the (expensive) model load instead of hitting an
        # unbound DTL further down.
        raise ValueError('bands_type must be "RGB" or "Single", got {!r}'.format(bands_type))

    npy_key = posixpath.join(data_prefix, 'bisp_dtl_' + file_tag + "_" + str(year) + '.npy')
    uid_key = posixpath.join(data_prefix, 'bisp_dtl_uids_' + file_tag + "_" + str(year) + '.pkl')

    with s3.open('{}/{}'.format(bucket, npy_key)) as npy_handle:
        DTL = np.load(npy_handle)
    with s3.open('{}/{}'.format(bucket, uid_key)) as uid_handle:
        bisp_df = pd.read_pickle(uid_handle)

    if bands_type == "Single":
        DTL = np.repeat(DTL, 3, -1) #have to trick tensorflow into thinking this is RGB

    # 2. Extract features
    layer_name = 'fc1'
    model_key = posixpath.join('CNN', param_name, cnn_filename)
    with s3.open('{}/{}'.format(bucket, model_key)) as h5file:
        with h5py.File(h5file, 'r') as f:
            model = load_model(f)

    DTL_p = preprocess_input(DTL) # Preprocess image data

    #DTL_p = DTL_p[1:5,:,:,:] # for testing

    # Generate feature extractor using trained CNN
    feature_extractor = Model(inputs=model.inputs,
                              outputs=model.get_layer(name=layer_name).output)

    features = feature_extractor.predict(DTL_p)

    # 3. Create and format pandas DataFrame
    df = pd.DataFrame(features).add_prefix('cnn_feat_')
    # Assign by position: feature rows follow the order of the DTL array, and
    # assigning the Series itself would align on bisp_df's index instead.
    df['uid'] = bisp_df.uid.values

    # 4. Export / send to s3
    csv_name = 'bisp_cnn_features_all_' + param_name + "_" + str(year) + suffix + '.csv'
    local_csv = os.path.join(LOCAL_DIR, csv_name)
    df.to_csv(local_csv)

    boto3.Session().resource('s3').Bucket(bucket).Object(posixpath.join(data_prefix, csv_name)).upload_file(local_csv)
    print('Saved') #function works if we get to this point
    return df

In [73]:
SURVEY = 'OPM'

# Prep file lists ---------------------------------------------
CNN_FOLDER_VIIRS = os.path.join(cf.GD_CNN_DIRECTORY, 'VIIRS')
CNN_FOLDER_SURVEY = os.path.join(cf.GD_CNN_DIRECTORY, SURVEY)

cnn_files_viirs = os.listdir(CNN_FOLDER_VIIRS)
# The DTL arrays are loaded from the survey folder below, so that is the
# folder that has to be listed here (the VIIRS folder was listed twice).
cnn_files_survey = os.listdir(CNN_FOLDER_SURVEY)

reg_npy = re.compile(r'^dtl.*npy$')
reg_h5 = re.compile(r'CNN_.*h5$')
npy_files = list(filter(reg_npy.search, cnn_files_survey)) # grab dtl data extracted from survey
h5_files = list(filter(reg_h5.search, cnn_files_viirs)) # grab models trained on viirs

# Loop through files, load & extract data -------------------------------
for dtl_file_i in npy_files:

    # Grab all h5 files that use that DTL layer (eg, different dep vars)
    dtl_file_i_clean = dtl_file_i.replace('.npy', '')
    # Plain substring test: the file name is a literal, not a regex pattern
    h5_files_for_dtl_i = [h5_name for h5_name in h5_files if dtl_file_i_clean in h5_name]

    if not h5_files_for_dtl_i:
        continue

    # Load Data ---------------------------------------------
    # The survey data does not depend on the model, so load and preprocess it
    # once per DTL file. Preprocessing is hoisted together with the load
    # because preprocess_input can modify float arrays in place.
    DTL = np.load(os.path.join(CNN_FOLDER_SURVEY, dtl_file_i))
    gdf = pd.read_pickle(os.path.join(CNN_FOLDER_SURVEY, 'dep_var.pkl'))

    if DTL.shape[3] == 1:
        # Repeat the single band so the input has three channels
        DTL = np.repeat(DTL, 3, -1)

    DTL_p = preprocess_input(DTL)

    # Loop through h5 files
    for h5_i in h5_files_for_dtl_i:
        print(h5_i)

        model = load_model(os.path.join(CNN_FOLDER_VIIRS, h5_i))

        # Extract Data ---------------------------------------------
        feature_extractor = Model(inputs=model.inputs,
                                  outputs=model.get_layer(name='fc1').output)

        features = feature_extractor.predict(DTL_p)

        cnn_df = pd.DataFrame(features).add_prefix('cnn_feat_')
        # Assign by position so the uids follow the row order of the DTL
        # array rather than being aligned on gdf's index.
        cnn_df['uid'] = gdf.uid.values

        # Export CNN Features --------------------------------------
        csv_out = h5_i.replace('.h5', '_cnn_features.csv')
        cnn_df.to_csv(os.path.join(cf.DROPBOX_DIRECTORY, 'Data', SURVEY, 'FinalData', 'Individual Datasets', csv_out), index=False)