## Extract Features from CNN to OPM Data

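Passes the DTL image arrays for each OPM survey point through a trained CNN and extracts the activations of its `fc1` layer as features.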
* __Inputs:__ (1) DTL data around OPM survey points (pre-processed into numpy arrays by the previous script) and (2) a trained CNN model
* __Output:__ Dataframe with OPM survey ID and CNN features

## Setup

In [1]:
import os, math, pickle, datetime, json
import numpy as np
import pandas as pd
#import geopandas as gpd
import json
#from rasterio.plot import show
import pickle

#from geopandas import GeoDataFrame
#from shapely.geometry import Point

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, classification_report)

from keras.models import load_model
from keras.models import Sequential, Model

from tensorflow.keras.applications.vgg16 import preprocess_input

import warnings
import random
import tensorflow as tf
warnings.filterwarnings('ignore')

## User Defined
import config as cf
import feature_extraction as fe

import boto3
from sagemaker import get_execution_role
from s3fs.core import S3FileSystem 
# Shared S3 filesystem handle; the functions below use it to open objects
# in the bucket as file-like objects.
s3 = S3FileSystem()
# Execution role reported by SageMaker for this session (not referenced
# again in the code visible here).
role = get_execution_role()


# S3 bucket passed to extract_features_to_pd for both reading inputs and
# uploading the resulting feature table.
bucket = 'worldbank-pakistan-data'
# Local directory where the feature CSV is written before it is uploaded.
LOCAL_DIR = '/home/ec2-user/SageMaker/'

Using TensorFlow backend.


## Functions

In [2]:
def save_to_file(obj, path):
    '''
    Pickle obj to the file at path, overwriting any existing file.

    The highest pickle protocol supported by the running interpreter is
    used. Nothing is returned.
    '''
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)


def perform_pca(df, n):
    '''
    Reduce all columns of df to n principal components.

    A PCA with n components is fit on df and then applied to the same data.
    The result is returned as a new DataFrame whose columns are named
    pc_0 ... pc_<n-1>; it carries a fresh default index rather than the
    index of df.
    '''
    reducer = PCA(n_components=n).fit(df)
    projected = reducer.transform(df)
    component_labels = ['pc_{:01d}'.format(k) for k in range(n)]
    return pd.DataFrame(data=projected, columns=component_labels)


def normalize(x_train, x_test):
    '''
    Standardize features using statistics learned from the training set.

    A StandardScaler is fit on x_train only and then applied to both
    x_train and x_test, so the test set is scaled with the training mean
    and standard deviation. The inputs themselves are left untouched.

    Args:
        x_train: 2-D array-like or DataFrame of training features.
        x_test: 2-D array-like or DataFrame with the same features as
            x_train.

    Returns:
        Tuple (x_train_scaled, x_test_scaled). A DataFrame input comes back
        as a DataFrame with its original index and columns; any other
        input comes back as the numpy array produced by the scaler.
    '''
    x_scaler = StandardScaler().fit(x_train)

    def _scale(data):
        # transform() returns a new array instead of scaling in place, so
        # the result has to be captured and handed back to the caller.
        scaled = x_scaler.transform(data)
        if isinstance(data, pd.DataFrame):
            return pd.DataFrame(scaled, index=data.index, columns=data.columns)
        return scaled

    return _scale(x_train), _scale(x_test)

def extract_features_to_pd(param_name, bands_type, cnn_filename, year, suffix, bucket):
    '''
    Extract CNN features for the survey images and store them as a CSV.

    Loads the image array and the matching uid table from S3, runs the
    images through the saved CNN up to its 'fc1' layer, writes the
    resulting features to a CSV under LOCAL_DIR and uploads that CSV to
    the bucket.

    Args:
        param_name: Sub-folder of 'CNN/' in the bucket that holds the
            model; also embedded in the output file name.
        bands_type: Band combination of the input images. Only "RGB" is
            supported.
        cnn_filename: File name of the saved Keras model inside
            'CNN/<param_name>/'.
        year: Year embedded in the input and output file names.
        suffix: Text inserted into the output file name just before
            '.csv'.
        bucket: Name of the S3 bucket to read from and upload to.

    Returns:
        DataFrame with one 'cnn_feat_<i>' column per extracted feature and
        a 'uid' column taken from the uid table.

    Raises:
        ValueError: If bands_type is not "RGB".
    '''
    # Fail early with a clear message; otherwise the data would never be
    # loaded and the function would die later on an unbound variable, after
    # the model had already been downloaded.
    if bands_type != "RGB":
        raise ValueError(
            'Unsupported bands_type {!r}; only "RGB" is available.'.format(bands_type))

    data_dir = os.path.join('OPM', 'FinalData', 'Individual Datasets')
    band_tag = 'bands' + bands_type + '_' + str(year)

    # 1. Load Data
    # Each S3 object is opened in a context manager so the handle is
    # released as soon as its content has been read.
    dtl_key = '{}/{}'.format(bucket, os.path.join(data_dir, 'bisp_dtl_' + band_tag + '.npy'))
    with s3.open(dtl_key) as dtl_file:
        DTL = np.load(dtl_file)

    # NOTE: read_pickle executes pickle data; only point this at objects
    # the project itself wrote to the bucket.
    uid_key = '{}/{}'.format(bucket, os.path.join(data_dir, 'bisp_dtl_uids_' + band_tag + '.pkl'))
    with s3.open(uid_key) as uid_file:
        bisp_df = pd.read_pickle(uid_file)

    # 2. Extract features
    layer_name = 'fc1'

    model_key = '{}/{}'.format(bucket, os.path.join('CNN', param_name, cnn_filename))
    with s3.open(model_key) as model_file:
        model = load_model(model_file)

    DTL_p = preprocess_input(DTL)  # Preprocess image data

    # Generate feature extractor using trained CNN
    feature_extractor = Model(inputs=model.inputs,
                              outputs=model.get_layer(name=layer_name).output)

    features = feature_extractor.predict(DTL_p)

    # 3. Create and format pandas DataFrame
    df = pd.DataFrame(features).add_prefix('cnn_feat_')
    # Assign by position: row i of the features belongs to row i of the uid
    # table. Assigning the Series directly would align on index labels and
    # silently produce NaN or misplaced uids if bisp_df's index is not
    # 0..n-1.
    df['uid'] = bisp_df.uid.values

    # 4. Export / send to s3
    out_name = 'bisp_cnn_features_all_' + param_name + "_" + str(year) + suffix + '.csv'
    local_path = os.path.join(LOCAL_DIR, out_name)
    df.to_csv(local_path)

    s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(
        os.path.join(data_dir, out_name))
    s3_object.upload_file(local_path)

    return df

In [3]:
# Extract features for the 2014 RGB imagery with the CNN trained for that
# year; the call also writes the feature CSV locally and uploads it to the
# bucket.
tmp = extract_features_to_pd(
    param_name="Nbands3_nNtlBins3_minNTLbinCount100",
    bands_type="RGB",
    cnn_filename='script_CNN_2014.h5',
    year=2014,
    suffix='',
    bucket=bucket,
)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


