# Calculate Features

This notebook uses a CNN to extract features from the images in the processed image directory.
The results are saved to a CSV file.

In [15]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from datetime import datetime as dt


In [8]:
import utils

In [31]:
def prep_metadata_file(csv_fname_in, csv_fname_out = 'graphik_portal_{}.csv'):
    """Clean the scraped picture metadata and keep only rows that have an image.

    Reads ``csv_fname_in`` from ``../data/raw/scraped``, drops incomplete rows
    and stray header rows, indexes the table by integer ``object_id``, attaches
    the image file found for each object in ``../data/processed/images`` and
    writes the result to ``../data/interim``.

    Parameters
    ----------
    csv_fname_in : str
        File name of the raw, header-less metadata csv.
    csv_fname_out : str, optional
        File name of the csv to write. A ``{}`` placeholder is replaced by a
        ``YYYYmmddHHMM`` time stamp; a name without a placeholder is used as is.

    Returns
    -------
    pandas.DataFrame
        Metadata indexed by ``object_id`` (sorted) with an added ``img_path``
        column, restricted to objects for which an image file was found.
    """
    src_fldr = os.path.join('..', 'data', 'raw', 'scraped')
    output_fldr = os.path.join('..', 'data', 'interim')
    img_fldr_path = os.path.join('..', 'data', 'processed', 'images')

    # load picture metadata; the file has no header row, so name the columns here
    col_names = ['title','img_url','detail_url','detail_description','object_id', 'request_num']
    df = pd.read_csv(os.path.join(src_fldr, csv_fname_in), index_col=0, header=None, na_values=['NaN'])
    df.columns = col_names

    # drop NA's
    orig_len = df.shape[0]
    df = df.dropna()
    num_dropped = orig_len - df.shape[0]
    print('dropped {:,} records due to NaN'.format(num_dropped))

    # remove rows where a header was incorrectly inserted into the data;
    # copy so the column assignment below does not write into a view of the slice
    df = df.loc[df['object_id'] != 'object_id'].copy()

    # change object id to int and set as index
    df['object_id'] = df['object_id'].astype(np.int32)
    df = df.set_index('object_id').sort_index()

    # list the images in the processed folder
    # (presumably bare file names, since keep_fldr_path=False -- confirm in utils)
    img_fpath_lst = utils.get_list_of_files_in_dir(img_fldr_path, file_types = ['jpg', 'jpeg','png'], keep_fldr_path=False)

    # The object id is the part of the file name before the first '.'.
    # Ids and paths are collected together so that a file whose name is not a
    # number is skipped as a pair; skipping only the id would leave the two
    # sequences with different lengths and break the index assignment.
    img_object_id = []
    img_paths = []
    for f in dict.fromkeys(img_fpath_lst):  # removes duplicate files, keeps order
        f_id = os.path.basename(f).split('.')[0]
        try:
            f_id = int(f_id)
        except ValueError:
            # report the unusable name and leave the file out
            print(f_id)
            continue
        img_object_id.append(f_id)
        img_paths.append(f)

    img_fpath_ser = pd.Series(
        img_paths,
        index=np.asarray(img_object_id, dtype=np.int64, order='C'),
        name='img_path',
        dtype=object,
    )

    # join series with image paths to df and drop rows without an image
    df = df.join(img_fpath_ser, how='left', sort=False)
    df = df.dropna(subset=['img_path',])

    # write interim result to file, honouring the caller's file name template
    time_stamp = dt.now().strftime('%Y%m%d%H%M')
    csv_fname_out = csv_fname_out.format(time_stamp)

    os.makedirs(output_fldr, exist_ok=True)
    csv_fpath = os.path.join(output_fldr, csv_fname_out)

    df.to_csv(csv_fpath)
    print("wrote out csv {} with {:,} records".format(csv_fpath, df.shape[0]))

    return df

# prep metadata file

In [32]:
# Raw scraper output; prep_metadata_file reads it from ../data/raw/scraped/.
csv_fname_in = 'graphik_portal_results.csv'

# Clean the metadata, keep only the rows with a processed image on disk,
# and write a time-stamped copy to ../data/interim/.
df = prep_metadata_file(csv_fname_in)

dropped 5 records due to NaN
found 9994 existing images
wrote out csv ../data/interim/graphik_portal_202010081333.csv with 9,994 records


# create tensorflow dataset