Ultraleap Image Texture Prediction Model © Ultraleap Limited 2020

Licensed under the Ultraleap closed source licence agreement; you may not use this file except in compliance with the License.

A copy of this License is included with this download as a separate document. 

Alternatively, you may obtain a copy of the license from: https://www.ultraleap.com/closed-source-licence/

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

# TexNet Data Pre-Processing
This Jupyter notebook takes two separate data inputs (perceptual data and image information data) and creates a processed, concatenated data frame that is ready to be passed to the TexNet model as a .csv file. The following Python libraries are required in order to run this notebook.

* Pandas (0.25.1)

In [None]:
import sys
!{sys.executable} -m pip install pandas==0.25.1

## Initialisation
Firstly, we initialise a few Python library dependencies.

In [None]:
import pandas as pd
import numpy as np
import os, time

## Data Prepping Functions
Next, we define some small utility functions to aid data processing.

In [None]:
def value_range_scaler(current_value, old_min, old_max, new_min, new_max):
    '''Linearly map a value from the range [old_min, old_max] onto [new_min, new_max].

    Works on scalars and on anything supporting element-wise arithmetic
    (e.g. NumPy arrays or Pandas series).
    '''
    # Position of the value inside the old range, as a fraction.
    offset = current_value - old_min
    fraction = offset / (old_max - old_min)
    # Stretch the fraction over the new range and shift it to the new origin.
    return fraction * (new_max - new_min) + new_min

In [None]:
def check_outliers_iqr(data_frame_col):
    '''Flag the entries of a data frame column that are outliers by the IQR rule.

    An entry is flagged when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile. The result is a
    boolean mask aligned with the input column.
    '''
    lower_quartile = data_frame_col.quantile(0.25)
    upper_quartile = data_frame_col.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    too_low = data_frame_col < (lower_quartile - whisker)
    too_high = data_frame_col > (upper_quartile + whisker)
    return too_low | too_high

In [None]:
def datetime(data_frame):
    '''Add a "time_seconds" column holding the length of each study in seconds.

    The "End Date" and "Start Date" columns are converted to datetimes in
    place, and the absolute difference between them is stored as a number of
    seconds. The same (modified) data frame is returned.
    '''
    for date_column in ("End Date", "Start Date"):
        data_frame[date_column] = pd.to_datetime(data_frame[date_column])

    # Absolute difference, so the order of the two dates does not matter.
    elapsed = np.abs(data_frame["End Date"] - data_frame["Start Date"])
    data_frame["time_seconds"] = elapsed / np.timedelta64(1, 's')
    return data_frame

In [None]:
def combine_and_filter_data(perceptual_data, image_data, dimensions, outlier_filter):
    """Concatenate the image data and perceptual data obtained from our Amazon
    Mechanical Turk visual roughness perception study into one large data frame
    in the correct format.

    For every requested dimension the ratings are rescaled per participant to
    the range 0-100, and the per-image mean, standard deviation and median are
    appended as additional columns.

    Args:
        perceptual_data - perceptual data as a Pandas dataframe. Note that the
                          date columns are converted and a 'time_seconds' column
                          is added to this frame in place.
        image_data - image data as a Pandas dataframe ('Name' and 'Group' columns are used).
        dimensions - list of dimensions that should be appended to the output .csv file.
        outlier_filter - boolean to choose whether ratings flagged as outliers by the
                         IQR rule are excluded from the mean / std / median columns.
                         The per-participant rating columns are always kept in full.

    Returns:
        data_frame - a concatenated and filtered Pandas dataframe, indexed by image
                     name, with the image group as its first column.
    """
    # Studies that took three hours (10800 seconds) or longer are discarded.
    max_study_seconds = 10800

    perceptual_data_clean = datetime(perceptual_data)
    perceptual_data_clean = perceptual_data_clean[
        perceptual_data_clean['time_seconds'] < max_study_seconds]
    image_name_list = image_data['Name']
    groups = image_data['Group']

    data_frame_collector = []
    for dimension in dimensions:
        # After transposing, rows are images and columns are participants.
        ratings = perceptual_data_clean.filter(like=dimension.title()).T
        ratings.index = image_name_list
        ratings['group'] = image_data['Group'].values
        # Repeated ('_copy') images are not part of the output.
        ratings = ratings[~ratings.index.str.contains('_copy')]
        ratings = ratings.sort_index(axis=0)
        groups = ratings['group']
        ratings = ratings.drop(columns='group')

        # Rescale every participant's ratings to 0-100 using that participant's
        # own minimum and maximum. A participant who gave the same rating to
        # every image has a zero range, which yields NaN values here.
        for col in ratings.columns:
            column_values = ratings[col].values
            ratings[col] = value_range_scaler(column_values,
                                              column_values.min(),
                                              column_values.max(),
                                              0, 100)

        # Long format: one row per (image, participant) pair, image-major order.
        long_form = ratings.T.unstack().rename(dimension).to_frame()
        long_form.index.names = ['tex_name', 'user_no']

        if outlier_filter:
            # Outliers are determined per participant (column-wise). The mask is
            # reshaped exactly like the ratings so both stay aligned row by row.
            outlier_mask = ratings.apply(check_outliers_iqr)
            is_outlier = outlier_mask.T.unstack().values.astype(bool)
            long_form = long_form[~is_outlier]

        # Per-image statistics over the (optionally filtered) ratings. Reindexing
        # keeps one row per image even if all of its ratings were filtered out,
        # in which case the statistics are NaN.
        stats = long_form[dimension].groupby(level='tex_name').agg(['mean', 'median', 'std'])
        stats = stats.reindex(ratings.index)

        dimension_frame = ratings.copy()
        dimension_frame['mean'] = stats['mean'].values
        dimension_frame['std'] = stats['std'].values
        dimension_frame['median'] = stats['median'].values
        dimension_frame.columns = ['{}_{}'.format(c, dimension).lower()
                                   for c in dimension_frame.columns]
        dimension_frame.index.name = 'tex_name'

        data_frame_collector.append(dimension_frame)

    data_frame_combined = pd.concat(data_frame_collector, sort=False, axis=1)
    data_frame = pd.concat([groups, data_frame_combined], axis=1)

    return data_frame

In [None]:
def _read_csv_or_frame(source):
    """Return `source` unchanged if it is a data frame, otherwise read it as a .csv path."""
    if isinstance(source, pd.DataFrame):
        return source
    if not os.path.exists(source):
        raise FileNotFoundError("The file %s does not exist!" % source)
    try:
        # Keyword used by pandas >= 1.3 to skip malformed lines.
        return pd.read_csv(source, on_bad_lines='skip')
    except TypeError:
        # Older pandas releases (such as the 0.25.1 pinned by this notebook)
        # only know the previous spelling of the same option.
        return pd.read_csv(source, error_bad_lines=False)


def export_prepped_model_data(perceptual_data, image_data, dimensions,
                              outlier_filter=None, store_data=None):
    """
    This function will output a filtered data frame and store it in the folder, 'input_data',
    ready to be used for training the TexNet model.

    Args:
        perceptual_data - the path to the necessary .csv file containing data from our
                          Amazon Mechanical Turk visual perception user study, or an
                          already loaded Pandas data frame.
        image_data - the path to the .csv file that contains image data information (names etc),
                     or an already loaded Pandas data frame.
        dimensions - 'all' for every texture dimension, a single dimension name, or a
                     list of dimension names that should be aggregated and appended to
                     the output .csv file.
        outlier_filter - boolean to determine if data outliers should be filtered via inter-quartile range.
        store_data - boolean to state whether data should be stored as an output .csv file.

    Returns:
        filtered_data - returns a prepared Pandas data frame that contains the filtered perceptual data
                        for each selected texture dimension passed to the function.

    Raises:
        FileNotFoundError - if a path is given for either input and the file does not exist.
    """

    # Load both inputs; data frames are accepted as they are.
    perceptual_data = _read_csv_or_frame(perceptual_data)
    image_data = _read_csv_or_frame(image_data)

    # Normalise the dimensions argument to a flat list of names. The value 'all'
    # selects every texture dimension.
    if isinstance(dimensions, str):
        if dimensions == 'all':
            dimensions = ['roughness', 'stickiness', 'bumpiness', 'hardness', 'warmness']
        else:
            dimensions = [dimensions]
    else:
        dimensions = list(dimensions)

    # A data frame is returned that has been filtered and concatenated.
    filtered_data = combine_and_filter_data(perceptual_data, image_data, dimensions, outlier_filter)

    # Time information stored for file saving.
    timestr = time.strftime("%Y%m%d-%H%M%S")

    # Dimension names joined into one string for the file name.
    dim_list = "_".join(dimensions)

    # Store the data if requested; the file name records whether the outlier
    # filter was applied as an additional filtering step.
    if store_data:
        if outlier_filter:
            name_template = 'input_data/{}_{}_data_outlier_filtered.csv'
            message = "File {} has been compiled and stored in  '..input_data/'."
        else:
            name_template = 'input_data/{}_{}_data.csv'
            message = "File {} has been compiled and stored in '..input_data/'."

        name = name_template.format(timestr, dim_list)
        # Strip characters that should not end up in the file name.
        for unwanted in ('[', ']', "'", ',', ' '):
            name = name.replace(unwanted, '')

        # Make sure the target folder exists before writing into it.
        os.makedirs('input_data', exist_ok=True)
        filtered_data.to_csv(name, sep=',')  # Store the file.
        print(message.format(name))
    return filtered_data

## Preparing the data and exporting to a .CSV file.
Now that we have defined the helper functions that will prepare and concatenate our data sets, we need to provide references to the data itself, then run it all through each function, and finally output the .csv file to a location of our choosing.

First, we define the locations of each of our separate data sets:

In [None]:
# Path to the .csv file holding the perceptual user study data.
perceptual_data = 'input_data/perceptual_data.csv'
# Path to the .csv file holding the image information (names etc).
image_data = 'input_data/image_data.csv'
# 'all' selects every texture dimension known to export_prepped_model_data.
dimensions = 'all'

## Running the functions and creating the .csv file.
The final step is to run the `export_prepped_model_data` function with the correctly supplied input data sets; the output data frame will be created and stored in the 'input_data' folder.

In [None]:
# Build the combined data frame, requesting outlier filtering, and store it as a .csv file.
test = export_prepped_model_data(perceptual_data, image_data, dimensions, outlier_filter = True, store_data=True)