# Python 3 code for preprocessing of 'fish' data

In [2]:
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype
import numpy as np
from scipy.interpolate import interp1d
from tsfresh import select_features
from tsfresh import extract_features
from tsfresh import extract_relevant_features
import math, time
import pickle
from tsfresh.utilities.dataframe_functions import impute
import matplotlib.pyplot as plt

In [3]:
# Read in CSV file-
def csv_to_pandas(path_to_file):
    '''
    A function to read CSV file into a Pandas DataFrame-
    Expects complete path/relative path to CSV file along with file name
    '''
    # data = pd.read_csv("fish-5.csv")

    try:

        if path_to_file[-3:] == 'csv':
            data = pd.read_csv(path_to_file)
        else:
            data = pd.read_csv(path_to_file + '.csv')
            
            # Check if 'time' attribute is integer-
            if is_numeric_dtype(data['time']):
                data.sort_values('time', ascending = True, inplace = True)
            # Check if 'time' attribute is string-
            elif is_string_dtype(data['time']):
                data['time'] = pd.to_datetime(data['time'])
                data.sort_values('time', ascending = True, inplace = True)


        return data

    except FileNotFoundError:
        print("Your file below could not be found. Please check path and/or file name and try again.\nPath given: {0}\n\n".format(path_to_file))

### csv_to_pandas() function:
This function takes as argument the COMPLETE PATH to where your CSV file is kept which you wish to preprocess.
The function returns the CSV file as a Pandas DataFrame object.

Also, the function will parse 'time' attribute and do either of two things, viz.,
1.) If 'time' attribute is integer, it will sort them according to 'time' in an ascending manner
2.) If 'time' attribute is string, it will first convert it to 'datetime' format and then sort it in an ascending manner

In [4]:
# Ask for the COMPLETE PATH to CSV file as input from user-
path_to_file = input("Enter path to data file: ")

Enter path to data file: datasets/fish-5.csv


In [5]:
data = csv_to_pandas(path_to_file)

In [6]:
# Print first 10 rows of dataset-
data.head(10)

Unnamed: 0,time,animal_id,x,y
0,1,312,405.29,417.76
1,1,511,369.99,428.78
2,1,607,390.33,405.89
3,1,811,445.15,411.94
4,1,905,366.06,451.76
5,2,312,405.31,417.37
6,2,511,370.01,428.82
7,2,607,390.25,405.89
8,2,811,445.48,412.26
9,2,905,365.86,451.76


In [7]:

def data_preprocessing(data):
    '''
    A function to perform data preprocessing
    Expects 'data' as input which is the Pandas DataFrame to be processed
    '''
    
    print("\nThe dimensions/shape of the raw data file is: {0}\n".format(data.shape))
    print("\nNumber of unique animals in raw data are: {0}\n".format(data['animal_id'].nunique()))

    print("\nNumber of rows in data having missing values for 'time' attribute are = {0}\n".format(len(list(data[data['time'].isnull()].index))))
    print("\nNumber of rows in data having missing values for 'animal_id' attribute are = {0}\n".format(len(list(data[data['animal_id'].isnull()].index))))
    print("\nRows having missing values for 'time' and/or 'animal_id' will be deleted.\n")

    # Check if 'time' attribute has missing values
    # If yes, delete all rows having missing values
    if data['time'].isnull().values.any():
        data = data[pd.notnull(data['time'])]


    # Check if 'animal_id' attribute has missing values
    # If yes, delete all rows having missing values
    if data['animal_id'].isnull().values.any():
        data = data[pd.notnull(data['animal_id'])]


    # Find duplicate rows based on 'time' & 'animal_id' attributes-
    duplicate_rows = data[data.duplicated(subset=['x', 'y'], keep = 'first')]

    # Get indices for duplicate rows-
    # duplicate_rows.index
    # OR-
    # list(duplicate_rows.index)

    print("\nNumber of duplicate rows in data for 'x' & 'y' attributes are = {0}\n".format(len(list(duplicate_rows.index))))
    print("\nDuplicate rows for 'x' & 'y' attributes will be removed.\n")

    # Remove the duplicated rows found above-
    data.drop(axis = 0, index=list(duplicate_rows.index), inplace = True)

    
    # Return processed data-
    return data

### data_preprocessing() function:
This function takes as input the Pandas DataFrame generated by csv_to_pandas() function.

It prints the following things
1.) Dimension/shape of the raw data file
2.) Number of rows in data having missing values for 'time' attribute
3.) Number of rows in data having missing values for 'animal_id' attribute
4.) Number of unique animals in raw data

Note: Rows having missing values for 'time' and/or 'animal_id' will be deleted!

The function also finds duplicate rows based on 'time' & 'animal_id' attributes, and deletes the duplicate occurrences except the first data point.

Returns the processed data as Pandas DataFrame

In [8]:
# Example of using 'data_preprocessing()' function-
processed_data = data_preprocessing(data)


The dimensions/shape of the raw data file is: (5000, 4)


Number of unique animals in raw data are: 5


Number of rows in data having missing values for 'time' attribute are = 0


Number of rows in data having missing values for 'animal_id' attribute are = 0


Rows having missing values for 'time' and/or 'animal_id' will be deleted.


Number of duplicate rows in data for 'x' & 'y' attributes are = 26


Duplicate rows for 'x' & 'y' attributes will be removed.



In [9]:
def linear_interpolation(data, threshold):
    '''
    Function to interpolate missing values for 'x' and 'y' attributes
    in dataset.
    'threshold' parameter decides the number of rows till which, data
    should NOT be deleted.
    '''

    # Get indices of missing values for 'x' attribute in a list-
    missing_x_values = list(data_missing[data_missing['x'].isnull()].index)

    # Get indices of missing values for 'y' attribute in a list-
    missing_y_values = list(data_missing[data_missing['y'].isnull()].index)

    print("\nNumber of missing values in 'x' attribute = {0}".format(len(missing_x_values)))
    print("Number of missing values in 'y' attribute = {0}\n".format(len(missing_y_values)))
    
    # Find sequences of missing values in 'missing_y_values'-
    # counter for outer loop-
    i = 0

    # counter for inner loop-
    j = 0

    # start and end counters-
    start = end = 0

    # count length of sequence found-
    k = 1

    # threshold = 10

    while i < (len(n) - 1):
        start = end = i
        k = 1
        j = i

        # print("i = {0} & j = {1}".format(i, j))

        while j < (len(n) - 1):
            if n[j] + 1 == n[j + 1]:
                k += 1
                j += 1
                end = j
            else:
                # i = j + 1
                break

        i = j + 1

        if k > 1:
            print("\nSequence length = {0}. Start = {1} & End = {2}".format(k, start, end))

        if k >= threshold:
            print("\nDelete sequence from {0} to {1}\n".format(start, end))
    

### linear_interpolation() function:
This function does linear interpolation (as the name suggests) for missing values for 'x' and 'y' attributes in data.
The function returns the processed Pandas DataFrame which is provided to it as input.

In [10]:
def grouping_data(processed_data):
    '''
    A function to group all values for each 'animal_id'
    Input is 'processed_data' which is processed Pandas DataFrame
    Returns a dictionary where-
    key is animal_id, value in Pandas DataFrame for that 'animal_id'
    '''
    # A dictionary object to hold all groups obtained using group by-
    data_animal_id_groups = {}

    # Group by using 'animal_id' attribute-
    data_animal_id = processed_data.groupby('animal_id')

    # Get each animal_id's data from grouping performed-
    for animal_id in data_animal_id.groups.keys():
        data_animal_id_groups[animal_id] = data_animal_id.get_group(animal_id)

    # To reset index for each group-
    for animal_id in data_animal_id_groups.keys():
        data_animal_id_groups[animal_id].reset_index(drop = True, inplace = True)

    # Add additional attributes/columns to each groups-
    for aid in data_animal_id_groups.keys():
        data = [0 for x in range(data_animal_id_groups[aid].shape[0])]
    
        data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(Distance = data)
        data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(Average_Speed = data)
        data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(Average_Acceleration = data)
        data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(Direction = data)

    return data_animal_id_groups

### grouping_data() function:
The function takes as input a Pandas DataFrame which is processed data obtained by using the functions defined above.
It groups data according to 'animal_id' attribute/feature/column.

The function a Python 3 dictionary, where the key is an 'animal_id' and value is a Pandas DataFrame corresponding to that 'animal_id'

In [11]:
# Example usage-
data_animal_id_groups = grouping_data(processed_data)

In [12]:
type(data_animal_id_groups)

dict

In [13]:
# Print first 5 rows for animal_id = 312
data_animal_id_groups[312].head()

Unnamed: 0,time,animal_id,x,y,Distance,Average_Speed,Average_Acceleration,Direction
0,1,312,405.29,417.76,0,0,0,0
1,2,312,405.31,417.37,0,0,0,0
2,3,312,405.31,417.07,0,0,0,0
3,4,312,405.3,416.86,0,0,0,0
4,5,312,405.29,416.71,0,0,0,0


In [14]:
def compute_distance_and_direction(data_animal_id_groups):
    '''
    Calculate the metric distance and direction between two consecutive time
    frames/time stamps for each moving entity (in this case, fish)
    '''

    start_time = time.time()

    for aid in data_animal_id_groups.keys():
        print("\nComputing Distance & Direction for Animal ID = {0}\n".format(aid))

        # for i in range(1, animal_id.shape[0] - 1):
        for i in range(1, data_animal_id_groups[aid].shape[0] - 1):
            # print("Current i = ", i)

            x1 = data_animal_id_groups[aid].iloc[i, 2]
            y1 = data_animal_id_groups[aid].iloc[i, 3]
            x2 = data_animal_id_groups[aid].iloc[i + 1, 2]
            y2 = data_animal_id_groups[aid].iloc[i + 1, 3]

            # Compute distance between 2 points-
            distance = math.sqrt(math.pow((x2 - x1), 2) + math.pow((y2 - y1), 2))

            # Compute the direction in DEGREES-
            direction = math.degrees(math.atan((y2 - y1) / (x2 - x1)))
            if math.isnan(direction):
                data_animal_id_groups[aid].loc[i, 'Direction'] = 0
                # animal_id.loc[i, 'Direction'] = 0
            else:
                data_animal_id_groups[aid].loc[i, 'Direction'] = direction
                # animal_id.loc[i, 'Direction'] = direction

            # Insert computed distance to column/attribute 'Distance'-
            # animal_id.loc[i, 'Distance'] = distance
            data_animal_id_groups[aid].loc[i, 'Distance'] = distance


    end_time = time.time()
    print("\nTime taken to create distance & direction data = {0:.4f} seconds\n\n".format(end_time - start_time))
    # Time taken to create distance & direction data = 1013.1692 seconds

    return data_animal_id_groups


### compute_distance_and_direction() function:
This function calculates the metric distance and direction between two consecutive time frames/time stamps for each moving entity (in this case, fish).

It takes as input a Python 3 dictionary object which was created using 'grouping_data()' function.
It returns the Python 3 dictionary object which was passed on to it as function argument.

Note: Since, this can be a computationally expensive task, dependent on the dataset being used, the function also prints the amount of time taken to accomplish the task of computing distance and direction.

In [15]:
# Example Usage-
direction_distance_data = compute_distance_and_direction(data_animal_id_groups)


Computing Distance & Direction for Animal ID = 312






Computing Distance & Direction for Animal ID = 511


Computing Distance & Direction for Animal ID = 607


Computing Distance & Direction for Animal ID = 811


Computing Distance & Direction for Animal ID = 905


Time taken to create distance & direction data = 3.5896 seconds




In [16]:
def compute_average_speed(data_animal_id_groups, fps):
    '''
    A function to compute average speed of an animal based on fps
    (frames per second) parameter. Calculate the average speed of a mover,
    based on the pandas dataframe and a frames per second (fps) parameter

    Formula used-
    Average Speed = Total Distance Travelled / Total Time taken
    '''

    start_time = time.time()

    for aid in data_animal_id_groups.keys():
        print("\nComputing Average Speed for Animal ID = {0}\n".format(aid))


        # for i in range (1, animal_id.shape[0] - fps + 1):
        for i in range(1, data_animal_id_groups[aid].shape[0] - fps + 1):
            # print("Current i = ", i)
            
            tot_dist = 0	# total distance travelled
            
            for j in range(i, i + fps):
                # tot_dist += animal_id.loc[j, "Distance"]
                tot_dist += data_animal_id_groups[aid].loc[j, "Distance"]

            # animal_id.loc[i, "Average_Speed"] = (tot_dist / fps)
            data_animal_id_groups[aid].loc[i, "Average_Speed"] = (tot_dist / fps) 

    end_time = time.time()
    print("\nTime taken to create Average Speed data = {0:.4f} seconds.\n".format(end_time - start_time))

    return data_animal_id_groups

### compute_average_speed() function:
This function takes as input a Python 3 dictionary generated from 'compute_distance_and_direction()' function
and a 'fps' or frames per second parameter.

It computes the average speed of an animal based on fps (frames per second) parameter.
The formula used-
Average Speed = Total Distance Travelled / Total Time taken

It returns the Python 3 dictionary provided to it as the first argument.
This function also prints the time taken to finish task as it can be computationally expensive.

In [17]:
# Example Usage-
avg_speed_data = compute_average_speed(direction_distance_data, 3)

# NOTE: Here fps = 3


Computing Average Speed for Animal ID = 312


Computing Average Speed for Animal ID = 511


Computing Average Speed for Animal ID = 607


Computing Average Speed for Animal ID = 811


Computing Average Speed for Animal ID = 905


Time taken to create Average Speed data = 1.9510 seconds.



In [18]:
def compute_average_acceleration(data_animal_id_groups, fps):
	'''
	A function to compute average acceleration of an animal based on fps
	(frames per second) parameter.

	Formulas used are-
	Average Acceleration = (Final Speed - Initial Speed) / Total Time Taken
	'''

	start_time = time.time()

	for aid in data_animal_id_groups.keys():
		print("\nComputing Average Speed for Animal ID = {0}\n".format(aid))


		# for i in range (1, animal_id.shape[0] - fps + 1):
		for i in range(1, data_animal_id_groups[aid].shape[0] - fps + 1):
			# print("Current i = ", i)

			avg_speed = 0

			# Calculating Average Speed-
			avg_speed = data_animal_id_groups[aid].loc[i, 'Average_Speed'] - data_animal_id_groups[aid].loc[i + 1, 'Average_Speed']
			# avg_speed = animal_id.loc[i, "Average_Speed"] - animal_id.loc[i + 1, "Average_Speed"]
			# print("\navg_speed = {0:.4f}\n".format(avg_speed))
			# animal_id.loc[i, "Average_Acceleration"] = (avg_speed / fps)
			data_animal_id_groups[aid].loc[i, 'Average_Acceleration'] = (avg_speed / fps)

	end_time = time.time()
	print("\nTime taken to create Average Acceleration data = {0:.4f} seconds.\n".format(end_time - start_time))
	# Total time taken = 37.8197 seconds.

	# Concatenate all Pandas DataFrame into one-
	result = pd.concat(data_animal_id_groups[aid] for aid in data_animal_id_groups.keys())

	# Reset index-
	result.reset_index(drop=True, inplace=True)

	return result


### compute_average_acceleration() function:
This function takes as input a Python 3 dictionary generated from 'compute_average_speed_direction()' function and a 'fps' or frames per second parameter.

It computes the average acceleration of an animal based on fps (frames per second) parameter. The formula used- Average Acceleration = (Final Speed - Initial Speed) / Total Time Taken

It returns a Pandas DataFrame after concatenating all DataFrames according to animal_id. This function also prints the time taken to finish task as it can be computationally expensive.

In [19]:
# Example usage-
avg_acceleration_data = compute_average_acceleration(avg_speed_data, 3)

# NOTE: fps here is 3


Computing Average Speed for Animal ID = 312


Computing Average Speed for Animal ID = 511


Computing Average Speed for Animal ID = 607


Computing Average Speed for Animal ID = 811


Computing Average Speed for Animal ID = 905


Time taken to create Average Acceleration data = 1.7515 seconds.



In [20]:
# Optional: Save processed and computed data to HDD-
avg_acceleration_data.to_csv("Fish-Complete_Data.csv", index = False)

## Use 'tsfresh' package to extract statistics relating to time series data:

In [21]:
# For extracting all time series related features, do-
extracted_features = extract_features(avg_acceleration_data, column_id = 'animal_id', column_sort = 'time')

Feature Extraction: 100%|██████████| 10/10 [00:31<00:00,  2.94s/it]


In [22]:
# We will now remove all NaN values (that were created by feature calculators,
# that can not be used on the given data, e.g. because it has too low statistics)
# and select only the relevant features next-
impute(extracted_features)

variable,Average_Acceleration__abs_energy,Average_Acceleration__absolute_sum_of_changes,"Average_Acceleration__agg_autocorrelation__f_agg_""mean""__maxlag_40","Average_Acceleration__agg_autocorrelation__f_agg_""median""__maxlag_40","Average_Acceleration__agg_autocorrelation__f_agg_""var""__maxlag_40","Average_Acceleration__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","Average_Acceleration__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","Average_Acceleration__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","Average_Acceleration__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","Average_Acceleration__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,y__symmetry_looking__r_0.9,y__symmetry_looking__r_0.9500000000000001,y__time_reversal_asymmetry_statistic__lag_1,y__time_reversal_asymmetry_statistic__lag_2,y__time_reversal_asymmetry_statistic__lag_3,y__value_count__value_-1,y__value_count__value_0,y__value_count__value_1,y__variance,y__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
312,1.736188,10.25868,0.007281,-0.014531,0.050725,-0.008054,0.329386,0.000797,0.000232,-0.037398,...,1.0,1.0,-1976.747469,-4425.838751,-7393.656099,0.0,0.0,0.0,14719.846366,1.0
511,2.204457,10.833218,-0.003631,-0.057421,0.075008,-0.006289,0.400382,0.000751,0.000175,-0.021057,...,1.0,1.0,-91313.664277,-183730.508764,-277464.990358,0.0,0.0,0.0,19365.815069,1.0
607,1.810427,12.471187,-0.019192,-0.062168,0.043575,-0.001195,0.385191,0.000812,0.000196,-0.006601,...,1.0,1.0,-54896.239469,-110989.340449,-168028.897833,0.0,0.0,0.0,17045.699491,1.0
811,1.207602,13.666506,0.01049,-0.025293,0.043619,0.010634,0.451143,0.000497,9.9e-05,0.020157,...,1.0,1.0,-54660.302468,-110012.297119,-166148.556109,0.0,0.0,0.0,19168.711097,1.0
905,3.127378,13.880315,-0.014344,-0.012543,0.040236,-0.007554,0.301765,0.000967,0.000309,-0.066997,...,1.0,1.0,-114928.3928,-231110.694731,-348445.472764,0.0,0.0,0.0,23246.371565,1.0


In [None]:
# Optional, Save to HDD-
extracted_features.to_csv("Complete_Dataset-fish.csv", index = False)