## Group feature extraction

In [1]:
import movekit as mkit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [12]:
path = "./datasets/fish-5-features.csv"
data = mkit.read_data(path)
data.head()

Unnamed: 0,time,animal_id,x,y,distance,average_speed,average_acceleration,direction,stopped,turning
0,1,312,405.29,417.76,0.0,0.210217,-0.018039,0.0,1,0.0
1000,1,511,369.99,428.78,0.0,0.020944,0.000236,0.0,1,0.0
2000,1,607,390.33,405.89,0.0,0.070235,0.004961,0.0,1,0.0
3000,1,811,445.15,411.94,0.0,0.3705,0.017482,0.0,1,0.0
4000,1,905,366.06,451.76,0.0,0.118,-0.006333,0.0,1,0.0


### Detecting outliers
Function performs detection of outliers, based on the KNN algorithm: user can define the regarding features for the detection, the number of the nearest neighbors taken into account for the outlier classification, the metric to calculate the distance, the method to aggregate the different distances, and the share of outliers.

In [23]:
# Detect outliers based on KNN.
# mkit.outlier_detection(dataset, features=["distance", "average_speed", "average_acceleration", "direction",
# "stopped"], contamination=0.01, n_neighbors=5, method="mean", metric="minkowski")
outs = mkit.outlier_detection(data)
# printing all rows where outliers are present
outs[outs.loc[:,"outlier"] == 1].head()

Unnamed: 0,time,animal_id,outlier,x,y,distance,average_speed,average_acceleration,direction,stopped,turning
2479,480,607,1,60.63,401.17,2.418677,2.503849,0.07379,-97.125016,0,2.81784
2481,482,607,1,60.32,396.22,2.522856,2.479927,0.056199,-92.726311,0,1.74451
1877,878,511,1,511.8,69.58,4.623505,4.293873,0.241837,2.231175,0,3.231872
1878,879,511,1,516.29,70.07,4.516658,4.303061,0.187692,6.228122,0,3.996947
1881,882,511,1,528.46,72.19,4.0224,4.136429,0.040597,11.61722,0,1.925073


In [22]:
# same function, different parameters
other_outs = mkit.outlier_detection(dataset = data, features = ["average_speed", "average_acceleration"], contamination = 0.05, n_neighbors = 8, method = "median", metric = "euclidean")

# printing all rows where outliers are present
other_outs[other_outs.loc[:,"outlier"] == 1].head()

Unnamed: 0,time,animal_id,outlier,x,y,distance,average_speed,average_acceleration,direction,stopped,turning
2324,325,607,1,126.9,410.58,1.05,1.296308,-0.224662,180.0,0,357.814199
2325,326,607,1,126.04,410.56,0.860233,1.065177,-0.244628,-178.66778,0,-358.66778
2326,327,607,1,125.45,410.65,0.596825,0.876628,-0.2642,171.326826,0,349.994606
2327,328,607,1,124.93,410.77,0.533667,0.738006,-0.26343,167.005383,0,-4.321443
365,366,312,1,257.86,403.82,2.462458,2.763925,-0.176182,177.439699,0,-1.117906


### Group-level Analysis

Below we perform Analysis on Group-Level. This consists of:
- Group-Level averages,
- Centroid Medoid computation
- A dynamic time warping matrix, 
- A clustering over time based on absolute features,
- The centroid direction,
- The heading difference of each animal with respect to the current centroid
- The group - polarization for each timestep. 

#### Obtain group-level records for each point in time
Records consist of total group-distance covered, mean speed, mean acceleration and mean distance from centroid for each timestamp. If input doesn't contain centroid or feature data, it is calculated, showing a warning.

In [15]:
group_data = mkit.group_movement(data)
group_data.head()

Calculating centroid distances: 100%|██████████| 1000/1000 [00:05<00:00, 195.10it/s]


Unnamed: 0_level_0,total_dist,mean_speed,mean_acceleration,mean_distance_centroid
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,0.157979,-0.000339,29.4616
2,1.174908,0.157641,-0.000339,29.585
3,1.025155,0.15561,-0.000339,29.6914
4,0.91896,0.153579,-0.000339,29.7782
5,0.830461,0.153341,-0.000339,29.8518


#### Obtain centroid, medoid and distance to centroid for each movement record

In [16]:
movement = mkit.centroid_medoid_computation(data, object_output = False)
movement.head()

Calculating centroid distances: 100%|██████████| 1000/1000 [00:06<00:00, 150.10it/s]


Unnamed: 0,time,animal_id,outlier,x,y,distance,average_speed,average_acceleration,direction,stopped,turning,x_centroid,y_centroid,medoid,distance_to_centroid
0,1,312,0,405.29,417.76,0.0,0.210217,-0.018039,0.0,1,0.0,395.364,423.226,312,11.331
1,2,312,0,405.31,417.37,0.390512,0.192177,-0.018039,-87.064327,1,0.0,395.382,423.22,312,11.523
2,3,312,0,405.31,417.07,0.3,0.174723,-0.018039,-90.0,1,-2.935673,395.392,423.234,312,11.677
3,4,312,0,405.3,416.86,0.210238,0.159133,-0.018039,-92.726311,1,-2.726311,395.396,423.272,312,11.798
4,5,312,0,405.29,416.71,0.150333,0.155506,-0.018039,-93.814075,1,-1.087764,395.394,423.324,312,11.903


#### Get the heading difference between centroids and animal's direction
Stronger gain in y gives positive difference, weaker gain in y gives negative difference, since constant y is defined to be 0 degrees.

In [17]:
centroid_dir = mkit.compute_centroid_direction(data).sort_values(['time','animal_id'])
heading_diff = mkit.get_heading_difference(data)
heading_diff.head()

Calculating centroid distances: 100%|██████████| 1000/1000 [00:03<00:00, 306.85it/s]
Computing centroid direction: 100%|██████████| 100.0/100 [00:00<00:00, 964.31it/s]
Calculating centroid distances: 100%|██████████| 1000/1000 [00:05<00:00, 179.88it/s]
Calculating heading difference: 100%|██████████| 100.0/100 [00:00<00:00, 933.70it/s]


Unnamed: 0,time,animal_id,outlier,x,y,distance,average_speed,average_acceleration,direction,stopped,turning,x_centroid,y_centroid,medoid,distance_to_centroid,centroid_direction,heading_difference
0,1,312,0,405.29,417.76,0.0,0.210217,-0.018039,0.0,1,0.0,395.364,423.226,312,11.331,0.0,0.0
1,2,312,0,405.31,417.37,0.390512,0.192177,-0.018039,-87.064327,1,0.0,395.382,423.22,312,11.523,0.001706,-87.066033
2,3,312,0,405.31,417.07,0.3,0.174723,-0.018039,-90.0,1,-2.935673,395.392,423.234,312,11.677,0.000223,-90.000223
3,4,312,0,405.3,416.86,0.210238,0.159133,-0.018039,-92.726311,1,-2.726311,395.396,423.272,312,11.798,0.002277,-92.728588
4,5,312,0,405.29,416.71,0.150333,0.155506,-0.018039,-93.814075,1,-1.087764,395.394,423.324,312,11.903,0.003656,-93.81773


#### Obtain a matrix, based on dynamic time warping
Each Animal-ID is displayed in the indices, the entries reflect the similarity of the animal's trajectories based on the DTW algorithm.

In [10]:
#Obtain dynamic time warping amongst all trajectories from the animals. The lower the value for two animals is, the more similar their trajectories are based on the DTW algorithm.
#mkit.dtw_matrix(preprocessed_data, path=False, distance=euclidean)
#preprocessed_data: DataFrame containing the movement data.
#path: Boolean to specify if matrix of dtw-path gets returned as well. (the warping path for all the sequence pairs which are examined)
#distance: Specify with distance measure to use. Default: "euclidean". Other example alternatives are pdist or minkowski. (all distances defined by fastdtw package are possible.

mkit.dtw_matrix(data)

Calculating dynamic time warping: 100%|██████████| 5/5 [00:04<00:00,  1.10it/s]


Unnamed: 0,312,511,607,811,905
312,0.0,30843.085403,32859.600139,42461.524553,37916.447829
511,30843.085403,0.0,26931.014323,47116.708116,20967.960073
607,32862.955351,26931.014323,0.0,39859.787924,35711.718898
811,42461.524553,47116.708116,39859.787924,0.0,38379.806433
905,37916.447829,20968.954941,35711.718898,38379.806433,0.0
