### TO DO: 
- Gotta add Pitch and Roll. Also keep in mind that you cannot just remove NAs like you did, because you are going to break these 6.4s time-series. Update: removed the drop_na() statement from the data_description_wrangling_bp.Rmd and included `pitch.angle` and `roll.angle`



In [52]:
import pandas as pd
from pyprojroot.here import here
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
def get_unique_values(df: pd.DataFrame) -> dict:
    """
    Collect the distinct values found in every column of a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    - dict: Maps each column name to the result of calling ``.unique()``
      on that column.
    """
    uniques_by_column = {}
    for column_name in df.columns:
        uniques_by_column[column_name] = df[column_name].unique()
    return uniques_by_column


In [3]:
# Load the cleaned 2019 sheep dataset; the CSV path is built with
# pyprojroot's here() from 'data/clean_sheep_data_2019.csv'.
cleaned_data = pd.read_csv(here('data/clean_sheep_data_2019.csv'))

# Show the distinct values of every column as a quick sanity check.
get_unique_values(cleaned_data)


{'sheep_name': array(['ov1.', 'ov1b', 'ov5.', 'ov6.', 'ov6b', 'ov9.', 'ov10', 'ov11',
        'ov18', 'ov20', 'ov23', 'ov24', 'ov31', 'ov33', 'ov36', 'ov38',
        'ov40', 'ov44', 'ov48', 'ov51', 'ov54', 'ov58', 'ov60'],
       dtype=object),
 'sheep_number': array([ 1,  5,  6,  9, 10, 11, 18, 20, 23, 24, 31, 33, 36, 38, 40, 44, 48,
        51, 54, 58, 60]),
 'year': array([2019]),
 'month': array([ 9, 12]),
 'day': array([24, 17]),
 'hours': array([14, 15, 11, 12, 13]),
 'minutes': array([53, 54, 55, 56, 25, 26, 27,  6,  7,  8, 28, 29, 30, 35, 36, 37, 38,
        24, 18, 19, 20, 21, 22, 23, 58, 59,  0, 31, 32, 33, 52, 57,  9, 10,
         3,  4,  5, 41, 42, 11, 12, 13, 44, 45, 46, 47, 48, 49,  1, 50,  2,
        14, 15, 16, 17, 51, 39, 40, 43, 34]),
 'seconds': array([57, 58, 59,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
        14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47

In [4]:
def filter_and_merge_behaviours(df: pd.DataFrame, behaviours_to_keep: list) -> pd.DataFrame:
    """
    Merge equivalent behaviour / sheep labels and keep only selected behaviours.

    The function:
    1. Relabels rows with behaviour 'fast_walk' as 'walk'.
    2. Relabels the sheep names 'ov1b' and 'ov6b' as 'ov1.' and 'ov6.'.
    3. Drops every row whose (merged) behaviour is not in behaviours_to_keep.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Parameters:
    - df: A pandas DataFrame containing 'behaviours' and 'sheep_name' columns
    - behaviours_to_keep: A list of behaviours to retain in the DataFrame

    Returns:
    - A new DataFrame (original index labels preserved) with the labels merged
      as described above and only the rows whose behaviour is listed in
      behaviours_to_keep
    """
    # Second-recording names ('ov*b') mapped onto the base sheep name ('ov*.').
    sheep_name_aliases = {'ov1b': 'ov1.', 'ov6b': 'ov6.'}

    # assign() builds a new DataFrame, so the caller's frame is never mutated.
    merged = df.assign(
        behaviours=df['behaviours'].replace('fast_walk', 'walk'),
        sheep_name=df['sheep_name'].replace(sheep_name_aliases),
    )

    # Filter on the merged labels so 'fast_walk' rows survive when 'walk' is kept.
    keep_mask = merged['behaviours'].isin(behaviours_to_keep)

    # Explicit copy: later column assignments on the result must not raise
    # SettingWithCopyWarning or touch any intermediate frame.
    return merged.loc[keep_mask].copy()

# Example usage: keep only the 'eating', 'walk' and 'resting' behaviours
behaviours_to_keep = ['eating', 'walk', 'resting']
filtered_data = filter_and_merge_behaviours(cleaned_data, behaviours_to_keep)

# Inspect the distinct values that remain in each column after filtering.
get_unique_values(filtered_data)


{'sheep_name': array(['ov1.', 'ov5.', 'ov6.', 'ov9.', 'ov10', 'ov11', 'ov18', 'ov20',
        'ov23', 'ov24', 'ov31', 'ov33', 'ov36', 'ov38', 'ov40', 'ov44',
        'ov48', 'ov51', 'ov54', 'ov58', 'ov60'], dtype=object),
 'sheep_number': array([ 1,  5,  6,  9, 10, 11, 18, 20, 23, 24, 31, 33, 36, 38, 40, 44, 48,
        51, 54, 58, 60]),
 'year': array([2019]),
 'month': array([ 9, 12]),
 'day': array([24, 17]),
 'hours': array([14, 15, 11, 12, 13]),
 'minutes': array([54, 55, 25, 26, 27,  6,  7,  8, 28, 29, 30, 37, 38, 24, 18, 19, 20,
        21, 22, 23, 58, 59,  0, 31, 32, 33, 52, 53, 56, 57,  9, 10,  3, 41,
        42, 11, 12, 44, 45, 46, 47, 48, 49,  1, 50,  2,  4,  5, 13, 14, 15,
        16, 17, 39, 40, 43, 34, 35, 36, 51]),
 'seconds': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 18, 19, 20, 21, 27,
        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
        45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 13, 14, 15, 16,
        17, 22, 23, 2

In [6]:
# Display the filtered DataFrame together with its per-column unique values.
filtered_data, get_unique_values(filtered_data)

(       sheep_name  sheep_number  year  month  day  hours  minutes  seconds  \
 159          ov1.             1  2019      9   24     14       54        1   
 160          ov1.             1  2019      9   24     14       54        1   
 161          ov1.             1  2019      9   24     14       54        1   
 162          ov1.             1  2019      9   24     14       54        1   
 163          ov1.             1  2019      9   24     14       54        1   
 ...           ...           ...   ...    ...  ...    ...      ...      ...   
 849910       ov60            60  2019      9   24     15       45       59   
 849911       ov60            60  2019      9   24     15       45       59   
 849912       ov60            60  2019      9   24     15       45       59   
 849913       ov60            60  2019      9   24     15       45       59   
 849914       ov60            60  2019      9   24     15       45       59   
 
         event.no.     acc_x  ...     mag_y     ma

In [11]:
# Check the type of the object returned by filter_and_merge_behaviours.
type(filtered_data)

pandas.core.frame.DataFrame

In [14]:

# Keep only the rows of two sheep ('ov1.' and 'ov60') as a small sample.
sampled_data = filtered_data[filtered_data['sheep_name'].isin(['ov1.', 'ov60'])]


# Save the sample as data/sampled_data.csv.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column by default; confirm that is intended (otherwise pass index=False).
sampled_data.to_csv(here('data/sampled_data.csv'))

Notes on the dataset: 

buenas,

tengo una duda sobre los datos de ovejas de pilca. El paper de sofi de los HMM dice:

"Fieldwork was conducted at the “Campo Anexo Pilcaniyeu” from INTA (National Institute of Agricultural Technology) Bariloche, Patagonia Argentina. The data were collected from 25 different sheep during September and December of 2019."

y mi pregunta es: estas ovejas que se filmaron en septiembre y diciembre del 2019, son las mismas ovejas de las cuales hay 1 año de datos que todavía no se puede decodificar? o son otras ovejas?


abrazo


y si son las mismas ovejas, cuáles son las fechas en las que se pusieron y sacaron los collares en este año de datos recolectados?


son los mismos animales

los collares se pusieron en febrero del 2019 y se retiraron en noviembre creo

las filmaciones de diciembre del 2019 son con los mismos animales pero con aparatos puestos ese mismo día.



From kapoor and Narayanan 2022: 
"
Data leakage is a spurious relationship between the independent variables and the target variable that arises as an artifact of the data collection, sampling, or pre-processing strategy. Since the spurious relationship won’t be present in the distribution about which scientific claims are made, leakage usually leads to inflated estimates of model performance.

[...]

In particular, we find that leakage can result from a difference between the distribution of the test set and the distribution of scientific interest (Section 2.4)


"

## Taxonomy of data leakage
[L1] Lack of clean separation of training and test dataset. If the training dataset is not separated from the test dataset during all pre-processing, modeling and evaluation steps, the model has access to information in the test set before its performance is evaluated. Since the model has access to information from the test set at training time, the model learns relationships between the predictors and the outcome that would not be available in additional data drawn from the distribution of interest. The performance of the model on this data therefore does not reflect how well the model would perform on a new test set drawn from the same distribution of data.



[L1.1] No test set...
m'not our case'

[L1.2] Pre-processing on training and test set. Using the entire dataset for any pre-processing steps such as imputation or over/under sampling. For instance, using oversampling before splitting the data into training and test sets leads to an imperfect separation between the training and test sets since data generated using oversampling from the training set will also be present in the test set.
m'gotta think about this over/under sampling, are we going to use it? same thing on imputation. We might use weighted loss functions, which seem better in avoiding problems that arise with over/under sampling'


[L1.3] Feature selection on training and test set. Feature selection on the entire dataset results in using information about which feature performs well on the test set to make a decision about which features should be included in the model.
m'not important because we are not doing feature selection?'

[L1.4] Duplicates in datasets. If a dataset with duplicates is used for the purposes of training and evaluating an ML model, the same data could exist in the training as well as test set.
m'we don't have duplicates'


[L2] Model uses features that are not legitimate. If the model has access to features that should not be legitimately available for use in the modeling exercise, this could result in leakage. One instance when this can happen is if a feature is a proxy for the outcome variable (Kaufman et al., 2012). For example, Filho et al. (2021) find that a recent study included the use of anti-hypertensive drugs as a feature for predicting hypertension. Such a feature could lead to leakage because the model would not have access to this information when predicting the health outcome for a new patient. Further, if the fact that a patient uses anti-hypertensive drugs is already known at prediction time, the prediction of hypertension becomes a trivial task.
m'not our case, our features (acc and mag) are legitimate'

[L3] Test set is not drawn from the distribution of scientific interest. The distribution of data on which the performance of an ML model is evaluated differs from the distribution of data about which the scientific claims are made. The performance of the model on the test set does not correspond to its performance on data drawn from the distribution of scientific interest. 
m'our distribution of scientific interest is the behaviour of the sheep we recorded, but 
m'our distribution of scientific interest is the behaviour of the Pilca sheep recorded with collars from February 2019 to December 2019, and the annotated data is of september and december. So I guess we don't have to make a train-test split that leaves some individuals on one split and other individuals in another split, and we don't have to make a temporal split either.
Now, if we want to expand the distribution of scientific interest to other sheep, we should do this split that puts some individuals in one side and other individuals in another side. 

If we add the new data from Rio Manso, we have to rethink how to make the splits. 
'

[L3.1] Temporal leakage. When an ML model is used to make predictions about a future outcome of interest, the test set should not contain any data from a date before the training set. If the test set contains data from before the training set, the model is built using data “from the future” that it should not have access to during training, and can cause leakage.
m' I think this is not our case, we are not trying to make a forecast'


[L3.2] Nonindependence between train and test samples. Nonindependence between train and test samples constitutes leakage, unless the scientific claim is about a distribution that has the same dependence structure. In the extreme (but unfortunately common) case, train and test samples come from the same people or units. For example, Oner et al. (2020) find that a recent study on histopathology uses different observations of the same patient in the training and test sets. In this case, the scientific claim is being made about the ability to predict gene mutations in new patients; however, it is evaluated on data from old patients (i.e., data from patients in the training set), leading to a mismatch between the test set distribution and the scientific claim. The train-test split should account for the dependencies in the data to ensure correct performance evaluation. Methods such as ‘block cross validation’ can partition the dataset strategically so that the performance evaluation does not suffer from data leakage and overoptimism (Roberts et al., 2017; Valavi et al., 2021). Handling nonindependence between the training and test sets in general—i.e., without any assumptions about independence in the data—is a hard problem, since we might not know the underlying dependency structure of the task in many cases (Malik, 2020).
m'if we don't make the split putting different individuals in different sides, we cannot make the claim that the model can classify behaviours of other sheep, so the model will only be useful for our data. I guess in this case, where a model is only useful within a dataset, the paper would show how to build such model for other researchers to do the same, but it cannot claim that it's a model that other researchers can use' 


[L3.3] Sampling bias in test distribution. Sampling bias in the choice of test dataset can lead to data leakage. One example of sampling bias is spatial bias, which refers to choosing the test data from a geographic location but making claims about model performance in other geographic locations as well. Another example is selection bias, which entails choosing a non-representative subset of the dataset for evaluation. For example, Bone et al. (2015) highlight that in a study on predicting autism using ML models, excluding the data corresponding to borderline cases of autism leads to leakage since the test set is no longer representative of the general population about which claims are made. In addition, borderline cases of autism are often the most tricky to diagnose, so excluding them from the evaluation set is likely to lead to overoptimistic results. Cases of leakage due to sampling bias can often be subtle. For example, Zech et al. (2018) find that models for pneumonia prediction trained on images from one hospital do not generalize to images from another hospital due to subtle differences in how images are generated in each hospital.
m'we should make sure all the behaviours appear in both training and test datasets.'



'A model may have leakage when the distribution about which the scientific claim is made does not match the distribution from which the evaluation set is drawn. ML models may also suffer from a related, but distinct limitation: the lack of generalization when we try to apply a result about one population to another similar but distinct population. [...]
Robustness to distribution shift is an ongoing area of work in ML methods research. Even slight shifts in the target distribution can cause performance estimates to change drastically (Recht et al., 2019). Despite ongoing work to create ML methods that are robust to distribution shift, best practices to deal with distribution shift currently include testing the ML models on the data from the distribution we want to make claims about (Geirhos et al., 2020). In ML-based science, where the aim is to create generalizable knowledge, we should take results that claim to generalize to a different population from the one models were evaluated on with caution.





CONTINUE READING HERE: A model may have leakage when the distribution


also read this: https://reproducible.cs.princeton.edu/
M' THIS ONE IS IMPORTANT BECAUSE AT THE END GOAL WE WANT TO USE IT IN DIFFERENT SHEEP'







DATA QUALITY STUFF: 
"Data quality issues. Access to good quality data is essential for creating ML models (Paullada et al., 2020; Scheuerman et al., 2021). Issues with the quality of the dataset could affect the results of ML-based science. 10 papers in our survey highlighted data quality issues such as not addressing missing values in the data, the small size of datasets compared to the number of predictors, and the outcome variable being a poor proxy for the phenomenon being studied."
m' should we address the missing values or just remove them?'


METRIC CHOICE ISSUES:
"Metric choice issues. A mismatch between the metric used to evaluate performance and the scientific problem of interest leads to issues with performance claims. For example, using accuracy as the evaluation metric with a heavily imbalanced dataset leads to overoptimistic results, since the model can get a high accuracy score by always predicting the majority class. Four papers in our survey highlighted metric choice issues."
m' this can be a problem for us'

ARGUMENTS TO PRESENT

"[L1] Clean train-test separation. The researcher needs to argue why the test set does not interact with training data during any of the preprocessing, modeling or evaluation steps to ensure a clean train-test separation."

"[L2] Each feature in the model is legitimate. The researcher needs to argue why each feature used in their model is legitimate, i.e., a claim made using each feature is of scientific interest. Note that some models might use hundreds of features. In such cases, it is even more important to reason about the correctness of the features used, since the incorrect use of a single feature in the model can cause leakage. That said, the same argument for why a feature is legitimate can often apply to a whole set of features. For example, for a study using individuals’ location history as a feature vector, the use of the entire vector can be justified together. Note that we do not ask for the researcher to list each feature used in their model. Rather, we ask that the justification provided for the legitimacy of the features used in their model should cover every feature used in their model."

"[L3] Test set is drawn from the distribution of scientific interest. If the distribution about which the scientific claims are made is different from the one on which the model is tested, then any claims about the performance of an ML model on the evaluation step fall short. The researcher needs to justify that the test set is drawn from the distribution of scientific interest and there is no selection or sampling bias in the data collection process. This step can help clarify the distribution about which scientific claims are being made and detect temporal leakage."




ChatGPT on data imputation and under/over sampling: Certainly! In the context of your models (both the CNN and the CNN+Transformer), preprocessing is an essential step to ensure that the models can learn effectively from the data. Here, I'll explain two common preprocessing techniques: imputation and over/under-sampling.
1. Imputation

    Explanation: Imputation refers to the process of filling missing values in your dataset. Missing values can occur for various reasons, such as sensor malfunctions or data collection errors. In the context of your models, missing values can be problematic as they might lead to training errors or less effective learning.
    Application: If your animal movement data has occasional missing values (for instance, some accelerometer or magnetometer readings are missing), you'll need to estimate those missing values. Various imputation methods can be employed, such as mean imputation, linear interpolation, or more sophisticated techniques like k-NN imputation or model-based imputation.
    Consideration: The imputation strategy should be chosen based on the nature and pattern of the missing data, and the underlying assumptions about why data is missing. Also, it’s important to evaluate the imputed data’s impact on the model's performance.

2. Over/Under Sampling

    Explanation: Over/under-sampling is used to address class imbalance issues where one or more classes are represented more than others in the training data. It can lead to a biased model that performs poorly on underrepresented classes.
    Application: In your case, if certain behaviors (like “Resting”) are underrepresented in your dataset, you might consider oversampling these classes (creating copies or augmentations of existing samples) or undersampling the overrepresented classes (randomly removing samples) to balance the dataset.
    Consideration: Care should be taken with these methods as oversampling can lead to overfitting, while undersampling can lead to loss of information. Alternative approaches include using weighted loss functions to assign higher importance to underrepresented classes during training.

Specific to Your Models:

    CNN Model:
        Ensure that each segment fed into the model is complete and has consistent dimensions. If any segment has missing values, consider using imputation to fill those gaps.
        If class imbalances exist (some behaviors are more common in the dataset), consider applying over/under-sampling or using class weights during training to mitigate the imbalance.

    CNN + Transformer Model:
        Similar preprocessing steps as the CNN model apply. Additionally, since Transformers are sensitive to the input data's scale and distribution, consider normalizing or standardizing the feature values if needed.
        Given that Transformers can capture complex patterns, be extra cautious with imputation and class balancing to avoid introducing biases or artifacts that the model might exploit.

Final Thoughts:

Preprocessing should be done carefully, considering the data's nature and the models' characteristics. It’s often beneficial to experiment with different preprocessing strategies and evaluate their impacts on model performance systematically. Always validate the effectiveness of preprocessing steps using cross-validation or hold-out validation sets to ensure they generalize well to unseen data.

Certainly! In the context of your models (both the CNN and the CNN+Transformer), preprocessing is an essential step to ensure that the models can learn effectively from the data. Here, I'll explain two common preprocessing techniques: imputation and over/under-sampling.
1. Imputation

    Explanation: Imputation refers to the process of filling missing values in your dataset. Missing values can occur for various reasons, such as sensor malfunctions or data collection errors. In the context of your models, missing values can be problematic as they might lead to training errors or less effective learning.
    Application: If your animal movement data has occasional missing values (for instance, some accelerometer or magnetometer readings are missing), you'll need to estimate those missing values. Various imputation methods can be employed, such as mean imputation, linear interpolation, or more sophisticated techniques like k-NN imputation or model-based imputation.
    Consideration: The imputation strategy should be chosen based on the nature and pattern of the missing data, and the underlying assumptions about why data is missing. Also, it’s important to evaluate the imputed data’s impact on the model's performance.

2. Over/Under Sampling

    Explanation: Over/under-sampling is used to address class imbalance issues where one or more classes are represented more than others in the training data. It can lead to a biased model that performs poorly on underrepresented classes.
    Application: In your case, if certain behaviors (like “Resting”) are underrepresented in your dataset, you might consider oversampling these classes (creating copies or augmentations of existing samples) or undersampling the overrepresented classes (randomly removing samples) to balance the dataset.
    Consideration: Care should be taken with these methods as oversampling can lead to overfitting, while undersampling can lead to loss of information. Alternative approaches include using weighted loss functions to assign higher importance to underrepresented classes during training.

Specific to Your Models:

    CNN Model:
        Ensure that each segment fed into the model is complete and has consistent dimensions. If any segment has missing values, consider using imputation to fill those gaps.
        If class imbalances exist (some behaviors are more common in the dataset), consider applying over/under-sampling or using class weights during training to mitigate the imbalance.

    CNN + Transformer Model:
        Similar preprocessing steps as the CNN model apply. Additionally, since Transformers are sensitive to the input data's scale and distribution, consider normalizing or standardizing the feature values if needed.
        Given that Transformers can capture complex patterns, be extra cautious with imputation and class balancing to avoid introducing biases or artifacts that the model might exploit.

Final Thoughts:


Preprocessing should be done carefully, considering the data's nature and the models' characteristics. It’s often beneficial to experiment with different preprocessing strategies and evaluate their impacts on model performance systematically. Always validate the effectiveness of preprocessing steps using cross-validation or hold-out validation sets to ensure they generalize well to unseen data.


m'Can you tell me more about weighted loss functions?'

Certainly! A weighted loss function is used in training machine learning models to handle imbalanced datasets or give different importances to different classes or samples. This technique helps prevent the model from being biased towards the overrepresented class(es).
Types of Weighting in Loss Functions:

    Class Weighting:
        Explanation: Different weights are assigned to different classes. The loss for each class will be scaled by its assigned weight. If a class is underrepresented, giving it a higher weight can help the model pay more attention to it during training.
        Application: In binary and multi-class classification problems, like your animal behavior classification task, if one behavior (e.g., "Walking") is significantly more common in the training data than others (e.g., "Eating," "Resting"), you can assign higher weights to the underrepresented classes to balance the training process.

    Sample Weighting:
        Explanation: Individual samples are assigned different weights. This can be useful if some samples are more important or reliable than others.
        Application: It's less common but can be used in scenarios where, for example, certain data points are known to be labeled with high confidence, or some data points are more critical to get right for the application in use.

Implementing Weighted Loss Function in Keras:

For your specific case of a multi-class classification problem, you can assign weights to different classes using the class_weight parameter during the training. Here's a simplified example:

python

# Assume class_weights is a dictionary that maps class indices to the weights you want to assign to each class
class_weights = {0: 1.0,  # "Walking"
                 1: 2.0,  # "Eating"
                 2: 3.0}  # "Resting"

model.fit(x_train, y_train, epochs=10, batch_size=32, class_weight=class_weights)

In this example:

    The class "Walking" is given a weight of 1.0, meaning it’s treated "normally" during training.
    The class "Eating" is given a higher weight of 2.0, indicating that the loss for this class will have twice the effect compared to "Walking."
    The class "Resting" gets the highest weight of 3.0.

Considerations:

    Choosing Weights:
        It’s crucial to select appropriate weights. You can start with the inverse of the class frequencies as weights, but it might require tuning.
        Avoid excessively high weights for the minority class, as it can lead the model to focus too much on those and perform poorly on the majority class.

    Evaluation Metrics:
        Monitor various evaluation metrics during training to ensure that the model isn’t just improving the loss by focusing on one class. Metrics like precision, recall, and F1-score for each class can be particularly informative.

    Experimental Validation:
        Always validate the effectiveness of the chosen class weights using a validation dataset to ensure the model generalizes well and doesn’t overfit the training data.

In the context of your animal behavior classification task, using a weighted loss function can help the model learn to classify behaviors more equitably, especially if some behaviors are less frequent in the dataset. Make sure to monitor the model's performance to ensure it's learning meaningful patterns and not just catering to the weighted classes.

Certainly! A weighted loss function is used in training machine learning models to handle imbalanced datasets or give different importances to different classes or samples. This technique helps prevent the model from being biased towards the overrepresented class(es).
Types of Weighting in Loss Functions:

    Class Weighting:
        Explanation: Different weights are assigned to different classes. The loss for each class will be scaled by its assigned weight. If a class is underrepresented, giving it a higher weight can help the model pay more attention to it during training.
        Application: In binary and multi-class classification problems, like your animal behavior classification task, if one behavior (e.g., "Walking") is significantly more common in the training data than others (e.g., "Eating," "Resting"), you can assign higher weights to the underrepresented classes to balance the training process.

    Sample Weighting:
        Explanation: Individual samples are assigned different weights. This can be useful if some samples are more important or reliable than others.
        Application: It's less common but can be used in scenarios where, for example, certain data points are known to be labeled with high confidence, or some data points are more critical to get right for the application in use.

Implementing Weighted Loss Function in Keras:

For your specific case of a multi-class classification problem, you can assign weights to different classes using the class_weight parameter during the training. Here's a simplified example:

python

# Assume class_weights is a dictionary that maps class indices to the weights you want to assign to each class
class_weights = {0: 1.0,  # "Walking"
                 1: 2.0,  # "Eating"
                 2: 3.0}  # "Resting"

model.fit(x_train, y_train, epochs=10, batch_size=32, class_weight=class_weights)

In this example:

    The class "Walking" is given a weight of 1.0, meaning it’s treated "normally" during training.
    The class "Eating" is given a higher weight of 2.0, indicating that the loss for this class will have twice the effect compared to "Walking."
    The class "Resting" gets the highest weight of 3.0.

Considerations:

    Choosing Weights:
        It’s crucial to select appropriate weights. You can start with the inverse of the class frequencies as weights, but it might require tuning.
        Avoid excessively high weights for the minority class, as it can lead the model to focus too much on those and perform poorly on the majority class.

    Evaluation Metrics:
        Monitor various evaluation metrics during training to ensure that the model isn’t just improving the loss by focusing on one class. Metrics like precision, recall, and F1-score for each class can be particularly informative.

    Experimental Validation:
        Always validate the effectiveness of the chosen class weights using a validation dataset to ensure the model generalizes well and doesn’t overfit the training data.

In the context of your animal behavior classification task, using a weighted loss function can help the model learn to classify behaviors more equitably, especially if some behaviors are less frequent in the dataset. Make sure to monitor the model's performance to ensure it's learning meaningful patterns and not just catering to the weighted classes.

Description of data preprocessing: 

CASE 1. Just decoding our dataset
8 features will be used (acc_*, mag_*, pitch, and roll) in the deep learning model. 3 behaviours will be classified (resting, walk, eating). Behaviours like `eating_down` and `eating_up` will be clumped into a single behaviour named `eating`. Behaviours like `walk` and `fast_walk` will be clumped into the single behaviour `walk`. Behaviour `vigilance` will be ignored for now. 
We will do an 80/20 training/testing split. Since we want the training and testing splits to have the same amount of data of each behaviour and each sheep, we will first split the dataframe into sub-dataframes that only contain a given behaviour and then split into further sub-dataframes per sheep. Then we will do further splits to get continuous segments of 6.4s (that is, 6.4 * 40 = 256 rows, because it's 40Hz data), i.e. a short 6.4s time-series of a single sheep. Then we will do the 80/20 train/test splits and merge back all the sub-datasets. Once we have the train/test splits, we will apply min-max feature normalization to each split separately to avoid data leakage.   
Notes on train/test splits: If we simply


It's not okay to first do a per behaviour split, because if we do that we will end up mixing rows that do not correspond to the 6.4s "time series". 



In [8]:
# NOTE(review): `filtered_df` is not defined in the cells visible here (earlier
# cells name the filtered frame `filtered_data`) — confirm which one is meant.
get_unique_values(filtered_df)

{'sheep_name': array(['ov1.', 'ov5.', 'ov6.', 'ov9.', 'ov10', 'ov11', 'ov18', 'ov20',
        'ov23', 'ov24', 'ov31', 'ov33', 'ov36', 'ov38', 'ov40', 'ov44',
        'ov48', 'ov51', 'ov54', 'ov58', 'ov60'], dtype=object),
 'sheep_number': array([ 1,  5,  6,  9, 10, 11, 18, 20, 23, 24, 31, 33, 36, 38, 40, 44, 48,
        51, 54, 58, 60]),
 'year': array([2019]),
 'month': array([ 9, 12]),
 'day': array([24, 17]),
 'hours': array([14, 15, 11, 12, 13]),
 'minutes': array([54, 55, 25, 26, 27,  6,  7,  8, 28, 29, 30, 37, 38, 24, 18, 19, 20,
        21, 22, 23, 58, 59,  0, 31, 32, 33, 52, 53, 56, 57,  9, 10,  3, 41,
        42, 44, 45, 46, 47, 48, 49,  1, 11, 50,  2,  4,  5, 12, 13, 14, 15,
        16, 17, 39, 40, 43, 34, 35, 36, 51]),
 'seconds': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 18, 19, 20, 21, 27,
        28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
        45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 13, 14, 15, 16,
        17, 22, 23, 2

In [25]:
# Print the column dtypes and non-null counts of the filtered frame.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 617216 entries, 159 to 798652
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   sheep_name          617216 non-null  object 
 1   sheep_number        617216 non-null  int64  
 2   year                617216 non-null  int64  
 3   month               617216 non-null  int64  
 4   day                 617216 non-null  int64  
 5   hours               617216 non-null  int64  
 6   minutes             617216 non-null  int64  
 7   seconds             617216 non-null  int64  
 8   event.no.           617216 non-null  int64  
 9   acc_x               617216 non-null  float64
 10  acc_y               617216 non-null  float64
 11  acc_z               617216 non-null  float64
 12  mag_x               617216 non-null  float64
 13  mag_y               617216 non-null  float64
 14  mag_z               617216 non-null  float64
 15  behaviours          617216 non-null  

In [9]:
def prepare_data_for_model_v2(df: pd.DataFrame, features: list, test_size: float = 0.2, min_samples: int = 5) -> (pd.DataFrame, pd.DataFrame):
    """
    Build normalized train/test DataFrames, splitting separately for every
    (behaviour, sheep) combination.

    Steps:
    - Splits data by behaviour and then by sheep
    - For each group with at least `min_samples` rows, performs a random
      train/test split (80/20 with the default `test_size`); smaller groups
      go entirely into the training set
    - Fits a Min-Max scaler on the training features only and applies it to
      both sets

    NOTE: `train_test_split` is called with its default shuffling, so rows are
    assigned to train/test individually and at random. Neighbouring rows of the
    same recording can therefore end up in different sets.

    Parameters:
    - df: The original DataFrame; must contain the columns 'behaviours',
      'sheep_name' and every column named in `features`
    - features: The list of features to be used in the model and to be normalized
    - test_size: The proportion of each group to include in the test split
    - min_samples: Minimum number of rows a (behaviour, sheep) group needs in
      order to be split; smaller groups are used for training only

    Returns:
    - train_df: The training DataFrame, with specified features normalized
    - test_df: The testing DataFrame, with specified features normalized
      (empty, with the same columns as `df`, if no group was large enough to split)

    Raises:
    - ValueError: If `df` yields no rows for the training set
    """

    train_dfs = []
    test_dfs = []

    # Split DataFrame by unique behaviours, then by unique sheep
    for behaviour in df['behaviours'].unique():
        behaviour_df = df[df['behaviours'] == behaviour]

        for sheep in behaviour_df['sheep_name'].unique():
            sheep_df = behaviour_df[behaviour_df['sheep_name'] == sheep]

            if len(sheep_df) >= min_samples:
                # Fixed random_state keeps the split reproducible between runs
                train, test = train_test_split(sheep_df, test_size=test_size, random_state=42)
                train_dfs.append(train)
                test_dfs.append(test)
            else:
                # Not enough data for a split, use all data for training
                train_dfs.append(sheep_df)

    if not train_dfs:
        raise ValueError("No rows available to build the training set")

    # Merge all train/test DataFrames back together
    train_df = pd.concat(train_dfs, ignore_index=True)
    # pd.concat refuses an empty list, so fall back to an empty frame with the same columns
    test_df = pd.concat(test_dfs, ignore_index=True) if test_dfs else df.iloc[0:0].copy()

    # Fit the scaler on the training data only, then reuse it for the test data to avoid data leakage
    scaler = MinMaxScaler()
    train_df[features] = scaler.fit_transform(train_df[features])
    if not test_df.empty:
        test_df[features] = scaler.transform(test_df[features])

    return train_df, test_df

# Apply the updated function and get the train and test DataFrames
# NOTE: `df` and `features` must already be defined when this cell runs; the
# recorded run of this cell failed with a NameError because `df` was not defined.
train_df, test_df = prepare_data_for_model_v2(df, features)

# Display the first few rows of the train and test DataFrames
train_df.head(), test_df.head()

NameError: name 'df' is not defined

We will create the 6.4s segments of continuous data. Description of this: 

This dataset contains sheep movement data that was recorded during two months of 2019: September (9) and December (12). The data has a temporal structure (the sheep data was recorded sequentially). Each row corresponds to one sample of the `acc_*` variables, which are measured at 40 Hz, so there are 40 rows for each second of each `sheep_name`.
What I need are subsets of this data that represent 6.4 s (40 x 6.4 = 256 rows) of CONTINUOUS data from a given sheep, i.e. CONTINUOUS time-series fragments of 6.4 s of data.

The function below:

    Filters the dataset by the specified sheep name and month.
    Divides the filtered data into non-overlapping segments, each containing 256 consecutive rows (equivalent to 6.4 seconds of data).
    Returns a list of DataFrames, each containing one of these segments.



In [26]:
df = filtered_data

In [24]:
# This function does not care about behaviour quality
def extract_time_segments_v1(df, sheep_name, month, segment_size=256):
    """
    Cut one sheep's data for one month into back-to-back, fixed-length chunks.

    Rows are taken in the order they appear in `df`; behaviour labels are not
    inspected. Trailing rows that do not fill a complete chunk are dropped.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - segment_size: The number of rows in each segment (default is 256 for 6.4s of data at 40Hz).

    Returns:
        - A list of DataFrames, one per chunk of `segment_size` rows, keeping
          the original index labels of `df`.
    """
    # Keep only the rows for the requested sheep and month
    is_target = (df['sheep_name'] == sheep_name) & (df['month'] == month)
    subset = df[is_target]

    # Only complete chunks are emitted
    n_full = len(subset) // segment_size

    chunks = []
    for k in range(n_full):
        lo = k * segment_size
        chunks.append(subset.iloc[lo:lo + segment_size])

    return chunks

# Test the function with a specific sheep and month
sample_segments = extract_time_segments_v1(df, 'ov1.', 9)

# Print the total number of extracted segments and display the first segment as an example
len(sample_segments), sample_segments[0] if sample_segments else None


(29,
     sheep_name  sheep_number  year  month  day  hours  minutes  seconds  \
 159       ov1.             1  2019      9   24     14       54        1   
 160       ov1.             1  2019      9   24     14       54        1   
 161       ov1.             1  2019      9   24     14       54        1   
 162       ov1.             1  2019      9   24     14       54        1   
 163       ov1.             1  2019      9   24     14       54        1   
 ..         ...           ...   ...    ...  ...    ...      ...      ...   
 410       ov1.             1  2019      9   24     14       54        7   
 411       ov1.             1  2019      9   24     14       54        7   
 412       ov1.             1  2019      9   24     14       54        7   
 413       ov1.             1  2019      9   24     14       54        7   
 414       ov1.             1  2019      9   24     14       54        7   
 
      event.no.     acc_x  ...     mag_y     mag_z  pitch.angle  roll.angle  \
 1

In [23]:
#lost rows during segment extraction. 
len(filtered_data[(filtered_data['sheep_name'] == 'ov1.') & (filtered_data['month'] == 9)]) - len(sample_segments) * len(sample_segments[0])

118

In [27]:
def extract_time_segments_with_behaviour(df, sheep_name, month, segment_size=256):
    """
    Extracts non-overlapping segments of consecutive rows for a specified sheep and month,
    ensuring each segment has a consistent behaviour throughout.

    The filtered rows are scanned with a window of `segment_size` rows. A window whose
    rows all share a single behaviour is kept and the scan jumps past it; otherwise the
    window start moves forward by one row.

    NOTE: "continuous" here means adjacent rows after filtering; timestamps are not checked.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - segment_size: The number of rows in each segment (default is 256 for 6.4s of data at 40Hz).

    Returns:
        - A list of DataFrames, each containing `segment_size` consecutive rows for the
          specified sheep and month with consistent behaviour. Row labels are positions
          within the filtered data (the index is reset).
    """
    # Filter the DataFrame based on the sheep_name and month
    filtered_df = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)].reset_index(drop=True)

    segments = []
    last_start = len(filtered_df) - segment_size  # Start of the final full-length window
    i = 0

    # `<=` so the final full-length window is evaluated too
    while i <= last_start:
        segment = filtered_df.iloc[i : i+segment_size]

        # Check if the behaviour is consistent throughout the segment
        if len(segment['behaviours'].unique()) == 1:
            segments.append(segment)
            i += segment_size  # Move to the next non-overlapping segment
        else:
            i += 1  # Move by one row and try again

    return segments

# Test the updated function with a specific sheep and month
sample_segments_with_behaviour = extract_time_segments_with_behaviour(df, 'ov1.', 9)

# Print the total number of extracted segments and display the first segment as an example
len(sample_segments_with_behaviour), sample_segments_with_behaviour[0] if sample_segments_with_behaviour else None


(25,
     sheep_name  sheep_number  year  month  day  hours  minutes  seconds  \
 0         ov1.             1  2019      9   24     14       54        1   
 1         ov1.             1  2019      9   24     14       54        1   
 2         ov1.             1  2019      9   24     14       54        1   
 3         ov1.             1  2019      9   24     14       54        1   
 4         ov1.             1  2019      9   24     14       54        1   
 ..         ...           ...   ...    ...  ...    ...      ...      ...   
 251       ov1.             1  2019      9   24     14       54        7   
 252       ov1.             1  2019      9   24     14       54        7   
 253       ov1.             1  2019      9   24     14       54        7   
 254       ov1.             1  2019      9   24     14       54        7   
 255       ov1.             1  2019      9   24     14       54        7   
 
      event.no.     acc_x  ...     mag_y     mag_z  pitch.angle  roll.angle  \
 0

In [28]:
def extract_time_segments_with_threshold(df, sheep_name, month, behaviour_threshold=90, segment_size=256):
    """
    Extracts non-overlapping segments of consecutive rows for a specified sheep and month,
    ensuring each segment has at least a certain percentage of the same behaviour.

    The filtered rows are scanned with a window of `segment_size` rows. A window is kept
    when its most frequent behaviour covers at least `behaviour_threshold` percent of the
    rows, and the scan then jumps past it; otherwise the window start moves forward by one row.

    NOTE: "continuous" here means adjacent rows after filtering; timestamps are not checked.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - behaviour_threshold: The minimum percentage of rows with the same behaviour in each segment.
        - segment_size: The number of rows in each segment (default is 256 for 6.4s of data at 40Hz).

    Returns:
        - A list of DataFrames, each containing `segment_size` consecutive rows for the
          specified sheep and month with the behaviour meeting the threshold.
    """
    # Filter the DataFrame based on the sheep_name and month
    filtered_df = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)].reset_index(drop=True)

    segments = []
    last_start = len(filtered_df) - segment_size  # Start of the final full-length window
    i = 0

    # `<=` so the final full-length window is evaluated too
    while i <= last_start:
        segment = filtered_df.iloc[i : i+segment_size]

        # value_counts sorts by frequency, so the first entry is the most common behaviour
        most_common_behaviour_percentage = segment['behaviours'].value_counts(normalize=True).iloc[0] * 100

        # Check if the behaviour meets the threshold
        if most_common_behaviour_percentage >= behaviour_threshold:
            segments.append(segment)
            i += segment_size  # Move to the next non-overlapping segment
        else:
            i += 1  # Move by one row and try again

    return segments

# Test the updated function with a specific sheep, month, and behaviour threshold
sample_segments_with_threshold = extract_time_segments_with_threshold(df, 'ov1.', 9, behaviour_threshold=90)

# Print the total number of extracted segments and display the first segment as an example
len(sample_segments_with_threshold), sample_segments_with_threshold[0] if sample_segments_with_threshold else None


(27,
     sheep_name  sheep_number  year  month  day  hours  minutes  seconds  \
 0         ov1.             1  2019      9   24     14       54        1   
 1         ov1.             1  2019      9   24     14       54        1   
 2         ov1.             1  2019      9   24     14       54        1   
 3         ov1.             1  2019      9   24     14       54        1   
 4         ov1.             1  2019      9   24     14       54        1   
 ..         ...           ...   ...    ...  ...    ...      ...      ...   
 251       ov1.             1  2019      9   24     14       54        7   
 252       ov1.             1  2019      9   24     14       54        7   
 253       ov1.             1  2019      9   24     14       54        7   
 254       ov1.             1  2019      9   24     14       54        7   
 255       ov1.             1  2019      9   24     14       54        7   
 
      event.no.     acc_x  ...     mag_y     mag_z  pitch.angle  roll.angle  \
 0

In [32]:
def extract_time_segments_with_threshold_v2(df, sheep_name, month, behaviour_threshold=90, segment_size=256):
    """
    Extracts non-overlapping segments of consecutive rows for a specified sheep and month,
    ensuring each segment has at least a certain percentage of the same behaviour.
    Adds a new column indicating the majority behaviour in each segment.

    The filtered rows are scanned with a window of `segment_size` rows. A window is kept
    when its most frequent behaviour covers at least `behaviour_threshold` percent of the
    rows, and the scan then jumps past it; otherwise the window start moves forward by one row.

    NOTE: "continuous" here means adjacent rows after filtering; timestamps are not checked.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - behaviour_threshold: The minimum percentage of rows with the same behaviour in each segment.
        - segment_size: The number of rows in each segment (default is 256 for 6.4s of data at 40Hz).

    Returns:
        - A list of DataFrames (copies, so `df` is not modified), each containing
          `segment_size` consecutive rows with the behaviour meeting the threshold, plus a
          'behaviour_majority_label' column holding the segment's most common behaviour.
    """
    # Filter the DataFrame based on the sheep_name and month
    filtered_df = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)].reset_index(drop=True)

    segments = []
    last_start = len(filtered_df) - segment_size  # Start of the final full-length window
    i = 0

    # `<=` so the final full-length window is evaluated too
    while i <= last_start:
        segment = filtered_df.iloc[i : i+segment_size].copy()  # Make a copy to avoid warnings when adding new column

        # value_counts sorts by frequency, so the first entry is the most common behaviour
        behaviours_count = segment['behaviours'].value_counts(normalize=True)
        most_common_behaviour = behaviours_count.index[0]
        most_common_behaviour_percentage = behaviours_count.iloc[0] * 100

        # Check if the behaviour meets the threshold
        if most_common_behaviour_percentage >= behaviour_threshold:
            # Add the majority behaviour label to the segment
            segment['behaviour_majority_label'] = most_common_behaviour
            segments.append(segment)
            i += segment_size  # Move to the next non-overlapping segment
        else:
            i += 1  # Move by one row and try again

    return segments

# Test the updated function with a specific sheep, month, and behaviour threshold
sample_segments_with_threshold_v2 = extract_time_segments_with_threshold_v2(df, 'ov1.', 9, behaviour_threshold=90)

# Print the total number of extracted segments and display the first segment as an example
# (the emptiness check must use the list created above, not a name from an earlier session)
len(sample_segments_with_threshold_v2), sample_segments_with_threshold_v2[0] if sample_segments_with_threshold_v2 else None


(27,
     sheep_name  sheep_number  year  month  day  hours  minutes  seconds  \
 0         ov1.             1  2019      9   24     14       54        1   
 1         ov1.             1  2019      9   24     14       54        1   
 2         ov1.             1  2019      9   24     14       54        1   
 3         ov1.             1  2019      9   24     14       54        1   
 4         ov1.             1  2019      9   24     14       54        1   
 ..         ...           ...   ...    ...  ...    ...      ...      ...   
 251       ov1.             1  2019      9   24     14       54        7   
 252       ov1.             1  2019      9   24     14       54        7   
 253       ov1.             1  2019      9   24     14       54        7   
 254       ov1.             1  2019      9   24     14       54        7   
 255       ov1.             1  2019      9   24     14       54        7   
 
      event.no.     acc_x  ...     mag_z  pitch.angle  roll.angle  behaviours  \


In [None]:
def extract_consecutive_segments(df, sheep_name, month, behaviour_threshold=90, segment_size=256, sequence_length=20):
    """
    Extracts sequences of consecutive segments for a specified sheep and month,
    ensuring each segment in the sequence has at least a certain percentage of the same behaviour.
    Adds a new column indicating the majority behaviour in each segment.

    The filtered rows are scanned with a window of `segment_size * sequence_length` rows.
    The window is split into `sequence_length` segments; if every segment meets the
    threshold the window is kept and the scan jumps past it, otherwise the window start
    moves forward by one segment (`segment_size` rows).

    NOTE: "continuous" here means adjacent rows after filtering; timestamps are not checked.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - behaviour_threshold: The minimum percentage of rows with the same behaviour in each segment.
        - segment_size: The number of rows in each segment (default is 256 for 6.4s of data at 40Hz).
        - sequence_length: The number of consecutive segments in each sequence.

    Returns:
        - A list of DataFrames, each containing one sequence of `sequence_length` consecutive
          segments (index reset to 0..n-1) with a 'behaviour_majority_label' column holding
          the most common behaviour of the segment each row belongs to.
    """

    # Filter the DataFrame based on the sheep_name and month
    filtered_df = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)].reset_index(drop=True)

    sequences = []
    sequence_rows = segment_size * sequence_length
    last_start = len(filtered_df) - sequence_rows  # Start of the final full-length window
    i = 0

    # `<=` so the final full-length window is evaluated too
    while i <= last_start:
        # Split the window into segments (copies, so the label column can be added safely)
        segments = [
            filtered_df.iloc[i + j*segment_size : i + (j+1)*segment_size].copy()
            for j in range(sequence_length)
        ]
        valid_sequence = all(
            segment['behaviours'].value_counts(normalize=True).iloc[0] * 100 >= behaviour_threshold
            for segment in segments
        )

        if valid_sequence:
            # Add the majority behaviour label to the segments
            for segment in segments:
                segment['behaviour_majority_label'] = segment['behaviours'].mode()[0]

            sequences.append(pd.concat(segments, ignore_index=True))
            i += sequence_rows  # Move to the next non-overlapping sequence
        else:
            i += segment_size  # Move by one segment and try again

    return sequences

# Test the updated function with a specific sheep, month, and a lower behaviour threshold of 60%
sample_consecutive_segments_60 = extract_consecutive_segments(df, 'ov1.', 9, behaviour_threshold=60)

# Print the total number of extracted sequences and display the first sequence as an example
len(sample_consecutive_segments_60), sample_consecutive_segments_60[0] if sample_consecutive_segments_60 else None


In [None]:
from itertools import product



def evaluate_parameter_combinations(df, sheep_name, month, segment_sizes, sequence_lengths, behaviour_thresholds):
    """
    Evaluate different combinations of parameters to extract consecutive segments of continuous data.

    For every combination of segment size, sequence length and behaviour threshold,
    runs `extract_consecutive_segments` and records how many sequences were extracted
    and what percentage of the sheep's rows ended up in no sequence.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - segment_sizes: A list of segment sizes to evaluate.
        - sequence_lengths: A list of sequence lengths to evaluate.
        - behaviour_thresholds: A list of behaviour thresholds to evaluate.

    Returns:
        - A DataFrame containing the results of the evaluation for each combination of
          parameters, one row per combination, in the iteration order of `product`.
    """
    columns = ["Segment Size", "Sequence Length", "Behaviour Threshold",
               "Number of Sequences", "Information Lost (%)"]

    # Filter the DataFrame based on the sheep_name and month
    filtered_df = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)]
    total_rows = len(filtered_df)

    # Collect plain dicts and build the DataFrame once at the end; DataFrame has no
    # row-wise `concat` method and growing a frame inside a loop is slow anyway
    rows = []

    # Iterate over all combinations of the parameters
    for segment_size, sequence_length, behaviour_threshold in product(segment_sizes, sequence_lengths, behaviour_thresholds):
        # Extract sequences using the current combination of parameters
        sequences = extract_consecutive_segments(filtered_df, sheep_name, month,
                                                 behaviour_threshold, segment_size, sequence_length)

        # Calculate the number of rows included in the extracted sequences
        included_rows = sum(len(sequence) for sequence in sequences)

        # Calculate the information lost (guarding against an empty selection)
        information_lost = 100 * (total_rows - included_rows) / total_rows if total_rows > 0 else 0

        rows.append({
            "Segment Size": segment_size,
            "Sequence Length": sequence_length,
            "Behaviour Threshold": behaviour_threshold,
            "Number of Sequences": len(sequences),
            "Information Lost (%)": information_lost
        })

    # Passing `columns` keeps the expected header even when there are no combinations
    return pd.DataFrame(rows, columns=columns)



# Define the possible values for each parameter

segment_sizes = [256, 128, 64]

sequence_lengths = [20, 15, 10, 5]

behaviour_thresholds = [100, 95, 90, 85, 80]



# Evaluate the combinations of parameters for a specific sheep and month

evaluation_results = evaluate_parameter_combinations(df, 'ov1.', 9, segment_sizes, sequence_lengths, behaviour_thresholds)



# Display the results

evaluation_results


In [71]:
def evaluate_parameter_combinations(df, sheep_name, month, segment_sizes, sequence_lengths, behaviour_thresholds):
    """
    Grid-evaluate the sequence-extraction parameters for one sheep and month.

    Every combination of segment size, sequence length and behaviour threshold is
    passed to `extract_consecutive_segments_fixed`; for each one the number of
    extracted sequences and the share of rows left out of all sequences is recorded.

    Parameters:
        - df: The DataFrame containing the sheep movement data.
        - sheep_name: The name of the sheep to filter the data.
        - month: The month to filter the data.
        - segment_sizes: A list of segment sizes to evaluate.
        - sequence_lengths: A list of sequence lengths to evaluate.
        - behaviour_thresholds: A list of behaviour thresholds to evaluate.

    Returns:
        - A DataFrame with one row per parameter combination, sorted by
          "Information Lost (%)", then "Sequence Length", then "Segment Size".
    """
    # Restrict to the requested sheep and month; this is the baseline row count
    subset = df[(df['sheep_name'] == sheep_name) & (df['month'] == month)]
    n_rows = len(subset)

    rows = []
    for seg_size, seq_len, threshold in product(segment_sizes, sequence_lengths, behaviour_thresholds):
        extracted = extract_consecutive_segments_fixed(
            subset, sheep_name, month, threshold, seg_size, seq_len)

        # Rows that made it into some sequence versus rows left out
        kept = sum(len(sequence) for sequence in extracted)
        if n_rows > 0:
            lost_pct = 100 * (n_rows - kept) / n_rows
        else:
            lost_pct = 0

        rows.append({
            "Segment Size": seg_size,
            "Sequence Length": seq_len,
            "Behaviour Threshold": threshold,
            "Number of Sequences": len(extracted),
            "Information Lost (%)": lost_pct
        })

    summary = pd.DataFrame(rows)
    return summary.sort_values(by=["Information Lost (%)", 'Sequence Length', 'Segment Size'])



# Re-run the evaluation

evaluation_results = evaluate_parameter_combinations(

    df, 'ov1.', 9, segment_sizes, sequence_lengths, behaviour_thresholds)



# Display the results

evaluation_results

Unnamed: 0,Segment Size,Sequence Length,Behaviour Threshold,Number of Sequences,Information Lost (%)
57,64,5,90,21,10.898966
58,64,5,85,21,10.898966
59,64,5,80,21,10.898966
55,64,5,100,20,15.141872
56,64,5,95,20,15.141872
39,128,5,80,10,15.141872
52,64,10,90,10,15.141872
53,64,10,85,10,15.141872
54,64,10,80,10,15.141872
35,128,5,100,9,23.627685


In [59]:
evaluation_results_updated.sort_values(by = ['Segment Size', 'Sequence Length'])

Unnamed: 0,Segment Size,Sequence Length,Behaviour Threshold,Number of Sequences,Information Lost (%)
55,64,5,100,20,15.141872
56,64,5,95,20,15.141872
57,64,5,90,21,10.898966
58,64,5,85,21,10.898966
59,64,5,80,21,10.898966
50,64,10,100,9,23.627685
51,64,10,95,9,23.627685
52,64,10,90,10,15.141872
53,64,10,85,10,15.141872
54,64,10,80,10,15.141872


In [57]:
type(evaluation_results_updated)

pandas.core.frame.DataFrame

Explanation of each column in the table:

    Segment Size: The number of rows in each segment.
    Sequence Length: The number of consecutive segments in each sequence.
    Behaviour Threshold: The minimum percentage of rows with the same behaviour in each segment.
    Number of Sequences: The total number of sequences extracted with these parameters.
    Information Lost (%): The percentage of rows that were not included in any extracted sequence, indicating the amount of data "lost" or unused.

From this table, you can analyze how changing the segment_size, sequence_length, and behaviour_threshold affects the number of sequences you can extract and the amount of information lost.

In [49]:
type(df)

pandas.core.frame.DataFrame