This notebook was copied from the 2023 working version and has not yet been fully updated for the 2022 data.

In [1]:
import os
from pathlib import Path

import pandas as pd

from src.data.process_utils import parse_file, compile_csv_logs, combine_data_files, remove_tags, create_tag_range

# Project root: the directory one level above the current working directory.
project_directory = Path.cwd().parents[0]

## Step 1: Review detection counts in raw data files
In this first step we extract the detections from each raw data file. The number of detection lines found in each file is written to a log file. A low or zero detection count indicates that the file may not be formatted correctly. Comparing counts between files also allows us to identify duplicate files.

In [2]:
# 2022 Inputs
# Names of the raw capture files to parse. Each name is joined onto
# input_directory when the files are parsed in the next cell.
# NOTE(review): the list contains both 'CoolTerm Capture 2022-04-18 13-03-01 INL.txt'
# and 'CoolTerm Capture 2022-04-18 13-03-01 INL.csv' -- confirm that both are
# intended and that their parsed outputs do not overwrite each other.
file_list = ['CoolTerm Capture 2022-03-16 15-10-27.txt',
             'CoolTerm Capture 2022-05-20 16-00-09_MST.txt',
             'CoolTerm Capture 2022-06-02 15-11-05_MST.txt',
             'CoolTerm Capture 2022-06-02 15-35-43_IL.txt',
             'CoolTerm Capture 2022-05-02 10-19-19_MST.txt',
             'CoolTerm Capture 2022-04-18 13-03-01 INL.txt',
             'CoolTerm Capture 2022-03-16 14-24-44.txt',
             'MST_20220407.txt',
             'CoolTerm Capture 2022-05-06 11-39-27_INL.txt',
             'CoolTerm Capture 2022-04-25 10-33-10_INL.txt',
             'CoolTerm Capture 2022-04-29 10-49-08_MST.txt',
             'CoolTerm Capture 2022-04-18 09-36-07.txt',
             'CoolTerm Capture 2022-05-20 15-13-30_INL.txt',
             'CoolTerm Capture 2022-04-18 09-44-26 Final Upload.txt',
             'CoolTerm Capture 2022-05-02 12-48-06_INL.txt',
             '2022-06-16 09-25-12_MS.txt',
             'CoolTerm Capture 2022-04-18 13-03-01 INL.csv',
             'CoolTerm Capture 2022-04-29 11-10-58_INL.txt',
             'INL_20220407.txt',
             'CoolTerm Capture 2022-04-25 11-05-52_MST.txt',
             'CoolTerm Capture 2022-04-18 09-41-13.txt',
             '2022-06-16 11-07-25_IL.txt',
             'CoolTerm Capture 2022-05-06 10-37-49_MST.txt']
# Raw files are read from here.
input_directory = project_directory / 'data/raw/2022'
# Parsed detection files and processing logs are written here.
interim_directory = project_directory / 'data/interim/2022'

In [3]:
# Parse each raw file, removing the lines that do not look like detection information.
# input_directory is already rooted at project_directory, so only the file
# name is appended here (the extra project_directory prefix was redundant).
for raw_name in file_list:
    parse_file(input_file=input_directory / raw_name, output_dir=interim_directory)

# Compile the log generated for each file into one log. Logs contain counts of detections.
compile_csv_logs(interim_directory, 'processing_logs_compiled.csv')

Processing of "INL 2023-04-04 11-23-27.txt" complete.
Processing of "INL 2023-04-11.txt" complete.
Processing of "INL 2023-04-18 13-49-26.txt" complete.
Processing of "INL 2023-04-27 12-09-44.txt" complete.
Processing of "INL 2023-05-08 11-40-17.txt" complete.
Processing of "INL 2023-05-09 12-05-47.txt" complete.
Processing of "INL 2023-05-17 14-56-03.txt" complete.
Processing of "INL 2023-06-02 14-43-21.txt" complete.
Processing of "INL 2023-06-13 12-24-05.txt" complete.
Processing of "INL re-try 2023-04-11.txt" complete.
Processing of "MST 05-08 retry.txt" complete.
Processing of "MST 2023-04-04 10-02-34.txt" complete.
Processing of "MST 2023-04-11.txt" complete.
Processing of "MST 2023-04-18 10-24-34.txt" complete.
Processing of "MST 2023-04-27.txt" complete.
Processing of "MST 2023-05-08 10-21-50.txt" complete.
Processing of "MST 2023-05-17 14-12-07.txt" complete.
Processing of "MST 2023-06-02 13-58-22.txt" complete.
Processing of "MST 2023-06-13 11-15-03.txt" complete.
Processing 

## Step 2: Combining the detection data and filtering for valid tag IDs
Once we have reviewed the logs and selected the files we want to combine, we combine all of the detection files and remove unrecognized tags.

In [5]:
# Combine the detection data into a dataframe
# NOTE(review): the file names, tag ranges and processed_directory below are
# still the 2023 values, while interim_directory (defined in Step 1) points at
# data/interim/2022. TODO: update these for the 2022 data before running.
files_to_combine = [
    'INL 2023-04-04 11-23-27_detection.csv',
    'INL 2023-04-11_detection.csv',
    'INL 2023-04-18 13-49-26_detection.csv',
    'INL 2023-04-27 12-09-44_detection.csv',
    'INL 2023-05-08 11-40-17_detection.csv',
    'INL 2023-05-09 12-05-47_detection.csv',
    'INL 2023-05-17 14-56-03_detection.csv',
    'INL 2023-06-02 14-43-21_detection.csv',
    'INL 2023-06-13 12-24-05_detection.csv',
    'MST 05-08 retry_detection.csv',
    'MST 2023-04-04 10-02-34_detection.csv',
    'MST 2023-04-11_detection.csv',
    'MST 2023-04-18 10-24-34_detection.csv',
    'MST 2023-04-27_detection.csv',
    'MST 2023-05-17 14-12-07_detection.csv',
    'MST 2023-06-02 13-58-22_detection.csv',
    'MST 2023-06-13 11-15-03_detection.csv',
    'PAL 2023-04-04 10-36-49_detection.csv',
    'PAL 2023-04-11_detection.csv',
    'PAL 2023-04-18 12-31-22_detection.csv',
    'PAL 2023-05-08 11-26-16_detection.csv',
    'PAL 2023-05-17 14-40-35_detection.csv',
    'PAL 2023-06-02 14-25-46_detection.csv',
    'PAL 2023-06-13 09-39-00_detection.csv',
    'PAL retry 4-27_detection.csv',]

processed_directory = project_directory / 'data/processed/2023'

# Concat files into a dataframe
df_combined = combine_data_files(interim_directory, files_to_combine)

# Remove site marker tags
l_marker_tags = ['0000_0000000000005126', '0000_0000000000012627', '0000_0000000000012617']
df_mt_removed = remove_tags(df_combined, l_marker_tags)

# Create list of valid fish tags for 2023
# NOTE(review): whether the upper bound of each range is inclusive depends on
# create_tag_range -- confirm against its implementation.
valid_tags_1 = create_tag_range('900_228000487', (900, 995), pad=3)
valid_tags_2 = create_tag_range('900_228000498', (0,211), pad=3)
valid_tags = valid_tags_1 + valid_tags_2

# Filter for valid 2023 fish tags
mask_valid = df_mt_removed['TAG'].isin(valid_tags)
df_valid_fish = df_mt_removed[mask_valid].reset_index(drop=True)

# Parse the timestamps, then sort the data by site and datetime.
# This happens BEFORE the duplicate check so that (a) duplicates are compared
# on parsed datetimes rather than raw strings and (b) the duplicate mask shares
# the final frame's index and can be reused to index df_valid_fish later.
df_valid_fish['ARR'] = pd.to_datetime(df_valid_fish['ARR'])
df_valid_fish = df_valid_fish.sort_values(['SCD', 'ARR']).reset_index(drop=True)

# Check for duplicates on datetime, tag_id, and site.
# keep=False flags every member of a duplicate group, not just the repeats.
m_dups = df_valid_fish.duplicated(subset=['ARR', 'TAG', 'SCD'], keep=False)
if (n_dups := m_dups.sum()) > 0:
    print(f"There are {n_dups} duplicate records.")
else:
    print("No duplicates detected.")

There are 145 duplicate records.


In [None]:
# Show every record that shares its (ARR, TAG, SCD) key with at least one other
# record. The mask is recomputed on the current frame so that its index always
# matches df_valid_fish, even after the frame has been sorted.
df_valid_fish[df_valid_fish.duplicated(subset=['ARR', 'TAG', 'SCD'], keep=False)]

In [12]:
# Export Data
filename = 'combined_valid-tag_detections_2023.csv'
# Create the output folder if needed: to_csv does not create missing parent directories.
processed_directory.mkdir(parents=True, exist_ok=True)
df_valid_fish.to_csv(processed_directory / filename, index=False)


In [17]:
# Number of detections per valid fish tag, most frequent first.
df_valid_fish['TAG'].value_counts()


TAG
900_228000487901    2309
900_228000487940    1925
900_228000487979     906
900_228000487912     844
900_228000487952     819
                    ... 
900_228000487919       9
900_228000487981       3
900_228000487924       2
900_228000487966       2
900_228000487927       1
Name: count, Length: 74, dtype: int64

In [22]:
# Detections per valid fish tag as a table, ordered by tag ID instead of by count.
(
    df_valid_fish['TAG']
    .value_counts()
    .reset_index()
    .sort_values(by='TAG')
)


Unnamed: 0,TAG,count
0,900_228000487901,2309
33,900_228000487903,87
51,900_228000487904,26
53,900_228000487905,24
47,900_228000487910,36
...,...,...
45,900_228000487986,39
34,900_228000487987,80
67,900_228000487988,10
37,900_228000487989,67


In [30]:
# Detections per tag across ALL combined records (before marker/invalid tags are removed).
# NOTE(review): the recorded output lists a literal 'TAG' value with count 1, which
# suggests a header row may have been read as data in one of the files -- confirm.
df_combined.loc[:, 'TAG'].value_counts()


TAG
900_228000487631         3503
900_228000487705         3488
0000_0000000000005126    3277
0000_0000000000012627    3259
0000_0000000000012617    3207
                         ... 
TAG                         1
2029_4503599493152767       1
0000_0495236233474610       1
0000_0495236233475104       1
0000_0495236233475100       1
Name: count, Length: 225, dtype: int64

In [33]:
# Detections per tag, restricted to tags whose ID starts with the 900_228000498 prefix.
(
    df_combined
    .loc[df_combined['TAG'].str.startswith("900_228000498"), 'TAG']
    .value_counts()
)


TAG
900_228000498023    1899
900_228000498006    1184
900_228000498141     803
900_228000498204     638
900_228000498206     629
                    ... 
900_228000498034       3
900_228000498120       2
900_228000498180       1
900_228000498038       1
900_228000498014       1
Name: count, Length: 108, dtype: int64