In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Function to read an annotation file and return a DataFrame
def read_annotation_file(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return df

# Function to process a directory of annotation files
def process_annotation_aggregated(directory_path):
    # Initialize an empty DataFrame to store aggregated data
    aggregated_df = pd.DataFrame(columns=['Label', 'SampleCount', 'TotalSampleTime'])

    # Loop through each file in the directory
    for file_name in os.listdir(directory_path):
        if file_name.endswith(".txt") and "BG" not in file_name:
            file_path = os.path.join(directory_path, file_name)
            annotation_df = read_annotation_file(file_path)

            # Aggregate data
            label_counts = annotation_df['Annotation'].value_counts()
            total_sample_time = annotation_df['End Time (s)'].max()

            # Update aggregated DataFrame
            for label, count in label_counts.items():
                if label not in aggregated_df['Label'].tolist():
                    aggregated_df = aggregated_df.append({'Label': label, 'SampleCount': 0, 'TotalSampleTime': 0}, ignore_index=True)
                aggregated_df.loc[aggregated_df['Label'] == label, 'SampleCount'] += count
                aggregated_df.loc[aggregated_df['Label'] == label, 'TotalSampleTime'] += total_sample_time

    return aggregated_df

# Function to process a directory of annotation files
def process_annotation_directory(directory_path):
    # Initialize empty DataFrames to store regular and background annotations
    # regular_annotations = pd.DataFrame(columns=['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)',
    #                                             'Low Freq (Hz)', 'High Freq (Hz)', 'Annotation', 'File'])
    # background_annotations = pd.DataFrame(columns=['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)',
    #                                                'Low Freq (Hz)', 'High Freq (Hz)', 'Annotation', 'File'])
    all_dataframe = pd.DataFrame(columns=['Selection', 'View', 'Channel', 'Begin Time (s)', 'End Time (s)',
                                                'Low Freq (Hz)', 'High Freq (Hz)', 'Annotation', 'File'])
    
    # Loop through each file in the directory
    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)

        if file_name.endswith(".txt"):
            annotation_df = read_annotation_file(file_path)

            # Add a new column to store the file name
            annotation_df['File'] = file_name

            all_dataframe = pd.concat([all_dataframe, annotation_df], ignore_index=True)

    aggregated_df = pd.DataFrame(columns=['Label', 'SampleCount', 'TotalSampleTime'])

    # Aggregate data
    label_counts = all_dataframe['Annotation'].value_counts()
    all_dataframe['Duration (s)'] = all_dataframe['End Time (s)']-all_dataframe['Begin Time (s)']
    
    # print(total_duration_per_label)
    # print(all_dataframe['Annotation'].value_counts())
    # Update aggregated DataFrame
    # for label, count in label_counts.items():
        # if label not in aggregated_df['Label'].tolist():
        #     aggregated_df = aggregated_df[{'Label': label, 'SampleCount': 0, 'TotalSampleTime': 0}, ignore_index=True)
        # aggregated_df.loc[aggregated_df['Label'] == label, 'SampleCount'] += count
        # aggregated_df.loc[aggregated_df['Label'] == label, 'TotalSampleTime'] += total_sample_time
            
    return all_dataframe


In [20]:

# Set the directory path where your annotation files are located
directory_path = 'C:/Users/amitg/Documents/Deep_Voice/ocean-whispers/Anotations Ocean Wispers'
directory_2_path = 'C:/Users/amitg/Documents/Deep_Voice/ocean-whispers/Background noise annotations'

# Process the annotation directory
data_batch1 = process_annotation_directory(directory_path)
data_batch2 = process_annotation_directory(directory_2_path)

all_data = pd.concat([data_batch1, data_batch2], ignore_index=True)

# Display the aggregated data
print(all_data)
total_duration_per_label = all_data.groupby('Annotation')['Duration (s)'].sum()
print(total_duration_per_label)
print(all_data['Annotation'].value_counts())


     Selection           View Channel  Begin Time (s)  End Time (s)  \
0            1  Spectrogram 1       2       34.228464     34.778080   
1            2  Spectrogram 1       2      228.859449    229.126830   
2            3  Spectrogram 1       2      315.197472    315.516843   
3            4  Spectrogram 1       2      315.969905    316.300418   
4            5  Spectrogram 1       2      316.820325    317.106274   
...        ...            ...     ...             ...           ...   
1593         3  Spectrogram 1       2       52.601485     57.866612   
1594         2  Spectrogram 1       2       36.423814     41.373775   
1595         1  Spectrogram 1       2        4.061849     11.383951   
1596         2  Spectrogram 1       2      102.032374    117.378853   
1597         1  Spectrogram 1       2        0.000000    101.343425   

      Low Freq (Hz)  High Freq (Hz) Annotation  \
0            5421.8          9408.3          d   
1            3830.7          6848.9          d 

  all_dataframe = pd.concat([all_dataframe, annotation_df], ignore_index=True)
  all_dataframe = pd.concat([all_dataframe, annotation_df], ignore_index=True)


In [51]:
# p_df = all_data[all_data['Annotation']=='p']
# for key,file_df in p_df.groupby(['File']):
    # print(file_df)
    # p_df['diff'] = p_df['Begin Time (s)'].shift(1) - p_df['End Time (s)']
def merge_close_p_calls(file_df):
    merged_rows = []

    file_df = file_df.sort_values(by='Begin Time (s)')  # Sort by Begin Time (s) to ensure chronological order

    idx = 0
    while idx < len(file_df):
        current_row = file_df.iloc[idx]
        print(idx)
        if current_row['Annotation'] == 'p' and current_row['Duration (s)'] < 1 and idx < len(file_df) - 1:
            next_rows = file_df.iloc[idx + 1:]
            merged_row = current_row.copy()
            merged = False
            last_merged_idx = idx
            print("short call:", current_row['Selection'])

            while len(next_rows) > 0:
                next_row = next_rows.iloc[0]
                time_difference = next_row['Begin Time (s)'] - merged_row['End Time (s)']

                if time_difference <= 0.5:
                    # Update the merged row's End Time (s)
                    merged_row['End Time (s)'] = next_row['End Time (s)']
                    next_rows = next_rows.iloc[1:]  # Remove the merged row from consideration
                    merged = True
                    print("close call:", next_row['Selection'])
                    last_merged_idx = file_df.index.get_loc(next_row.name)
                else:
                    # Stop merging if the time difference exceeds 0.5 seconds
                    break

            if not merged:
                # If no merge occurred, add the current row as is
                merged_rows.append(merged_row)

            idx = last_merged_idx + 1  # Continue from the next index after the last merged row
        else:
            # Keep non-'p' rows as is
            merged_row = current_row
            idx += 1

        merged_rows.append(merged_row)

    merged_df = pd.DataFrame(merged_rows)
    return merged_df

merged_dfs = []
for key, file_df in all_data.groupby(['File']):
    # Merge consecutive 'p' calls with durations shorter than 1 second and close Begin Times
    merged_df = merge_close_p_calls(file_df)
    merged_dfs.append(merged_df)

# Concatenate the merged dataframes back together
final_merged_dataframe = pd.concat(merged_dfs, ignore_index=True)

# Display the resulting merged dataframe
print(final_merged_dataframe)
# merged_file_df = merge_close_p_calls(file_df)
# print(merged_file_df)
# print(key)
#TODO: Find the problem in the function gepeto wrote, it should merge p calls that are close together (trains) and leave p's that are alone. >DONE
#Then I want to concat back the file_dfs to one df that has all the files with the p's merged.
#finally I want to create a dataset for training with split.

0
1
2
3
4
5
6
7
8
9
10
11
12
0
1
2
3
4
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
0
1
2
3
4
5
6
7
8
9
0
1
2
3
4
5
6
7
8
9
10
11
12
short call: 4
close call: 10
14
15
16
17
18
19
20
21
22
0
1
2
3
4
0
1
2
short call: 1
close call: 3
4
5
6
7
8
9
short call: 4
close call: 7
11
12
13
14
15
16
17
18
19
20
21
short call: 15
close call: 14
23
24
25
short call: 16
26
short call: 17
27
short call: 18
close call: 12
29
30
31
32
33
34
35
short call: 20
36
short call: 21
close call: 18
38
39
short call: 22
40
short call: 23
41
42
43
short call: 24
close call: 22
45
short call: 25
46
47
short call: 26
48
short call: 27
49
50
short call: 28
51
52
short call: 29
53
54
55
56
57
58
59
0
1
2
3
4
5
6
7
0
1
short call: 1
2
short call: 2
3
short call: 3
close call: 2
close call: 4
close call: 5
7
short call: 6
8
short call: 7
9
short call: 8
10
11
12
13
short call: 9
14
0
1
2
3
4
0
short c

In [66]:
from sklearn.model_selection import train_test_split

# Remove file extension and "_BG" suffix from the "File" column
final_merged_dataframe['File'] = final_merged_dataframe['File'].str.replace(r'\.txt_BG\.txt$', '', regex=True)
final_merged_dataframe['File'] = final_merged_dataframe['File'].str.replace(r'\.txt$', '', regex=True)
# Strip leading and trailing whitespaces from filenames
final_merged_dataframe['File'] = final_merged_dataframe['File'].str.strip()

# Select samples for the 3 most common labels: 'p', 'd', and 'BG'
selected_labels = ['p', 'd', 'BG']
selected_df = final_merged_dataframe[final_merged_dataframe['Annotation'].isin(selected_labels)]

# Ensure samples from the same filename are not in both training and validation/testing sets
unique_files = selected_df['File'].unique()
train_files, test_val_files = train_test_split(unique_files, test_size=0.4, random_state=42)
val_files, test_files = train_test_split(test_val_files, test_size=0.5, random_state=42)

# train_files, test_val_files = train_test_split(selected_df['File'].unique(), test_size=0.4, random_state=42)
# val_files, test_files = train_test_split(test_val_files, test_size=0.5, random_state=42)

# Add a column indicating whether each sample is for training, validation, or testing
selected_df['Set'] = 'train'
selected_df.loc[selected_df['File'].isin(val_files), 'Set'] = 'val'
selected_df.loc[selected_df['File'].isin(test_files), 'Set'] = 'test'

# Display the resulting dataframe
print(selected_df)
selected_df.to_csv('ocean_whispers_2020_train_val_test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['Set'] = 'train'


      Selection           View  Channel  Begin Time (s)  End Time (s)  \
0             1  Spectrogram 1        2       34.228464     34.778080   
1             2  Spectrogram 1        2      228.859449    229.126830   
2             3  Spectrogram 1        2      315.197472    315.516843   
3             4  Spectrogram 1        2      315.969905    316.300418   
4             5  Spectrogram 1        2      316.820325    317.106274   
...         ...            ...      ...             ...           ...   
1122          7  Spectrogram 1        2      120.449786    120.614213   
1123          1  Spectrogram 1        2        0.000000    101.343425   
1124          1  Spectrogram 1        2        0.000000    101.343425   
1125          2  Spectrogram 1        2      102.032374    117.378853   
1126          2  Spectrogram 1        2      102.032374    117.378853   

      Low Freq (Hz)  High Freq (Hz) Annotation  \
0            5421.8          9408.3          d   
1            3830.7    

PermissionError: [Errno 13] Permission denied: 'ocean_whispers_2020_train_val_test.csv'

In [67]:
# Get unique filenames for each set
train_filenames = selected_df[selected_df['Set'] == 'train']['File'].unique().tolist()
val_filenames = selected_df[selected_df['Set'] == 'val']['File'].unique().tolist()
test_filenames = selected_df[selected_df['Set'] == 'test']['File'].unique().tolist()

# Display the lists of unique filenames for each set
print("Train Filenames:", train_filenames)
print("Validation Filenames:", val_filenames)
print("Test Filenames:", test_filenames)


Train Filenames: ['LPS1382019_HF_20200411_090759_349.Table.1.selections', 'LPS1382019_HF_20200411_091000_000.Table.1.selections', 'LPS1382019_HF_20200411_092000_000.Table.1.selections', 'LPS1382019_HF_20200411_092759_349.Table.1.selections', 'LPS1382019_HF_20200411_094000_000.Table.1.selections', 'LPS1382019_HF_20200411_095000_000.Table.1.selections', 'LPS1382019_HF_20200411_100759_349.Table.1.selections', 'LPS1382019_HF_20200411_102000_000.Table.1.selections', 'LPS1382019_HF_20200411_103000_000.Table.1.selections', 'LPS1382019_HF_20200411_103759_349.Table.1.selections', 'LPS1382019_HF_20200411_104000_000.Table.1.selections', 'LPS1382019_HF_20200411_104759_349.Table.1.selections', 'LPS1382019_HF_20200430_131759_349.Table.1.selections']
Validation Filenames: ['LPS1382019_HF_20200411_085000_000.Table.1.selections', 'LPS1382019_HF_20200411_085759_349.Table.1.selections', 'LPS1382019_HF_20200411_090000_000.Table.1.selections', 'LPS1382019_HF_20200411_093000_000.Table.1.selections', 'LPS138