In [6]:
import os
import re

def check_transcription_looping(file_path, check_words=20, min_repeat_length=10, repeat_threshold=0.7):
    """Heuristically detect whether a transcription file ends in a repetition loop.

    The last ``check_words`` words of the file are compared with everything
    before them. A file is flagged when some run of ``min_repeat_length``
    consecutive trailing words also occurs, as whole words, in the earlier
    text, and at least ``repeat_threshold`` of the trailing words from that
    point on also appear among the equally many words that immediately
    precede the trailing window.

    Args:
        file_path: Path of the UTF-8 text file to inspect.
        check_words: Number of words at the end of the file to examine.
        min_repeat_length: Length, in words, of the phrase that must recur.
        repeat_threshold: Minimum fraction (0..1) of trailing words that must
            be found in the compared slice of the earlier text.

    Returns:
        True if the ending looks like a repetition of earlier text, else False.
        Files shorter than ``2 * check_words`` words, and non-positive
        ``check_words`` / ``min_repeat_length`` values, always yield False.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # split() with no separator already ignores leading/trailing whitespace.
        words = file.read().split()

    # Non-positive window sizes cannot describe a repeated phrase (and a zero
    # phrase length would otherwise lead to a division by zero below).
    if check_words <= 0 or min_repeat_length <= 0:
        return False
    if len(words) < check_words * 2:
        return False  # File is too short for meaningful check

    end_words = words[-check_words:]
    preceding_words = words[:-check_words]
    # Pad with spaces so a phrase can only match on whole-word boundaries;
    # a bare substring test would let "he cat" match inside "the cats".
    padded_preceding = ' ' + ' '.join(preceding_words) + ' '

    for i in range(len(end_words) - min_repeat_length + 1):
        phrase = ' '.join(end_words[i:i + min_repeat_length])
        if f' {phrase} ' not in padded_preceding:
            continue

        # Found a repeating phrase, now check for extended repetition:
        # compare the rest of the ending with the same number of words taken
        # from the end of the preceding text (membership only, order ignored).
        extended_words = end_words[i:]
        recent_words = set(preceding_words[-len(extended_words):])
        matches = sum(1 for word in extended_words if word in recent_words)
        if matches / len(extended_words) >= repeat_threshold:
            return True

    return False

def process_folder(folder_path):
    """Run the looping check on every ``.txt`` entry of a folder.

    Args:
        folder_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A ``(looping_files, total_files)`` tuple: the names of the ``.txt``
        entries flagged by ``check_transcription_looping`` and the number of
        ``.txt`` entries seen (entries whose check raised are still counted).
    """
    flagged = []
    txt_count = 0

    txt_names = (name for name in os.listdir(folder_path) if name.endswith('.txt'))
    for filename in txt_names:
        txt_count += 1
        try:
            is_looping = check_transcription_looping(os.path.join(folder_path, filename))
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {filename}: {str(e)}")
            continue
        if is_looping:
            flagged.append(filename)

    return flagged, txt_count

# Example usage: scan one transcription folder and report the flagged files
folder_path = 'transcriptions_BGpixtral/txt/'
looping_files, total_files = process_folder(folder_path)

# Summary counts first, then the per-file listing (if any)
print(f"Processed {total_files} files in total.")
print(f"Number of files with potential transcription looping: {len(looping_files)}")

if not looping_files:
    print("No transcription looping detected in any files.")
else:
    print("\nFiles with potential transcription looping:")
    for file in looping_files:
        print(f"- {file}")

Processed 500 files in total.
Number of files with potential transcription looping: 16

Files with potential transcription looping:
- 32044150448397_002.txt
- 32044150448439_024.txt
- 32044150448439_033.txt
- 32044150448496_042.txt
- 32044150448512_020.txt
- 32044150448538_032.txt
- 32044150448538_036.txt
- 32044150448546_040.txt
- 32044150448611_024.txt
- 32044150448769_003.txt
- 32044150448793_026.txt
- 32044150448827_048.txt
- 32044150448843_019.txt
- 32044150448850_034.txt
- 32044150448983_017.txt
- 32044150449114_009.txt


In [42]:
import os
import re

def check_transcription_looping(file_path, check_words=20, min_repeat_length=10, repeat_threshold=0.7):
    """Heuristically detect whether a transcription file ends in a repetition loop.

    The last ``check_words`` words of the file are compared with everything
    before them. A file is flagged when some run of ``min_repeat_length``
    consecutive trailing words also occurs, as whole words, in the earlier
    text, and at least ``repeat_threshold`` of the trailing words from that
    point on also appear among the equally many words that immediately
    precede the trailing window.

    Args:
        file_path: Path of the UTF-8 text file to inspect.
        check_words: Number of words at the end of the file to examine.
        min_repeat_length: Length, in words, of the phrase that must recur.
        repeat_threshold: Minimum fraction (0..1) of trailing words that must
            be found in the compared slice of the earlier text.

    Returns:
        True if the ending looks like a repetition of earlier text, else False.
        Files shorter than ``2 * check_words`` words, and non-positive
        ``check_words`` / ``min_repeat_length`` values, always yield False.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # split() with no separator already ignores leading/trailing whitespace.
        words = file.read().split()

    # Non-positive window sizes cannot describe a repeated phrase (and a zero
    # phrase length would otherwise lead to a division by zero below).
    if check_words <= 0 or min_repeat_length <= 0:
        return False
    if len(words) < check_words * 2:
        return False  # File is too short for meaningful check

    end_words = words[-check_words:]
    preceding_words = words[:-check_words]
    # Pad with spaces so a phrase can only match on whole-word boundaries;
    # a bare substring test would let "he cat" match inside "the cats".
    padded_preceding = ' ' + ' '.join(preceding_words) + ' '

    for i in range(len(end_words) - min_repeat_length + 1):
        phrase = ' '.join(end_words[i:i + min_repeat_length])
        if f' {phrase} ' not in padded_preceding:
            continue

        # Found a repeating phrase, now check for extended repetition:
        # compare the rest of the ending with the same number of words taken
        # from the end of the preceding text (membership only, order ignored).
        extended_words = end_words[i:]
        recent_words = set(preceding_words[-len(extended_words):])
        matches = sum(1 for word in extended_words if word in recent_words)
        if matches / len(extended_words) >= repeat_threshold:
            return True

    return False

def process_folder(folder_path):
    """Run the looping check on every ``.txt`` entry of a folder.

    Args:
        folder_path: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A ``(looping_files, total_files)`` tuple: the names of the ``.txt``
        entries flagged by ``check_transcription_looping`` and the number of
        ``.txt`` entries seen (entries whose check raised are still counted).
    """
    flagged = []
    txt_count = 0

    txt_names = (name for name in os.listdir(folder_path) if name.endswith('.txt'))
    for filename in txt_names:
        txt_count += 1
        try:
            is_looping = check_transcription_looping(os.path.join(folder_path, filename))
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing {filename}: {str(e)}")
            continue
        if is_looping:
            flagged.append(filename)

    return flagged, txt_count

# Example usage: scan one transcription folder and report the flagged files
folder_path = 'transcriptionsBG_internVL/txt/'
looping_files, total_files = process_folder(folder_path)

# Summary counts first, then the per-file listing (if any)
print(f"Processed {total_files} files in total.")
print(f"Number of files with potential transcription looping: {len(looping_files)}")

if not looping_files:
    print("No transcription looping detected in any files.")
else:
    print("\nFiles with potential transcription looping:")
    for file in looping_files:
        print(f"- {file}")

Processed 500 files in total.
Number of files with potential transcription looping: 0
No transcription looping detected in any files.


In [8]:
import os

folder_path = 'transcriptions_BGpixtral/txt/'
counter = 0

# Walk every entry of the folder, skipping anything that is not a regular file
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Load the whole file as UTF-8 text
    with open(file_path, 'r', encoding="utf-8") as f:
        content = f.read()

    # Only files containing the exact phrase are counted and reported
    if "I'm sorry, I can't assist with that." not in content:
        continue

    counter += 1
    print(f'File containing the phrase: {filename}')

print(f'\nTotal files with the phrase: {counter}')




Total files with the phrase: 0


In [33]:
import os

folder_path = 'transcriptionsBG_gptmini/txt/'
counter = 0

# Walk every entry of the folder, skipping anything that is not a regular file
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Load the whole file as UTF-8 text
    with open(file_path, 'r', encoding="utf-8") as f:
        content = f.read()

    # Only files containing the exact phrase are counted and reported
    if "I'm sorry, but I can't assist with that." not in content:
        continue

    counter += 1
    print(f'File containing the phrase: {filename}')

print(f'\nTotal files with the phrase: {counter}')

File containing the phrase: 32044150448389_052.txt

Total files with the phrase: 1
