In [1]:
import pandas as pd
import re

In [4]:
# Patterns are compiled once at definition time rather than on every call.

# An empty-alt markdown image at the start of a line, or the
# "Файл не найден" ("file not found") marker anywhere in it.
_IMAGE_PATTERN = re.compile(r'^!\[\]\(.*\)|Файл не найден', re.IGNORECASE)

# A line containing any of these substrings is dropped entirely.
_PLAYER_CONTROLS = ('PausePlay', 'UnmuteMute', 'Exit fullscreen', 'Enter fullscreen')

_TIME_CODE_PATTERNS = (
    re.compile(r'^\d{1,2}:\d{2}(-\d{1,2}:\d{2})?$'),  # e.g., 00:00 or 00:00-00:45
    # NOTE(review): anchored on a leading '%', so a line such as
    # "0% buffered00:00" would NOT match -- confirm against the real data.
    re.compile(r'^% buffered\d{2}:\d{2}$'),           # e.g., % buffered00:00
)

# A leading '*' or '-' bullet marker followed by whitespace.
_BULLET_PATTERN = re.compile(r'^[*\-]\s+')

# Lines made up solely of digits, whitespace, '-' and ':'.
_NUMERIC_ONLY_PATTERN = re.compile(r'^[\d\s\-:]+$')


def cleanup(input_str):
    """
    Clean a block of text line by line, dropping player/image residue.

    The input is split on '\\n' and each line is stripped of surrounding
    whitespace. A line is discarded when it:

    * is empty after stripping;
    * starts with an empty-alt markdown image (``![](...)``) or contains
      "Файл не найден" (case-insensitive);
    * contains one of the player-control labels ('PausePlay', 'UnmuteMute',
      'Exit fullscreen', 'Enter fullscreen');
    * is a bare time code such as ``00:00``, ``00:00-00:45`` or
      ``% buffered00:00``;
    * consists only of digits, whitespace, '-' and ':' once a leading
      bullet marker has been removed.

    On the surviving lines a leading ``* `` or ``- `` bullet marker is removed.

    Args:
        input_str (str | None | float): The text to clean. ``None`` and float
            NaN (what pandas uses for missing cells in an object column) are
            treated as "no text".

    Returns:
        str: The kept lines joined with '\\n'; an empty string when nothing
        survives or when the input is missing.
    """
    # Missing values would otherwise crash on `.split` (AttributeError on
    # NoneType / float). `x != x` is true only for NaN.
    if input_str is None or (isinstance(input_str, float) and input_str != input_str):
        return ''

    cleaned_lines = []

    for raw_line in input_str.split('\n'):
        line = raw_line.strip()

        # Skip empty lines
        if not line:
            continue

        # Remove image links / "file not found" placeholders
        if _IMAGE_PATTERN.search(line):
            continue

        # Remove player controls (substring match, so the whole line goes)
        if any(control in line for control in _PLAYER_CONTROLS):
            continue

        # Remove time codes
        if any(pattern.match(line) for pattern in _TIME_CODE_PATTERNS):
            continue

        # Strip a leading bullet marker before the numeric-only check, so
        # that e.g. "- 12:30" is judged on "12:30".
        line = _BULLET_PATTERN.sub('', line)

        # Remove lines that are solely numbers or separator symbols
        if _NUMERIC_ONLY_PATTERN.match(line):
            continue

        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)

In [6]:
# Load the article summaries, add an empty 'validated' column, run `cleanup`
# over every 'refs' value, and write the result to a new CSV.
# NOTE(review): './data/articles_data_summ_OK.csv' was present here as a
# commented-out alternative input -- confirm which file is the intended one.
df = (
    pd.read_csv('./data/articles_data_summ.csv', encoding="utf-8")
    .assign(
        validated='',
        refs=lambda frame: frame['refs'].apply(cleanup),
    )
)

# index=False keeps the DataFrame index out of the output file.
df.to_csv('./data/articles_data_summ_cleaned.csv', index=False)

