In [6]:
from bs4 import BeautifulSoup
import re

def read_html_file(file_path):
    """
    Load an HTML document from disk and return it as one string.

    The file is opened in text mode and decoded as UTF-8.

    Parameters:
    - file_path (str): Location of the HTML file to load.

    Returns:
    - str: Everything the file contains.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def extract_text_from_class(html_content, class_name):
    """
    Extracts all text items from elements with the specified class name.

    ``class_name`` may contain several space-separated classes (for example
    ``'sc-1dsz9yo-7 eNkWzS'``). In that case an element matches when it
    carries every listed class, in any order and regardless of any extra
    classes it also has. Passing such a value straight to
    ``find_all(class_=...)`` would only match elements whose ``class``
    attribute is exactly that string, silently skipping the others.

    Parameters:
    - html_content (str): The HTML content to parse.
    - class_name (str): One class name, or several separated by whitespace.

    Returns:
    - list: One string per matching element, with every run of whitespace
      collapsed to a single space and the ends stripped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    wanted = class_name.split() if isinstance(class_name, str) else []
    if len(wanted) > 1:
        required = set(wanted)

        def has_all_classes(tag):
            # The class attribute is normally a list of names; tolerate a
            # plain string or a missing attribute as well.
            classes = tag.get('class') or []
            if isinstance(classes, str):
                classes = classes.split()
            return required.issubset(classes)

        elements = soup.find_all(has_all_classes)
    else:
        # Single class (or a non-string filter): keep the direct lookup.
        elements = soup.find_all(class_=class_name)

    return [re.sub(r'\s+', ' ', element.get_text()).strip() for element in elements]

def save_text_to_file(text_items, output_file_path):
    """
    Write the given strings to a UTF-8 text file, one item per line.

    Items are joined with a single newline character between them; nothing
    is appended after the last item. The file is opened in write mode, so
    existing content at that path is replaced.

    Parameters:
    - text_items (list): Strings to write out.
    - output_file_path (str): Destination path of the text file.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as out:
        payload = '\n'.join(text_items)
        out.write(payload)

# Example usage: read a saved page, collect the text of every element that
# matches the class below, and write the results to a text file.
class_name = 'sc-1dsz9yo-7 eNkWzS'  # class attribute value to search for
file_path = './test.html'  # point this at your own HTML file
output_file_path = 'extracted_text.txt'  # point this at the desired output file

# Read -> extract -> save; each intermediate result stays in a module-level name.
html_content = read_html_file(file_path)
text_items = extract_text_from_class(html_content, class_name)
save_text_to_file(text_items, output_file_path)

print(f"Extracted text items have been saved to {output_file_path}")


Extracted text items have been saved to extracted_text.txt
