In [35]:
import pandas as pd
import re
import json

from bs4 import BeautifulSoup

def extract_text_from_html(html_content, target_class="kLnDsmZC8c49r2Ntz8LHD"):
    """
    Extracts text content from paragraphs within an HTML string,
    targeting a specific class.

    Args:
        html_content (str): The HTML string to parse.
        target_class (str): The CSS class of the paragraphs to extract.

    Returns:
        str: A string containing the extracted text, joined by newlines.
    """
    if not isinstance(html_content, str): # Check if the input is a string.
        if pd.isna(html_content): # Check for NaN values
            return "" # Return empty string for NaN
        else:
            return str(html_content) # Return the string representation of the number
        
    soup = BeautifulSoup(html_content, 'html.parser')
    
    katex_spans = soup.find_all('span', class_='katex')
    for span in katex_spans:
        annotation = span.find('annotation', encoding='application/x-tex')
        if annotation:
            # Find the mrow tag inside math tag
            math_tag = annotation.find_parent('span', class_='katex-mathml').find('math')
            if math_tag:
                mrow_tag = math_tag.find('mrow')
                if mrow_tag:
                    mrow_tag.decompose()
            
        # Extract text from MathML
        mathml_span = span.find_parent('span', class_='katex-mathml')
        if mathml_span:
            math_tag = mathml_span.find('math')
            if math_tag:
                span.string = math_tag.get_text()
    forms = soup.find_all('form')
    for form in forms:
        dot_space_tag = soup.new_string('. ')
        form.insert_before(dot_space_tag)



    # Modify table content directly and add newlines before tables
    table_tags = soup.find_all('table')
    for table in table_tags:
        new_table_text = "\n"  # Add newline before each table
        for row in table.find_all('tr'):
            row_text = ""
            for cell in row.find_all('td'):
                row_text += cell.get_text(strip=True) + " "  # Add 1 space
            new_table_text += row_text.strip() + "\n"
        table.replace_with(BeautifulSoup(f"<br/><pre>{new_table_text.strip()}</pre>", 'html.parser')) #replace table with preformatted text
    # Replace SVG tags with placeholders
    svg_placeholders = {}
    svg_count = 0
    for svg_tag in soup.find_all('svg'):
        placeholder = f"__SVG_PLACEHOLDER_{svg_count}__"
        svg_placeholders[placeholder] = str(svg_tag)
        svg_tag.replace_with(placeholder)
        svg_count += 1
    
    extracted_text = soup.get_text() #extract from modified soup
    # extracted_text = latex_to_text(extracted_text) #convert latex to text
    extracted_text = format_frequency_table(extracted_text) #remove leading and trailing whitespaces
    extracted_text = remove_markup_text(extracted_text) #remove markup text
    extracted_text = convert_latex_table_to_text(extracted_text) #convert latex tabular to text
    for placeholder, svg_string in svg_placeholders.items():
        extracted_text = extracted_text.replace(placeholder, "")

    print(extracted_text)
    return extracted_text

def latex_to_text(latex_string):
    """Converts LaTeX fraction and parenthesis expressions to plain text."""

    # Replace \left( and \right), \Bigg( and \Bigg) with parentheses
    latex_string = latex_string.replace(r"\left(", "(").replace(r"\right)", ")")
    latex_string = latex_string.replace(r"\Bigg(", "(").replace(r"\Bigg)", ")")

    # Replace \dfrac{numerator}{denominator} and \frac{numerator}{denominator} with numerator/denominator
    latex_string = re.sub(r"\\d?frac\{([^}]*)\}\{([^}]*)\}", r"\1/\2", latex_string)

    # Remove tildes (~) for spaces
    latex_string = latex_string.replace("~", " ")
    latex_string = latex_string.replace(r"\degree", "°")
    latex_string = latex_string.replace(r"\ell", "ℓ")
    latex_string = latex_string.replace(r"\cdots", "…")
    latex_string = latex_string.replace(r"\times", "x")
    latex_string = latex_string.replace(r"\%", "%")
    latex_string = latex_string.replace(r"\lparen", "(")
    latex_string = latex_string.replace(r"\rparen", ")")

    return latex_string

def format_frequency_table(table_data):
    """Formats the given frequency table data into a readable string."""
    lines = table_data.split('\n')
    formatted_lines = []

    for line in lines:
        line = line.strip()
        if line:  # Skip empty lines
            parts = line.replace(r'\quad', ' ').split()  # Split by spaces after replacing \quad
            parts = [part.strip().replace(r'\sim', '~') for part in parts] #remove extra space and replace \sim
            formatted_line = "  ".join(parts) #add double space between each part.
            formatted_lines.append(formatted_line)

    return "\n".join(formatted_lines)

def remove_markup_text(text):
    """Removes \textbf{...} from a string, keeping the text inside."""
    """Removes \text{...} from a LaTeX string, keeping the letter inside."""
    textSub = re.sub(r"\\text\{([^}]*)\}", r"\1", text)
    textSub = re.sub(r"\\textbf\{([^}]*)\}", r"\1", textSub)
    textSub = textSub.replace(r"\raisebox{5pt}{$\underline{  }$}", "-")
    textSub = re.sub(r"\\boxed\{([^}]*)\}", r"\1", textSub)

    return textSub

def convert_latex_table_to_text(html_content):
    """Converts the given LaTeX table HTML to a text table."""

    # Remove LaTeX commands and table structure
    text = re.sub(r"\\def\\arraystretch\{.*?\}", "", html_content) #remove def arraystretch
    text = re.sub(r"\\begin\{array\}\{\|c\|c\|\}\s*\\hline", "", text)
    text = re.sub(r"\\end\{array\}", "", text)
    text = re.sub(r"\\hline", "", text)

    # Replace & with | and \\ with \n
    text = text.replace("&", "|")
    text = text.replace(r"\\", "\n")

    # Remove extra spaces and newlines
    text = "\n".join(line.strip() for line in text.splitlines() if line.strip())

    return text




def convert_csv_html_to_text(file_path, output_path):
    """Reads a CSV file, converts HTML cells to text, and saves to a new CSV."""
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return
    except Exception as e:
        print(f"An unexpected error occurred reading the csv file: {e}")
        return

    for col in df.columns:
        if df[col].dtype == object:  # Check if the column contains strings (potential HTML)
            df[col] = df[col].apply(extract_text_from_html)

    # Handle empty item_id
    for index, row in df.iterrows():
        if pd.isna(row['item_id']) or row['item_id'] == "":
            if index > 0:
                df.at[index, 'question_type' ] = df.at[index - 1, 'question_type']
                df.at[index, 'item_id'] = df.at[index - 1, 'item_id']

    # Handle empty item_description, question_content, explanation by looking up previous matching item_id
    for index, row in df.iterrows():
        if pd.isna(row['item_description']) or pd.isna(row['question_content']) or pd.isna(row['explanation']) or \
            row['item_description'] == "" or row['question_content'] == "" or row['explanation'] == "":
            item_id = row['item_id']
            if pd.notna(item_id):
                previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
                if not previous_rows.empty:
                    previous_row = previous_rows.iloc[-1]  # Get the last matching previous row
                    if pd.isna(row['item_description']) or row['item_description'] == "":
                        df.at[index, 'item_description'] = previous_row['item_description']
                    if pd.isna(row['question_content']) or row['question_content'] == "":
                        df.at[index, 'question_content'] = previous_row['question_content']
                    if pd.isna(row['explanation']) or row['explanation'] == "":
                        df.at[index, 'explanation'] = previous_row['explanation']

            
    try:
         df.to_csv(output_path, index=False, encoding='utf-8-sig') #encoding for excel
         print(f"Successfully converted and saved to {output_path}")
    except Exception as e:
        print(f"Error saving to CSV file: {e}")

def csv_to_json_array_to_file(file_path, output_file_path):
    """Reads a CSV file, converts it to a JSON array, removes records with correct_option = FALSE, and exports to a JSON file."""
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        return {"error": f"File not found at {file_path}"}
    except Exception as e:
        return {"error": f"An unexpected error occurred reading the csv file: {e}"}

    # Remove records where correct_option is FALSE (case-insensitive)
    df = df[df['correct_option'].astype(str).str.lower() != 'false']
    # Fill empty cells with empty strings
    df = df.fillna("")
    # Convert DataFrame to a list of dictionaries (JSON-like)
    json_array = df.to_dict(orient='records')

    try:
        # Convert the list of dictionaries to a JSON string
        json_string = json.dumps(json_array, ensure_ascii=False, indent=4)

        # Write the JSON string to a file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(json_string)

        return {"success": f"JSON array exported to {output_file_path}"}

    except Exception as e:
        return {"error": f"An error occurred converting to JSON or writing to file: {e}"}


In [36]:
# Example usage:
input_file = "file.csv"  # Replace with your input file path
output_file = "output.csv" # Replace with your output file path
output_file_json = "output.json" # Replace with your output file path
convert_csv_html_to_text(input_file, output_file)
csv_to_json_array_to_file(output_file, output_file_json)


CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
CLOZE_MATH
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ
MCQ

  soup = BeautifulSoup(html_content, 'html.parser')
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  previous_rows = df.loc[:index - 1][df['item_id'] == item_id]
  p

{'success': 'JSON array exported to output.json'}

In [37]:
import json
import random

def split_json_array_to_files(input_file_path, training_file_path="training_set.json", test_file_path="test_set.json", split_ratio=0.7):
    """
    Reads a JSON array from a file, splits it into training and test sets, and saves them to separate files.

    Args:
        input_file_path (str): Path to the input JSON file.
        training_file_path (str): Path to save the training set JSON.
        test_file_path (str): Path to save the test set JSON.
        split_ratio (float): Ratio of training set size to total size (e.g., 0.7 for 70%).
    """
    try:
        with open(input_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"error": f"File not found at {input_file_path}"}
    except json.JSONDecodeError:
        return {"error": f"Invalid JSON format in {input_file_path}"}
    except Exception as e:
        return {"error": f"An unexpected error occurred reading the JSON file: {e}"}

    # random.shuffle(data)  # Shuffle the data to ensure random split

    split_index = int(len(data) * split_ratio)
    training_set = data[:split_index]
    test_set = data[split_index:]

    try:
        with open(training_file_path, 'w', encoding='utf-8') as f:
            json.dump(training_set, f, ensure_ascii=False, indent=4)

        with open(test_file_path, 'w', encoding='utf-8') as f:
            json.dump(test_set, f, ensure_ascii=False, indent=4)

        return {"success": f"JSON array split and saved to {training_file_path} and {test_file_path}"}

    except Exception as e:
        return {"error": f"An error occurred writing to JSON files: {e}"}


In [38]:
input_file_path = "output.json"
split_json_array_to_files(input_file_path)


{'success': 'JSON array split and saved to training_set.json and test_set.json'}