In [35]:
from bs4 import BeautifulSoup

def extract_text_from_html(html_content, target_class="kLnDsmZC8c49r2Ntz8LHD"):
    """
    Extracts text content from paragraphs within an HTML string,
    targeting a specific class.

    Args:
        html_content (str): The HTML string to parse.
        target_class (str): The CSS class of the paragraphs to extract.

    Returns:
        str: A string containing the extracted text, joined by newlines.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    paragraphs = soup.find_all('p', class_=target_class) 

    katex_spans = soup.find_all('span', class_='katex')
    for span in katex_spans:
        annotation = span.find('annotation', encoding='application/x-tex')
        if annotation:
            annotation.decompose()

    # Modify table content directly and add newlines before tables
    table_tags = soup.find_all('table')
    for table in table_tags:
        new_table_text = "\n"  # Add newline before each table
        for row in table.find_all('tr'):
            row_text = ""
            for cell in row.find_all('td'):
                row_text += cell.get_text(strip=True) + " "  # Add 1 space
            new_table_text += row_text.strip() + "\n"
        table.replace_with(BeautifulSoup(f"<br/><pre>{new_table_text.strip()}</pre>", 'html.parser')) #replace table with preformatted text

    extracted_text = soup.get_text() #extract from modified soup

    return extracted_text

html_content = r"""<div class="_2Tx0yNozDQkCgC1I9VS6VA sub-part"><b>(a)</b><div class="_33s8iDB86ShboS4mZ56Q4l"><div class="solution"><p class="kLnDsmZC8c49r2Ntz8LHD"><span>表の</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext><mo>∼</mo></mrow><annotation encoding="application/x-tex">\textbf{ア}\sim</annotation></semantics></math></span></span></span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}</annotation></semantics></math></span></span></span><span>にあてはまる数を求めなさい。ただし</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>小数第</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>3</mn></mrow><annotation encoding="application/x-tex">3</annotation></semantics></math></span></span></span><span>位を四捨五入して答えなさい。</span></p><form autocomplete="off" class="_31nUANnhZGf27LvjPN9T7j"><input class="answer_type_string" ismultiple="false" isobjective="false" type="hidden" value="[[string(latex(\textbf{ア}~=~)),type(manabie_algebra_form)],[string(latex(\textbf{イ}~=~)),type(manabie_algebra_form)],[string(latex(\textbf{ウ}~=~)),type(manabie_algebra_form)]]"/><div class="_6kWIKHssBU0OADXL8Y_nV"><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="0"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_6c3feba1_9cf1_453f_b537_ed093a168b78"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{ア}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="1"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_37005a71_22a0_4005_8cc3_78cfac9a3139"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">イ</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{イ}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="2"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_058c3526_ea1a_469e_813f_53480a6160c1"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div></div></form></div><div></div></div></div>"""
html_content = r"""<div class="question-content__1pw2-"><div class="markup"><div><div><div><div class="_33s8iDB86ShboS4mZ56Q4l"><div class="solution"><p class="kLnDsmZC8c49r2Ntz8LHD"><span>ペットボトルのキャップをくり返し投げ</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>表向きになった回数を</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>100</mn></mrow><annotation encoding="application/x-tex">100</annotation></semantics></math></span></span></span><span>回ごとに記録したところ</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>次の表のようになった。</span></p><div class="_1nTYGd66K3eOMMmXToCD4k"><table class="_4QCZHnOzWHugk6f9hv_NH" style="margin: 2em 0;;"><tbody><tr></tr><tr><td><span>投げた回数(回)</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>100</mtext></mrow><annotation encoding="application/x-tex">\text{100}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>200</mtext></mrow><annotation encoding="application/x-tex">\text{200}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>300</mtext></mrow><annotation encoding="application/x-tex">\text{300}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>400</mtext></mrow><annotation encoding="application/x-tex">\text{400}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>500</mtext></mrow><annotation encoding="application/x-tex">\text{500}</annotation></semantics></math></span></span></span><span></span></td></tr><tr><td><span>表向きになった回数(回)</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>35</mn></mrow><annotation encoding="application/x-tex">35</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>72</mn></mrow><annotation encoding="application/x-tex">72</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>102</mn></mrow><annotation encoding="application/x-tex">102</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>148</mn></mrow><annotation encoding="application/x-tex">148</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>185</mn></mrow><annotation encoding="application/x-tex">185</annotation></semantics></math></span></span></span><span></span></td></tr><tr><td><span>表向きになる相刘度数</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>0.35</mn></mrow><annotation encoding="application/x-tex">0.35</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>0.36</mn></mrow><annotation encoding="application/x-tex">0.36</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext></mrow><annotation encoding="application/x-tex">\textbf{ア}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">イ</mtext></mrow><annotation encoding="application/x-tex">\textbf{イ}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}</annotation></semantics></math></span></span></span><span></span></td></tr></tbody></table></div></div><div></div></div></div></div></div><div><div><div><div></div></div></div></div><div></div></div></div>"""
extracted_text = extract_text_from_html(html_content)

print(extracted_text)



ペットボトルのキャップをくり返し投げ,表向きになった回数を100回ごとに記録したところ,次の表のようになった。投げた回数(回) 100 200 300 400 500
表向きになった回数(回) 35 72 102 148 185
表向きになる相刘度数 0.35 0.36 ア イ ウ<br/>


In [12]:
from IPython.core.display import display, HTML

html_content = r"""<div class="_2Tx0yNozDQkCgC1I9VS6VA sub-part"><b>(a)</b><div class="_33s8iDB86ShboS4mZ56Q4l"><div class="solution"><p class="kLnDsmZC8c49r2Ntz8LHD"><span>表の</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext><mo>∼</mo></mrow><annotation encoding="application/x-tex">\textbf{ア}\sim</annotation></semantics></math></span></span></span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}</annotation></semantics></math></span></span></span><span>にあてはまる数を求めなさい。ただし</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>小数第</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>3</mn></mrow><annotation encoding="application/x-tex">3</annotation></semantics></math></span></span></span><span>位を四捨五入して答えなさい。</span></p><form autocomplete="off" class="_31nUANnhZGf27LvjPN9T7j"><input class="answer_type_string" ismultiple="false" isobjective="false" type="hidden" value="[[string(latex(\textbf{ア}~=~)),type(manabie_algebra_form)],[string(latex(\textbf{イ}~=~)),type(manabie_algebra_form)],[string(latex(\textbf{ウ}~=~)),type(manabie_algebra_form)]]"/><div class="_6kWIKHssBU0OADXL8Y_nV"><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="0"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_6c3feba1_9cf1_453f_b537_ed093a168b78"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{ア}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="1"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_37005a71_22a0_4005_8cc3_78cfac9a3139"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">イ</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{イ}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div><div class="_2gJMOj2kh45CdZrLGhEwWA h13XtabebU1GINCx5Wsog" data-index="2"><div class="_6dL7G1SbQSuxCEPsXHFlb"><div><div></div><div class="_VHcp3J5b_gI1x4Pr2aaN" id="container_input_algebra_field_058c3526_ea1a_469e_813f_53480a6160c1"><div class="_2287AZ8NwJL7m4oUOYoXy3"><label class="_3bk4iwx9g6uo_mo-TjSK5n" for="[objectObject]_[objectObject]_base"><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext><mtext></mtext><mo>=</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}~=~</annotation></semantics></math></span></span></span><span></span></label>{{response}}</div></div></div></div></div></div></form></div><div></div></div></div>"""
html_content = r"""<div class="question-content__1pw2-"><div class="markup"><div><div><div><div class="_33s8iDB86ShboS4mZ56Q4l"><div class="solution"><p class="kLnDsmZC8c49r2Ntz8LHD"><span>ペットボトルのキャップをくり返し投げ</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>表向きになった回数を</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>100</mn></mrow><annotation encoding="application/x-tex">100</annotation></semantics></math></span></span></span><span>回ごとに記録したところ</span><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo separator="true">,</mo><mtext></mtext></mrow><annotation encoding="application/x-tex">,~~</annotation></semantics></math></span></span></span><span>次の表のようになった。</span></p><div class="_1nTYGd66K3eOMMmXToCD4k"><table class="_4QCZHnOzWHugk6f9hv_NH" style="margin: 2em 0;;"><tbody><tr></tr><tr><td><span>投げた回数(回)</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>100</mtext></mrow><annotation encoding="application/x-tex">\text{100}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>200</mtext></mrow><annotation encoding="application/x-tex">\text{200}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>300</mtext></mrow><annotation encoding="application/x-tex">\text{300}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>400</mtext></mrow><annotation encoding="application/x-tex">\text{400}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext>500</mtext></mrow><annotation encoding="application/x-tex">\text{500}</annotation></semantics></math></span></span></span><span></span></td></tr><tr><td><span>表向きになった回数(回)</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>35</mn></mrow><annotation encoding="application/x-tex">35</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>72</mn></mrow><annotation encoding="application/x-tex">72</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>102</mn></mrow><annotation encoding="application/x-tex">102</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>148</mn></mrow><annotation encoding="application/x-tex">148</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>185</mn></mrow><annotation encoding="application/x-tex">185</annotation></semantics></math></span></span></span><span></span></td></tr><tr><td><span>表向きになる相刘度数</span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>0.35</mn></mrow><annotation encoding="application/x-tex">0.35</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>0.36</mn></mrow><annotation encoding="application/x-tex">0.36</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ア</mtext></mrow><annotation encoding="application/x-tex">\textbf{ア}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">イ</mtext></mrow><annotation encoding="application/x-tex">\textbf{イ}</annotation></semantics></math></span></span></span><span></span></td><td><span><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mtext mathvariant="bold">ウ</mtext></mrow><annotation encoding="application/x-tex">\textbf{ウ}</annotation></semantics></math></span></span></span><span></span></td></tr></tbody></table></div></div><div></div></div></div></div></div><div><div><div><div></div></div></div></div><div></div></div></div>"""

display(HTML(html_content))

  from IPython.core.display import display, HTML


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
投げた回数(回),100\text{100},200\text{200},300\text{300},400\text{400},500\text{500}
表向きになった回数(回),3535,7272,102102,148148,185185
表向きになる相刘度数,0.350.35,0.360.36,ア\textbf{ア},イ\textbf{イ},ウ\textbf{ウ}
