In [72]:
import re
from pydantic import BaseModel, validator, ValidationError, Field, root_validator, constr
from typing import List, Optional
import pandas as pd
from datetime import datetime

In [None]:
'''
Defines a class ContentClass for representing content data. 
It includes fields like level, title, topic, and learning_outcomes, 
with validation rules for ensuring that certain fields do not contain HTML or quote characters 
and that the level field matches a specific pattern. 
The @validator decorator is used for the topic field, while the @field_validator decorator is used for the title, 
learning_outcomes, and level fields. 
These validators help ensure that the content metadata is valid and can be used reliably in your application.
'''

In [119]:
from pydantic import BaseModel, validator
from typing import Optional
import re

class ContentClass(BaseModel):
    level: str
    title: str
    topic: str
    learning_outcomes: Optional[str] = None

    @validator('level', 'title', 'topic', 'learning_outcomes', allow_reuse=True)
    @classmethod
    def validate_no_html_or_quotes(cls, v):
        if v is not None and re.search(r"[\"'<>&]", v):
            raise ValueError("Field contains invalid characters like quotes or HTML tags")
        return v

    @validator('level')
    @classmethod
    def level_must_match_pattern(cls, v):
        if not re.match(r"Level (I|II|III)$", v):
            raise ValueError("level is not valid. Must be Level I, Level II, or Level III")
        return v


/var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/2120705955.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('level', 'title', 'topic', 'learning_outcomes', allow_reuse=True)
/var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/2120705955.py:18: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('level')


In [None]:
'''
defines functions for parsing XML files, cleaning text, processing files, 
and writing the processed data to a CSV file. It uses xml.etree.ElementTree to parse XML files, 
extracts paragraph elements, and creates a Pandas DataFrame with the extracted data.
The clean_text function removes unwanted characters from text. 
The process_files function iterates over a list of file paths, titles, 
and levels, calls parse_xml_files for each file, cleans the text, and concatenates 
the results into a single DataFrame.
'''

In [97]:
import xml.etree.ElementTree as ET
import pandas as pd
import re
import csv

def parse_xml_files(file_path, title, level):
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    # Namespace extracted dynamically if needed
    namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}

    data = []
    # Iterate through paragraph elements
    for elem in root.findall('.//tei:p', namespaces=namespace):
        # Extract text, including nested elements
        paragraph = ''.join(elem.itertext())
        data.append({
            'level': level,
            'title': title,
            'topic': title,  # Assuming the title serves as the topic
            'learning_outcomes': paragraph
        })

    return pd.DataFrame(data)

def clean_text(text):
    # Clean the text by removing unwanted characters
    return re.sub(r'[\'"‘’”“<>]', '', str(text))

def process_files(file_paths, titles, levels):
    all_data = []
    for file_path, title, level in zip(file_paths, titles, levels):
        df = parse_xml_files(file_path, title, level)
        all_data.append(df)

    # Concatenate all dataframes
    final_df = pd.concat(all_data, ignore_index=True)
    # Apply text cleaning
    final_df['learning_outcomes'] = final_df['learning_outcomes'].apply(clean_text)
    return final_df

def write_to_csv(df, file_path):
    # Write the DataFrame to a CSV file
    df.to_csv(file_path, index=False, quoting=csv.QUOTE_ALL)


file_paths = [
    "/Users/Aneesh/Downloads/2024-l1-topics-combined-2.pdf.tei (1).xml",
    "/Users/Aneesh/Downloads/2024-l2-topics-combined-2.pdf.tei (1).xml",
    "/Users/Aneesh/Downloads/2024-l3-topics-combined-2.pdf.tei (1).xml"
]
titles = ["Derivatives", "Quantitative Methods", "Economics"]
levels = ["Level I", "Level II", "Level III"]

# Process files
final_df = process_files(file_paths, titles, levels)

# Write to CSV
csv_file_path = "/Users/Aneesh/Downloads/Clean CSV1/content_data.csv"
write_to_csv(final_df, csv_file_path)

In [101]:
pip install ipytest


Collecting ipytest
  Downloading ipytest-0.14.0-py3-none-any.whl.metadata (15 kB)
Downloading ipytest-0.14.0-py3-none-any.whl (14 kB)
Installing collected packages: ipytest
Successfully installed ipytest-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
#Test cases

In [102]:
import ipytest

In [120]:
import pytest

# Test case to ensure title field does not accept HTML or quote characters
def test_title_contains_html_or_quotes_failure():
    with pytest.raises(ValueError):
        ContentClass(level="Level I", title="<script>Alert!</script>", topic="Valid Topic")

# Test case to ensure learning_outcomes field does not accept HTML or quote characters
def test_learning_outcomes_contains_html_or_quotes_failure():
    with pytest.raises(ValueError):
        ContentClass(level="Level I", title="Valid Title", topic="Valid Topic", learning_outcomes="Some outcomes<script>")

# Test case to ensure level field does not accept HTML or quote characters
def test_level_contains_html_or_quotes_failure():
    with pytest.raises(ValueError):
        ContentClass(level="<script>", title="Valid Title", topic="Valid Topic")

# Test case to ensure level field validates against a specific pattern
def test_level_wrong_pattern_failure():
    with pytest.raises(ValueError):
        ContentClass(level="Level IV", title="Valid Title", topic="Valid Topic")

# Test case to ensure topic field does not accept HTML or quote characters
def test_topic_contains_html_or_quotes_failure():
    with pytest.raises(ValueError):
        ContentClass(level="Level I", title="Valid Title", topic="<script>Alert!</script>")


In [121]:

# Test case to verify that a ContentClass object can be successfully created with valid data
def test_valid_content_creation_success():
    obj = ContentClass(level="Level II", title="Valid Title", topic="Valid Topic", learning_outcomes="Some outcomes")
    assert obj is not None

# Test case to verify that the learning_outcomes field is optional and can be omitted
def test_missing_learning_outcomes_success():
    obj = ContentClass(level="Level II", title="Valid Title", topic="Valid Topic")
    assert obj.learning_outcomes is None

# Test case to verify that a valid level value of "Level III" is accepted
def test_level_III_success():
    obj = ContentClass(level="Level III", title="Valid Title", topic="Valid Topic", learning_outcomes="Some outcomes")
    assert obj.level == "Level III"

# Additional test case to further verify the acceptance of a valid level value "Level III"
def test_valid_level_III_success():
    obj = ContentClass(level="Level III", title="Valid Title", topic="Valid Topic")
    assert obj.level == "Level III"

# Test case to verify that the title field accepts valid input without HTML or quotes
def test_valid_title_without_html_or_quotes_success():
    obj = ContentClass(level="Level I", title="Valid Title", topic="Valid Topic", learning_outcomes="Valid outcomes")
    assert obj.title == "Valid Title"

In [122]:
ipytest.run('-vv')

platform darwin -- Python 3.10.9, pytest-7.1.2, pluggy-1.0.0 -- /Users/Aneesh/anaconda3/bin/python
cachedir: .pytest_cache
rootdir: /Users/Aneesh
plugins: anyio-3.5.0, cov-4.1.0, mock-3.12.0
[1mcollecting ... [0mcollected 10 items

t_b07197e984a54c518990616af17b5d1f.py::test_title_contains_html_or_quotes_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/176117034.py [32mPASSED[0m[32m [ 10%][0m
t_b07197e984a54c518990616af17b5d1f.py::test_learning_outcomes_contains_html_or_quotes_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/176117034.py [32mPASSED[0m[32m [ 20%][0m
t_b07197e984a54c518990616af17b5d1f.py::test_level_contains_html_or_quotes_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/176117034.py [32mPASSED[0m[32m [ 30%][0m
t_b07197e984a54c518990616af17b5d1f.py::test_level_wrong_pattern_failure <- ../../var/folders/1r/zt6q31651tv0kfm5wtg7t65m0000gq/T/ipykernel_13840/176117034

<ExitCode.OK: 0>