In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import html
import io
import xml.etree.ElementTree as ET
from tqdm import tqdm
from xml.sax.saxutils import escape

In [2]:
folder_path = '/srv/storage/idmctal@storage1.nancy.grid5000.fr/2023/m2/adrelingyte/data_mining/blogs'

pattern = re.compile(r'(\d+)\.(\w+)\.(\d+)\.(\w+)\.(\w+)')

data = []
unicode_errors_list = []
parsing_errors_list = []

def replace_ampersand(text):
    # Replace & with &amp; if it's not already in the &amp; format
    return re.sub(r'&(?!amp;)', '&amp;', text)

def encode_lt_gt_except(text, exceptions):
    parts = []
    in_exception = False

    for char in text:
        if in_exception:
            if char == '>':
                in_exception = False
                parts.append('>')
            else:
                parts.append(char)
        else:
            if char == '<':
                for exception in exceptions:
                    if text.startswith(exception, len(parts)):
                        in_exception = True
                        parts.append('<')
                        break
                else:
                    parts.append('&lt;')
            elif char == '>':
                parts.append('&gt;')
            else:
                parts.append(char)

    return ''.join(parts)

exceptions = ["<post>", "<date>", "<Blog>", "</post>", "</date>", "</Blog>"]

file_list = [file for file in os.listdir(folder_path) if re.match(pattern, file)]

for file_name in tqdm(file_list, desc="Processing XML files"):
    match = re.match(pattern, file_name)

    try:
        number, gender, age, genre, zodiac = match.groups()
        xml_file_path = os.path.join(folder_path, file_name)
        with open(xml_file_path, 'r', encoding='utf-8') as file:
            xml_content = file.read()

        # Apply encoding before replacing & with &amp;
        modified_xml_content = encode_lt_gt_except(xml_content, exceptions)
        modified_xml_content = replace_ampersand(modified_xml_content)

        tree = ET.ElementTree(ET.fromstring(modified_xml_content))
        root = tree.getroot()

        dates = [date.text for date in root.findall(".//date")]
        posts = [post.text for post in root.findall(".//post")]

        for date, post in zip(dates, posts):
            data.append({
                "Number": number,
                "Gender": gender,
                "Age": age,
                "Post Genre": genre,
                "Zodiac Sign": zodiac,
                "Date": date,
                "Post": post
            })

    except ET.ParseError:
        parsing_errors_list.append(xml_file_path)
        continue
    except UnicodeDecodeError as decode_error:
        unicode_errors_list.append(xml_file_path)
        continue


Processing XML files:   0%|          | 26/18266 [00:00<01:29, 202.91it/s]

Processing XML files: 100%|██████████| 18266/18266 [01:22<00:00, 220.22it/s]


In [3]:
df = pd.DataFrame(data)
df.shape

(474213, 7)

In [None]:
df