## Build each data processing piece

In [1]:
from pathlib import Path
import subprocess

Path.ls = lambda x: [o for o in x.iterdir()]

In [2]:
raw_datadir = Path('../data/raw/')

In [4]:
def get_output_dirname(path):
    filename = path.name
    return path.with_suffix('')
    
def open_7z(path):
    output_dirname = get_output_dirname(path)
    output_dirname.mkdir(exist_ok=True, parents=True)
    exe_str = f"7za e {path} -o{output_dirname}"
    
    print(f'Reading data from {path}')
    print(f'Saving data to {output_dirname}')
    print("Running:", exe_str)
    p = subprocess.call(exe_str, shell=True)
    print(f'Done. {p}')
    return output_dirname

In [5]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_file(path):
    """Read the xml file from path and return a pandas dataframe with all the data."""
    root = ET.parse(path).getroot()
    rows = root.findall('row')
    return pd.DataFrame([dict(row.items()) for row in rows])

In [6]:
import re
double_dollar = re.compile(r'\$\$(.*?)\$\$')
single_dollar = re.compile(r'\$(.*?)\$')

def search_for_latex(text):
    singles = re.findall(single_dollar, text)
    doubles = re.findall(double_dollar, text)
    return [m for m in singles+doubles if m != '']
    
math = "abc $$123$$ 456$\\frac{1}{b}$"
re.findall(single_dollar, math)

['', '', '\\frac{1}{b}']

## Put it all together

In [7]:
data_dirname = Path('../data/raw/')

In [8]:
columns = {
        'Posts.xml': 'Body',
        'Comments.xml': 'Text'
          }
for raw_datafile in data_dirname.rglob('*.7z'):  # open each .7z file
    output_dirname = open_7z(raw_datafile)
    for xml_file in output_dirname.rglob('*.xml'):  # find all .xml files
        if xml_file.name in ('Posts.xml', 'Comments.xml'):  # Probably only want the Posts/Comments
            print(xml_file)
            df = parse_xml_file(xml_file)
            text_column = columns[xml_file.name]
            df['latex'] = df[text_column].apply(search_for_latex)
            json_filename = xml_file.with_suffix('.json')
            df.to_json(json_filename)
            print(f'Wrote to {json_filename}')

Reading data from ../data/raw/chemistry.stackexchange.com.7z
Saving data to ../data/raw/chemistry.stackexchange.com
Running: 7za e ../data/raw/chemistry.stackexchange.com.7z -o../data/raw/chemistry.stackexchange.com
Done. 134
../data/raw/chemistry.stackexchange.com/Comments.xml
Wrote to ../data/raw/chemistry.stackexchange.com/Comments.json
../data/raw/chemistry.stackexchange.com/Posts.xml
Wrote to ../data/raw/chemistry.stackexchange.com/Posts.json
Reading data from ../data/raw/physics.stackexchange.com.7z
Saving data to ../data/raw/physics.stackexchange.com
Running: 7za e ../data/raw/physics.stackexchange.com.7z -o../data/raw/physics.stackexchange.com
Done. 134
../data/raw/physics.stackexchange.com/Comments.xml
Wrote to ../data/raw/physics.stackexchange.com/Comments.json
../data/raw/physics.stackexchange.com/Posts.xml
Wrote to ../data/raw/physics.stackexchange.com/Posts.json
Reading data from ../data/raw/biology.stackexchange.com.7z
Saving data to ../data/raw/biology.stackexchange.com
