## Build each data processing piece

In [2]:
from pathlib import Path
import subprocess

In [3]:
raw_datadir = Path('../data/raw/')

In [4]:
filepath = raw_datadir / 'math.stackexchange.com.7z'
print(filepath)

../data/raw/math.stackexchange.com.7z


In [5]:
def get_output_dirname(path):
    filename = path.name
    return path.with_suffix('')
    
def open_7z(path):
    output_dirname = get_output_dirname(path)
    output_dirname.mkdir(exist_ok=True, parents=True)
    exe_str = f"7za e {path} -o{output_dirname}"
    
    print(f'Reading data from {path}')
    print(f'Saving data to {output_dirname}')
    print("Running:", exe_str)
    p = subprocess.call(exe_str, shell=True)
    print(f'Done. {p}')
    return output_dirname

In [88]:
open_7z(filepath)

Reading data from ../data/raw/math.stackexchange.com.7z
Saving data to ../data/raw/math.stackexchange.com
Running: 7za e ../data/raw/math.stackexchange.com.7z -o../data/raw/math.stackexchange.com
Done. 0


In [89]:
!ls ../data/raw/math.stackexchange.com

Badges.xml      PostHistory.xml Posts.xml       Users.xml
Comments.xml    PostLinks.xml   Tags.xml        Votes.xml


In [6]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_file(path):
    """Read the xml file from path and return a pandas dataframe with all the data."""
    root = ET.parse(path).getroot()
    rows = root.findall('row')
    return pd.DataFrame([dict(row.items()) for row in rows])

In [25]:
df = parse_xml_file('../data/raw/math.stackexchange.com/Comments.xml')

In [27]:
df.head()

Unnamed: 0,CreationDate,Id,PostId,Score,Text,UserDisplayName,UserId
0,2010-07-20T19:24:44.963,2,9,6,Fantastic answer!,,10
1,2010-07-20T19:26:09.357,3,9,6,"I like this so far, but maybe add a bit on unc...",,16
2,2010-07-20T19:27:07.983,5,17,1,"Oh, I figured as so. I remember learning about...",,40
3,2010-07-20T19:28:16.077,8,17,1,It is much better than an approximation: it gi...,,55
4,2010-07-20T19:29:17.900,10,20,3,You've missed out many types of number: comple...,,35


In [7]:
import re
double_dollar = re.compile(r'\$\$(.*?)\$\$')
single_dollar = re.compile(r'\$(.*?)\$')

def search_for_latex(text):
    singles = re.findall(single_dollar, text)
    doubles = re.findall(double_dollar, text)
    return [m for m in singles+doubles if m != '']
    
math = "abc $$123$$ 456$\\frac{1}{b}$"
re.findall(single_dollar, math)

['', '', '\\frac{1}{b}']

In [61]:
df['latex'] = df.Text.apply(search_for_latex)
df['has_latex'] = df.latex.apply(len)!=0

In [62]:
df[has_latex].head(10)

Unnamed: 0,CreationDate,Id,PostId,Score,Text,UserDisplayName,UserId,latex,has_latex
3,2010-07-20T19:28:16.077,8,17,1,It is much better than an approximation: it gi...,,55,[n],True
12,2010-07-20T19:41:44.767,20,17,6,"By the way, if you want an approximation, you ...",,55,"[|1-\varphi|<1, F(n)={\varphi^n \over {\sqrt 5}}]",True
137,2010-07-21T00:23:24.567,201,194,3,Yes (as has been answered): the heuristic reas...,,536,"[p_1, p_2, \dots, p_n, (1-1/p_n), \prod ( 1 - ...",True
272,2010-07-21T13:11:06.600,439,310,0,What do you mean by $T \subset Z_{10}$? In fac...,,123,"[T \subset Z_{10}, T]",True
276,2010-07-21T13:30:00.707,445,310,0,"Ack, I meant $Z_{15}$. $T$ is the subring of $...",,536,"[Z_{15}, T, Z_{15}]",True
283,2010-07-21T13:54:43.813,455,310,0,Precisely (to give a homomorphism from $Z_n$ t...,,536,"[Z_n, n]",True
292,2010-07-21T14:40:10.350,469,135,2,I find the following pair of wolfram alpha plo...,,167,[x^y],True
332,2010-07-21T19:22:58.920,525,368,7,$\Gamma(n+1)$... does it count? :p,,171,[\Gamma(n+1)],True
334,2010-07-21T19:38:24.607,530,374,0,"Really, I'm looking for something whose comple...",,38,[O\left( n\right)],True
352,2010-07-21T21:11:11.357,559,363,0,How come you can write $x^p + y^p$ as $\prod (...,,123,"[x^p + y^p, \prod (x+\zeta_p^i)]",True


## Put it all together

In [8]:
data_dirname = Path('../data/raw/')

In [16]:
columns = {
        'Posts.xml': 'Body',
        'Comments.xml': 'Text'
          }
for raw_datafile in data_dirname.rglob('*.7z'):  # open each .7z file
    output_dirname = open_7z(raw_datafile)
    for xml_file in output_dirname.rglob('*.xml'):  # find all .xml files
        if xml_file.name in ('Posts.xml', 'Comments.xml'):  # Probably only want the Posts/Comments
            print(xml_file)
            df = parse_xml_file(xml_file)
            text_column = columns[xml_file.name]
            df['latex'] = df[text_column].apply(search_for_latex)
            json_filename = xml_file.with_suffix('.json')
            df.to_json(json_filename)
            print(f'Wrote to {json_filename}')

Reading data from ../data/raw/math.stackexchange.com.7z
Saving data to ../data/raw/math.stackexchange.com
Running: 7za e ../data/raw/math.stackexchange.com.7z -o../data/raw/math.stackexchange.com
Done. 134
../data/raw/math.stackexchange.com/Comments.xml
Wrote to ../data/raw/math.stackexchange.com/Comments.json
../data/raw/math.stackexchange.com/Posts.xml


OSError: [Errno 22] Invalid argument

In [21]:
df.to_json(json_filename)

OSError: [Errno 22] Invalid argument