In [14]:
# Install p7zip through Homebrew (so this cell assumes macOS with `brew` available).
# open_7z() further down shells out to the `7za` executable, which p7zip is expected to provide.
! brew install p7zip

To reinstall 16.02_1, run `brew reinstall p7zip`


In [15]:
from pathlib import Path
import subprocess
import os

In [16]:
# Directory that holds the downloaded *.7z archives; the path is relative to the
# notebook's working directory.
raw_datadir = Path('../data/raw/')

In [17]:
# Single archive used to try the extraction helper before looping over every archive.
filepath = raw_datadir / 'biology.stackexchange.com.7z'
print(filepath)

../data/raw/biology.stackexchange.com.7z


In [18]:
def get_output_dirname(path):
    """Return the directory an archive should be extracted into.

    The result is ``path`` with its final suffix removed, e.g.
    ``../data/raw/biology.stackexchange.com.7z`` ->
    ``../data/raw/biology.stackexchange.com``.

    Args:
        path: Archive location as a ``pathlib.Path`` or a ``str``.

    Returns:
        pathlib.Path: ``path`` without its last suffix. A path that has no
        suffix is returned unchanged.
    """
    # Coerce so plain strings work too; only the last suffix is stripped.
    return Path(path).with_suffix('')
    
def open_7z(path):
    """Extract a 7z archive into a directory named after the archive.

    The target directory is ``get_output_dirname(path)`` and is created
    (including parents) when missing. Extraction runs
    ``7za e <archive> -o<directory>`` as a child process.

    Args:
        path: Location of the ``.7z`` archive (``pathlib.Path`` or ``str``).

    Returns:
        None. Progress and the 7za exit status are printed.

    Raises:
        FileNotFoundError: If the ``7za`` executable cannot be found.
    """
    path = Path(path)
    output_dirname = get_output_dirname(path)
    output_dirname.mkdir(exist_ok=True, parents=True)
    # Pass an argument list instead of a shell string: paths that contain
    # spaces or shell metacharacters reach 7za unchanged and no shell runs.
    command = ["7za", "e", str(path), f"-o{output_dirname}"]

    print(f'Reading data from {path}')
    print(f'Saving data to {output_dirname}')
    print("Running:", " ".join(command))
    returncode = subprocess.call(command)
    if returncode == 0:
        print(f'Done. {returncode}')
    else:
        # A non-zero status means 7za did not finish cleanly; say so instead
        # of reporting "Done".
        # NOTE(review): status 134 was seen when extracting into a directory
        # that already held the files - presumably an overwrite prompt; confirm
        # and consider an explicit overwrite switch.
        print(f'7za failed with exit code {returncode} for {path}')
    return

In [19]:
# Trial run: extract the single biology archive selected above.
open_7z(filepath)

Reading data from ../data/raw/biology.stackexchange.com.7z
Saving data to ../data/raw/biology.stackexchange.com
Running: 7za e ../data/raw/biology.stackexchange.com.7z -o../data/raw/biology.stackexchange.com
Done. 0


In [20]:
# List the extraction directory to check which XML files the archive produced.
!ls ../data/raw/biology.stackexchange.com

Badges.xml      PostHistory.xml Posts.xml       Users.xml
Comments.xml    PostLinks.xml   Tags.xml        Votes.xml


In [21]:
# Go through every entry of the raw data directory and extract the ones whose
# extension is '.7z'; all other entries are skipped.
file_list = os.listdir(raw_datadir)
for file_name in file_list:
    extension = os.path.splitext(file_name)[1]
    if extension != '.7z':
        continue
    print(file_name)
    path = raw_datadir / file_name
    open_7z(path)


math.stackexchange.com.7z
Reading data from ../data/raw/math.stackexchange.com.7z
Saving data to ../data/raw/math.stackexchange.com
Running: 7za e ../data/raw/math.stackexchange.com.7z -o../data/raw/math.stackexchange.com
Done. 0
chemistry.stackexchange.com.7z
Reading data from ../data/raw/chemistry.stackexchange.com.7z
Saving data to ../data/raw/chemistry.stackexchange.com
Running: 7za e ../data/raw/chemistry.stackexchange.com.7z -o../data/raw/chemistry.stackexchange.com
Done. 0
physics.stackexchange.com.7z
Reading data from ../data/raw/physics.stackexchange.com.7z
Saving data to ../data/raw/physics.stackexchange.com
Running: 7za e ../data/raw/physics.stackexchange.com.7z -o../data/raw/physics.stackexchange.com
Done. 0
biology.stackexchange.com.7z
Reading data from ../data/raw/biology.stackexchange.com.7z
Saving data to ../data/raw/biology.stackexchange.com
Running: 7za e ../data/raw/biology.stackexchange.com.7z -o../data/raw/biology.stackexchange.com
Done. 134


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_file(path):
    """Read the xml file from path and return a pandas dataframe with all the data.

    Every ``<row>`` element that is a direct child of the document root
    becomes one DataFrame row; its XML attributes become the columns (values
    are the attribute strings, attributes missing from a row are filled in by
    pandas as NaN).

    The document is read incrementally with ``ET.iterparse`` and elements are
    dropped as soon as they have been converted, so the complete element tree
    is never held in memory next to the collected records.

    Args:
        path: File name or binary file object accepted by ``ET.iterparse``.

    Returns:
        pandas.DataFrame: One row per ``<row>`` element; empty if there are none.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
    """
    records = []
    root = None
    depth = 0  # number of currently open elements
    for event, elem in ET.iterparse(path, events=('start', 'end')):
        if event == 'start':
            if root is None:
                root = elem  # the first element opened is the document root
            depth += 1
            continue
        depth -= 1
        if depth == 1:
            # `elem` is a direct child of the root that has just been closed.
            if elem.tag == 'row':
                records.append(dict(elem.attrib))
            # Detach processed children so memory use stays flat.
            root.clear()
    return pd.DataFrame(records)

In [None]:
def store_comments_csv(subject, raw_dir='../data/raw/', csv_dir='../data/csv/'):
    """Convert the Comments.xml file of one subject into a CSV file.

    Reads ``<raw_dir>/<subject>.stackexchange.com/Comments.xml`` with
    ``parse_xml_file`` and writes the resulting DataFrame, without its index,
    to ``<csv_dir>/<subject>_comment.csv``.

    Args:
        subject: Site name prefix, e.g. ``'math'``.
        raw_dir: Directory containing the extracted ``*.stackexchange.com``
            folders. Defaults to the location used throughout this notebook.
        csv_dir: Directory the CSV is written to; created when missing.

    Returns:
        None.
    """
    xml_path = os.path.join(raw_dir, subject + '.stackexchange.com', 'Comments.xml')
    csv_path = os.path.join(csv_dir, subject + "_comment.csv")

    print('parsing xml file of ' + subject + ' comments')
    subject_comment = parse_xml_file(xml_path)
    print('storing csv file of ' + subject + ' comments')
    # to_csv does not create missing directories, so make sure the target exists.
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    subject_comment.to_csv(csv_path, index=False)
    return

In [None]:
def store_posts_csv(subject, raw_dir='../data/raw/', csv_dir='../data/csv/'):
    """Convert the Posts.xml file of one subject into a CSV file.

    Reads ``<raw_dir>/<subject>.stackexchange.com/Posts.xml`` with
    ``parse_xml_file`` and writes the resulting DataFrame, without its index,
    to ``<csv_dir>/<subject>_post.csv``.

    Args:
        subject: Site name prefix, e.g. ``'math'``.
        raw_dir: Directory containing the extracted ``*.stackexchange.com``
            folders. Defaults to the location used throughout this notebook.
        csv_dir: Directory the CSV is written to; created when missing.

    Returns:
        None.
    """
    xml_path = os.path.join(raw_dir, subject + '.stackexchange.com', 'Posts.xml')
    csv_path = os.path.join(csv_dir, subject + "_post.csv")

    print('parsing xml file of ' + subject + ' posts')
    subject_post = parse_xml_file(xml_path)
    print('storing csv file of ' + subject + ' posts')
    # to_csv does not create missing directories, so make sure the target exists.
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    subject_post.to_csv(csv_path, index=False)
    return

In [None]:
# Site name prefixes to convert; each must match an extracted
# '<subject>.stackexchange.com' directory under ../data/raw/.
subject_list = ['math', 'physics', 'biology', 'chemistry']

In [None]:
# For every subject, write its Comments.xml and then its Posts.xml out as CSV.
# Each call parses a complete XML file, so this cell can take a while on the larger sites.
for subject in subject_list:
    store_comments_csv(subject)
    store_posts_csv(subject)

parsing xml file of math comments
storing csv file of math comments
parsing xml file of math posts
storing csv file of math posts
