In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Where the raw input files are read from.
data_dirname = Path('../data/raw/')

# Where processed output is written; create it (and any missing parents) up front.
# exist_ok makes re-running this cell a no-op once the directory exists.
processed_dirname = Path('../data/processed')
processed_dirname.mkdir(parents=True, exist_ok=True)

In [3]:
def print_df_stats(df: pd.DataFrame) -> None:
    """
    Print the number of rows in ``df`` and the total number of latex equations.

    The total is the sum of ``len()`` over the values in the ``latex`` column
    (presumably one list of equation strings per row -- confirm against the
    upstream extraction step). Values that have no length, such as ``None`` or
    ``NaN`` produced by missing JSON fields, count as zero equations instead of
    raising ``TypeError``.

    Args:
        df: dataframe with a ``latex`` column.

    Raises:
        KeyError: if ``df`` has no ``latex`` column.
    """
    count = len(df)  # n rows

    def _n_equations(cell) -> int:
        # Missing values arrive as None / float NaN, neither of which supports len().
        return len(cell) if hasattr(cell, '__len__') else 0

    # int(...) keeps the printed total an integer even when the column is empty.
    n_latex_equations = int(df['latex'].map(_n_equations).sum())
    print(f'{count} rows from with {n_latex_equations} latex equations.')


def combine_data_from_sources(file_type: str) -> pd.DataFrame:
    """
    Combine every ``<file_type>.json`` file under the raw data directory into one large dataframe.

    Every directory matching ``*.com`` (searched recursively under ``data_dirname``)
    is treated as one source, i.e. one forum such as math, astronomy, etc.
    For each ``<file_type>.json`` found beneath a source directory:

    * a ``source`` column is assigned the source directory name (which forum the row came from),
    * a ``type`` column is assigned ``file_type`` (what kind of text it is, i.e. Posts, Comments),
    * the file path and basic stats are printed,
    * the annotated dataframe is written to ``processed_dirname/<source>/<file_type>.json``.

    Useful when working with more than one stack exchange forum.

    Args:
        file_type: ``'Posts'`` or ``'Comments'``.

    Returns:
        The annotated dataframes concatenated row-wise with a fresh integer index.

    Raises:
        AssertionError: if ``file_type`` is not a supported value.
        ValueError: if no ``<file_type>.json`` file is found.
    """
    assert file_type in ('Posts', 'Comments')
    df_chunks = []
    # rglob yields entries in filesystem order; sort so the combined row order
    # is reproducible from run to run.
    for data_dir in sorted(data_dirname.rglob('*.com')):
        if not data_dir.is_dir():
            continue  # a plain file that happens to end in .com is not a source
        for json_filename in sorted(data_dir.rglob(f'{file_type}.json')):
            df = pd.read_json(json_filename)
            df['source'] = data_dir.name
            # The glob only matches '<file_type>.json', so the file stem is always
            # file_type; use the parameter directly instead of rebinding it.
            df['type'] = file_type
            print(json_filename)
            print_df_stats(df)
            df_chunks.append(df)
            # Keep a per-source copy of the annotated data next to the combined output.
            save_dirname = processed_dirname/data_dir.name
            save_dirname.mkdir(exist_ok=True, parents=True)
            df.to_json(save_dirname/json_filename.name)
    if not df_chunks:
        # pd.concat([]) would raise a ValueError that does not say what was searched.
        raise ValueError(f'No {file_type}.json files found under {data_dirname}')
    return pd.concat(df_chunks, axis=0, sort=False, ignore_index=True)

In [4]:
# Combine the Posts data from every source and store the result as one CSV.
file_type = 'Posts'
posts = combine_data_from_sources(file_type)

posts_csv = processed_dirname / 'Posts.csv'
posts.to_csv(posts_csv)

../data/raw/math.stackexchange.com/Posts.json
2637841 rows from with 23162186 latex equations.


In [5]:
# Combine the Comments data from every source and store the result as one CSV.
file_type = 'Comments'
comments = combine_data_from_sources(file_type)

comments_csv = processed_dirname / 'Comments.csv'
comments.to_csv(comments_csv)

../data/raw/math.stackexchange.com/Comments.json
4769204 rows from with 5677017 latex equations.


In [6]:
# Sanity check: number of comment rows whose `latex` value is missing.
comments['latex'].isna().sum()

0