In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Directory that is searched for the raw csv files.
data_dirname = Path('../data/raw/')
# Directory that the annotated / combined csv files are written to.
processed_dirname = Path('../data/processed')
# Create the output directory (and any missing parents); a no-op when it already exists.
processed_dirname.mkdir(parents=True, exist_ok=True)

In [7]:
def print_df_stats(df: pd.DataFrame):
    """
    Print a one-line summary of the dataframe: row count, date range and latex total.

    Reads the `CreationDate` column (each value must provide `.date()`) and the `latex`
    column (each value must support `len()`). The reported latex total is the sum of
    `len()` over every entry of the `latex` column.

    NOTE(review): if `latex` holds plain strings (e.g. after a csv round trip) the total
    is a character count rather than an equation count -- confirm the column holds lists.
    """
    creation_days = df['CreationDate'].apply(lambda stamp: stamp.date())  # keep year-month-day only
    first_day, last_day = creation_days.min(), creation_days.max()
    n_rows = len(df)
    latex_total = df['latex'].apply(len).sum()  # add up the per-row lengths
    summary = f'{n_rows} rows from {first_day} -> {last_day}, with {latex_total} latex equations.'
    print(summary)


def combine_data_from_sources(file_type: str):
    """
    Find every `<file_type>.csv` below the `*.com` directories of the data directory and
    combine them all together into a large dataframe.

    For each file found:
      - the csv is read with `CreationDate` parsed as dates,
      - a `source` column is assigned with the name of the `*.com` directory it came from,
        i.e. which forum each row belongs to,
      - a `type` column is assigned with `file_type`, i.e. Posts or Comments,
      - the file path and some basic stats are printed,
      - the annotated dataframe is written to `processed_dirname/<source>/<file_type>.csv`.

    Useful when working with more than one stack exchange forum.

    Parameters:
        file_type: which kind of file to combine, either 'Posts' or 'Comments'.

    Returns:
        One dataframe holding the rows of every file found, with a fresh 0..n-1 index.

    Raises:
        AssertionError: if `file_type` is not 'Posts' or 'Comments'.
        ValueError: if no matching csv file is found.
    """
    assert file_type in ('Posts', 'Comments')
    csv_name = f'{file_type}.csv'
    df_chunks = []
    # Sort the glob results: the filesystem order is arbitrary, and a stable order keeps the
    # row order of the combined dataframe reproducible between runs.
    for data_dir in sorted(data_dirname.rglob('*.com')):
        for csv_filename in sorted(data_dir.rglob(csv_name)):
            df = pd.read_csv(csv_filename, parse_dates=['CreationDate'])
            df['source'] = data_dir.name
            # Use the requested type directly instead of re-deriving it from the file name
            # and overwriting the function argument inside the loop.
            df['type'] = file_type
            print(csv_filename)
            print_df_stats(df)
            df_chunks.append(df)
            save_dirname = processed_dirname / data_dir.name
            save_dirname.mkdir(exist_ok=True, parents=True)
            # index=False: the default RangeIndex carries no information and would otherwise
            # be written as an extra unnamed column, unlike the combined csv files.
            df.to_csv(save_dirname / csv_filename.name, index=False)
    if not df_chunks:
        # pd.concat would fail with a generic "No objects to concatenate" message.
        raise ValueError(f'No {csv_name} files found under {data_dirname}')
    return pd.concat(df_chunks, axis=0, sort=False, ignore_index=True)

In [None]:
# Combine the Posts csv of every source and store the result next to the per-source files.
file_type = 'Posts'
posts = combine_data_from_sources(file_type)
posts.to_csv(
    processed_dirname/'Posts.csv',
    index=False,  # do not write the row index as a column
)

../data/raw/math.stackexchange.com/Posts.csv
2637841 rows from 2010-03-27 -> 2019-06-02, with 531304261 latex equations.


In [None]:
# Combine the Comments csv of every source and store the result next to the per-source files.
file_type = 'Comments'
comments = combine_data_from_sources(file_type)
comments.to_csv(
    processed_dirname/'Comments.csv',
    index=False,  # do not write the row index as a column
)