In [7]:
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib notebook

Path.ls = lambda x: [o for o in x.iterdir()]

In [28]:
data_dirname = Path('../data/raw/')
processed_dirname = Path('../data/processed')
processed_dirname.mkdir(exist_ok=True, parents=True)

In [29]:
def print_df_stats(df: pd.DataFrame):
    """
    Print some basic info about the dataframe.
    """
    count = df.shape[0]  # n rows
    n_latex_equations = df['latex'].apply(len)
    n_latex_equations = n_latex_equations.sum()  # count the total number of latex equations
    print(f'{count} rows from with {n_latex_equations} latex equations.')


def combine_data_from_sources(file_type: str):
    """
    Find all of the .csv files in the data directory and combine them all together into a large dataframe.
    Assign a new column which tells me which forum each row came from, i.e. math, astronomy, etc.
    Assign a new column which tells me what kind of text this is, i.e. Post, Comment, etc.
    Useful when working with more than one stack exchange forum.
    """
    assert file_type in ('Posts', 'Comments')
    df_chunks = []
    for data_dir in data_dirname.rglob('*.com'):
        for json_filename in data_dir.rglob(f'{file_type}.json'):
            file_type = json_filename.with_suffix('').name  # Posts, Comments, etc.
            df = pd.read_json(json_filename)
            df['source'] = data_dir.name  # physics.stackexchange.com, biology.stackexchange.com, ...
            df['type'] = file_type
            print(json_filename)
            print_df_stats(df)
            df_chunks.append(df)
            save_dirname = processed_dirname/data_dir.name
            save_dirname.mkdir(exist_ok=True, parents=True)
            df.to_json(save_dirname/json_filename.name)
    df = pd.concat(df_chunks, axis=0, ignore_index=True)
    return df

In [30]:
file_type = 'Posts'
posts = combine_data_from_sources(file_type)
posts.to_csv(processed_dirname/'Posts.csv')

../data/raw/physics.stackexchange.com/Posts.json
345199 rows from with 1339195 latex equations.
../data/raw/chemistry.stackexchange.com/Posts.json
66387 rows from with 198968 latex equations.
../data/raw/biology.stackexchange.com/Posts.json
48853 rows from with 12554 latex equations.


In [31]:
file_type = 'Comments'
comments = combine_data_from_sources(file_type)
comments.to_csv(processed_dirname/'Comments.csv')

../data/raw/physics.stackexchange.com/Comments.json
675300 rows from with 248766 latex equations.
../data/raw/chemistry.stackexchange.com/Comments.json
108799 rows from with 22883 latex equations.
../data/raw/biology.stackexchange.com/Comments.json
84776 rows from with 1154 latex equations.


In [32]:
def row_to_labeled_equation(row: pd.Series):
    for equation in row['latex']:
        yield {
            'source': row['source'],
            'post_id': row['Id'],
            'score': row['Score'],
            'user_id': row.get('UserId', row.get('OwnerUserId')),
            'type': row['type'],
            'equation': equation

        }

In [33]:
equations = []
_ = comments.apply(lambda row: [equations.append(e) for e in row_to_labeled_equation(row)], axis=1)
_ = posts.apply(lambda row: [equations.append(e) for e in row_to_labeled_equation(row)], axis=1)

In [34]:
eq_df = pd.DataFrame(equations)

In [35]:
eq_df.sample(10)

Unnamed: 0,equation,post_id,score,source,type,user_id
1385277,A,60453,1,physics.stackexchange.com,Posts,20747.0
1271912,S,464449,0,physics.stackexchange.com,Posts,224501.0
1770031,\Psi_{3},8928,12,chemistry.stackexchange.com,Posts,189.0
1565969,in the dual,116470,2,physics.stackexchange.com,Posts,955.0
69399,\lambda_{;bc}^{x}-\lambda_{;cb}^{x}=R_{\phanto...,396245,0,physics.stackexchange.com,Comments,4075.0
1472514,"(\hat{x}_i,\hat{p}_i)",86944,2,physics.stackexchange.com,Posts,21487.0
774649,T_2,305308,2,physics.stackexchange.com,Posts,141740.0
1791283,\pu{0.020 mol},111845,0,chemistry.stackexchange.com,Posts,34226.0
345100,"\vec{i},\vec{j},\vec{k}",159561,6,physics.stackexchange.com,Posts,43351.0
84043,W,463089,0,physics.stackexchange.com,Comments,86781.0


In [36]:
eq_df.equation.apply(len).hist(bins=np.arange(0,128))
plt.xlabel('N characters')

<IPython.core.display.Javascript object>

Text(0.5,0,'N characters')

In [47]:
keep_condition = (
    (eq_df.equation.apply(len) > 2) &
    (~eq_df.equation.isnull())
    # (eq_df.equation.apply(lambda x: '\\ce' not in x))
)
print(f'Keeping {keep_condition.mean()*100:0.2f}% of the data, {keep_condition.sum()} total rows')

Keeping 79.91% of the data, 1457193 total rows


In [48]:
path = processed_dirname/'equations.csv'
print(f'Saving to {path}')
print('Null data:', eq_df[keep_condition].equation.isnull().sum())
eq_df[keep_condition].to_csv(path, index=False)

Saving to ../data/processed/equations.csv
Null data: 0
