# Analyze Corpus

(C) 2024 by [Damir Cavar](http://damir.cavar.me/)

This notebook provides an overview of the data in the English sub-corpus.

This code is part of [The Hoosier Ellipsis Project](https://nlp-lab.org/ellipsis/) of the [NLP-Lab](http://nlp-lab.org/).

Prerequisites for running this code:

- Install the requirements using `pip install -r requirements.txt`.

In [39]:
import os
import regex as re
import pandas as pd
from ipydatagrid import DataGrid

# Pattern for one corpus entry, read directly off the expression below:
#   - one or more newlines,
#   - a non-empty line captured as `ellipsis`,
#   - a line consisting of exactly `----`,
#   - a non-empty line captured as `fullform`, terminated by a newline,
#   - zero or more following lines that start with `#`.
# re.MULTILINE makes every `^` anchor at the start of a line.
# NOTE(review): because the pattern begins with `\n+`, an entry that sits on
# the very first line of a file (no preceding newline) cannot match, and the
# `fullform` line must end with a newline -- confirm the data files guarantee
# both.
re_entry = re.compile(r"\n+(?P<ellipsis>.+)\n^----\n(?P<fullform>^.+)\n(^#.+\n)*", re.MULTILINE)

def parse_txt_format(filename):
    """Count the corpus entries contained in one text file.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The number of non-overlapping ``re_entry`` matches in the file's
        UTF-8 text, or 0 when the path does not contain the substring
        "data" (such files are never opened).
    """
    # Paths without "data" in them are skipped without touching the disk.
    if "data" not in filename:
        return 0
    with open(filename, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    # Only the number of matches matters, not the captured groups.
    return sum(1 for _ in re_entry.finditer(content))

# Collect one (file stem, entry count) pair for every .txt file below "data"
# that contains at least one entry.
res = []
for folder, _subfolders, names in os.walk("data"):
    for name in names:
        if not name.endswith(".txt"):
            continue
        path = os.path.join(folder, name)
        n_entries = parse_txt_format(path)
        if n_entries > 0:
            stem = os.path.splitext(os.path.basename(path))[0]
            res.append((stem, n_entries))

# Largest counts first; the sort is stable, so ties keep their walk order.
res = sorted(res, key=lambda pair: pair[1], reverse=True)
df = pd.DataFrame(res, columns=("type", "count"))

# Files sharing the same stem (e.g. in different sub-directories) are merged
# into a single row by summing their counts.
totals = {}
for stem, n_entries in res:
    totals[stem] = totals.get(stem, 0) + n_entries
bylang = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
df_bylang = pd.DataFrame(bylang, columns=("type", "count"))

Total = df_bylang['count'].sum()
print("English total:", Total)

# The interactive grid shows the per-file table (`df`), not the merged one.
grid = DataGrid(df)
grid.auto_fit_params = {"area": "body", "padding": 80, "numCols": 2}
grid.auto_fit_columns = True
grid

English total: 575


DataGrid(auto_fit_columns=True, auto_fit_params={'area': 'body', 'padding': 80, 'numCols': 2}, corner_renderer…