# Analyze Corpus

(C) 2023 by [Damir Cavar](http://damir.cavar.me/)

This notebook provides an overview of the data in the corpus by language and type.

This code is part of the [NLP-Lab](http://nlp-lab.org/) [Ellipsis Project](https://nlp-lab.org/ellipsis/).

The prerequisites for this code to run are:

- install the requirements using `pip install -r requirements.txt`

In [17]:
import os
import regex as re
import pandas as pd
from ipydatagrid import DataGrid
import langcodes

In [18]:
# Pattern for one corpus entry. The adjacent raw-string pieces below are
# concatenated by Python into a single pattern:
#   - one or more newlines, then the line captured as "ellipsis"
#   - a separator line consisting of exactly "----"
#   - the line captured as "fullform"
#   - zero or more following lines that start with "#"
# MULTILINE lets "^" anchor at the start of every line, not only at the start
# of the text.
re_entry = re.compile(
    r"\n+(?P<ellipsis>.+)\n"
    r"^----\n"
    r"(?P<fullform>^.+)\n"
    r"(^#.+\n)*",
    re.MULTILINE,
)

In [19]:
def parse_txt_format(filename, debug=False):
    """Count the corpus entries contained in a UTF-8 text file.

    The file is read into memory in one go and scanned with the module-level
    ``re_entry`` pattern; each non-overlapping match counts as one entry.

    Args:
        filename: Path of the text file to read.
        debug: Accepted for interface compatibility; currently unused.

    Returns:
        int: Number of ``re_entry`` matches in the file (0 when there are none).
    """
    with open(filename, mode='r', encoding='utf-8') as ifp:
        content = ifp.read()
    # Only the number of matches matters, so the match objects are discarded.
    return sum(1 for _ in re_entry.finditer(content))

In [None]:
# Count the entries of every .txt corpus file in `directory` (the distractors
# file excluded) and tabulate the counts per file (`df`) and per language
# (`df_bylang`).
directory = "data"
# NOTE(review): the language code is hard-coded, so every file is attributed to
# "ara" -- TODO derive it from the corpus layout when more languages are added.
lang = "ara"
res = []
# os.listdir() returns entries in arbitrary order; sorting the listing makes the
# order of rows with equal counts reproducible across runs and platforms.
for file in sorted(os.listdir(directory)):
    if not file.endswith(".txt") or file == "Distractors.txt":
        continue  # other files (e.g. .xml) are not analyzed in this notebook
    full_path = os.path.join(directory, file)
    count = parse_txt_format(full_path)
    if count > 0:
        # "type" is the file name without its extension.
        res.append((lang, os.path.splitext(file)[0], count))
# Largest counts first; list.sort() is stable, so ties stay in file-name order.
res.sort(key=lambda x: x[2], reverse=True)
df = pd.DataFrame(res, columns=("lang", "type", "count"))

# Sum the per-file counts for each language, again largest first.
bylang = {}
for r in res:
    bylang[r[0]] = bylang.get(r[0], 0) + r[2]
bylang = sorted(bylang.items(), key=lambda x: x[1], reverse=True)
df_bylang = pd.DataFrame(bylang, columns=("lang", "count"))

In [21]:
# Show the per-file counts (`df`) in an interactive ipydatagrid widget.
grid = DataGrid(df)
# The auto-fit settings are assigned before auto-fitting is switched on below.
# NOTE(review): "numCols": 2 presumably limits auto-fitting to two columns --
# confirm against the ipydatagrid documentation.
grid.auto_fit_params = {"area": "body", "padding": 80, "numCols": 2}
grid.auto_fit_columns = True
# A bare last expression makes the notebook display the widget.
grid

DataGrid(auto_fit_columns=True, auto_fit_params={'area': 'body', 'padding': 80, 'numCols': 2}, corner_renderer…

In [22]:
# Show the per-language totals (`df_bylang`) in a grid with default settings.
DataGrid(df_bylang)

DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…

In [23]:
# Plain-text print of the per-language totals (same data as the grid above).
print(df_bylang)

  lang  count
0  ara    605
