# Analyze Corpus

(C) 2023-2024 by [Damir Cavar](http://damir.cavar.me/)

This notebook provides an overview of the data in the corpus by language and type.

This code is part of the [NLP-Lab](http://nlp-lab.org/) [Ellipsis Project](https://nlp-lab.org/ellipsis/).

The prerequisite for running this code is:

- Install the requirements using `pip install -r requirements.txt` (the first code cell below does this).

In [None]:
!pip install -r requirements.txt

In [21]:
import os
import regex as re
import pandas as pd
from ipydatagrid import DataGrid
import glob

In [22]:
# Pattern for one corpus entry. In order, it requires:
#   - one or more newlines preceding the entry,
#   - a single non-empty line, captured as "ellipsis",
#   - a separator line consisting of exactly "----",
#   - a single non-empty line, captured as "fullform", followed by a newline,
#   - zero or more newline-terminated lines that start with "#".
re_entry = re.compile(
    r"\n+(?P<ellipsis>.+)\n^----\n(?P<fullform>^.+)\n(^#.+\n)*",
    flags=re.MULTILINE | re.UNICODE,
)

In [24]:
def parse_txt_format(filename, debug=False):
    """Count the corpus entries in a text file.

    The file is read as UTF-8 in one go and scanned with the module-level
    ``re_entry`` pattern; every non-overlapping match counts as one entry.

    Args:
        filename: Path of the text file to scan.
        debug: When true, print the complete file content before counting.

    Returns:
        The number of ``re_entry`` matches found in the file.
    """
    with open(filename, mode='r', encoding='utf-8') as source:
        content = source.read()
    if debug:
        print(content)
    # Only the number of matches matters here, not the captured groups.
    return sum(1 for _ in re_entry.finditer(content))

In [25]:
# Count the entries of every *.txt file in ../data and collect the counts
# per file into a DataFrame, with a final "total" row.
DEBUG = False  # set to True to print each file's content while it is parsed
directory = os.path.join("..", "data", "*.txt")
res = []
lang = "Russian"  # NOTE(review): not referenced by any visible cell; kept in case other cells rely on it
# glob returns paths in arbitrary order; sorting keeps the printed log and the
# relative order of files with equal counts reproducible across machines.
for file in sorted(glob.glob(directory)):
    print(file)
    count = parse_txt_format(file, debug=DEBUG)
    if count > 0:
        # The file name without its extension serves as the type label.
        res.append((os.path.splitext(os.path.basename(file))[0], count))
# Largest counts first; list.sort is stable, so ties stay in file-name order.
res.sort(key=lambda x: x[1], reverse=True)
total = sum(n for _, n in res)
# The grand total is appended after sorting so it is always the last row.
res.append(("total", total))
df = pd.DataFrame(res, columns=("type", "count"), index=None)

../data/stripping.txt
../data/NP_ellipsis.txt
../data/VP_ellipsis.txt
../data/VP_ellipsis_2.txt
../data/gapping_2.txt
../data/gapping.txt
../data/sluicing.txt
../data/verb_stranding.txt
../data/sluicing_2.txt
../data/polarity_ellipsis.txt
../data/NP_ellipsis_2.txt


In [26]:
# Wrap the summary DataFrame in an ipydatagrid DataGrid widget.
grid = DataGrid(df)
# The auto-fit settings are assigned before auto-fitting is switched on below.
# NOTE(review): presumably restricts resizing to the 2 body columns and adds
# 80 px of padding — confirm against the ipydatagrid documentation.
grid.auto_fit_params = {"area": "body", "padding": 80, "numCols": 2}
grid.auto_fit_columns = True
# A bare last expression makes the notebook show the grid as the cell output.
grid

DataGrid(auto_fit_columns=True, auto_fit_params={'area': 'body', 'padding': 80, 'numCols': 2}, corner_renderer…