# Analyze Corpus

(C) 2023 by [Damir Cavar](http://damir.cavar.me/)

This notebook provides an overview of the data in the corpus by language and type.

This code is part of the [NLP-Lab](http://nlp-lab.org/) [Ellipsis Project](https://nlp-lab.org/ellipsis/).

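The prerequisites for running this code are:

- Install the requirements using `pip install -r requirements.txt`.
- Start the notebook from the folder that contains the `corpus` directory; the code reads it through a relative path and expects one sub-folder per three-letter language code.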

In [9]:
import os
import regex as re
import pandas as pd
from ipydatagrid import DataGrid
import langcodes

In [10]:
# Pattern for one corpus entry in the plain-text format:
#   one or more newlines
#   a line captured as group "ellipsis"
#   a line consisting of "----"
#   a line captured as group "fullform"
#   zero or more following lines that start with "#"
# re.MULTILINE makes every "^" anchor at the start of a line.
# NOTE(review): because of the leading "\n+", an entry that begins at the very
# first character of a file (no newline before it) is not matched - confirm
# that every corpus file starts with a blank line or a header.
re_entry = re.compile(r"\n+(?P<ellipsis>.+)\n^----\n(?P<fullform>^.+)\n(^#.+\n)*", re.MULTILINE) # |re.UNICODE

In [11]:
def parse_txt_format(filename, debug=False):
    """Count the corpus entries contained in a plain-text file.

    Only paths that contain the substring "ellipsis" are opened; every other
    path yields 0 without the file being read.

    Args:
        filename: Path of the file to inspect.
        debug: When true, print the complete file text before counting.

    Returns:
        The number of non-overlapping ``re_entry`` matches in the file, or 0
        when the path is skipped.
    """
    if "ellipsis" not in filename:
        return 0
    with open(filename, mode='r', encoding='utf-8') as source:
        content = source.read()
    if debug:
        print(content)
    # finditer yields one match object per entry; only their number matters.
    return sum(1 for _ in re_entry.finditer(content))

In [12]:
directory = "corpus"
DEBUG = False  # set to True to print the text of every parsed file

# Language folders carry three-letter names. os.listdir() returns its entries
# in arbitrary order and also lists plain files, so keep only real directories
# and sort them: the traversal, and with it the order of rows that have equal
# counts, is then the same on every run.
langdirs = sorted(
    x for x in os.listdir(directory)
    if len(x) == 3 and os.path.isdir(os.path.join(directory, x))
)

# One (language name, file stem, entry count) record per file with entries.
res = []
for lfolder in langdirs:
    lang = langcodes.Language.get(lfolder).display_name()
    if lfolder == "chn":
        # Diagnostic: show the name langcodes reports for "chn", then relabel
        # the folder with the name used in this overview.
        print(lang)
        lang = "Mandarin Chinese"
    if lang.startswith("Unknown"):
        # Display names starting with "Unknown" are replaced by the raw
        # folder name.
        lang = lfolder
    for subdir, dirs, files in os.walk(os.path.join(directory, lfolder)):
        dirs.sort()  # in-place edit is honoured by os.walk: fixed descent order
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue  # other formats (e.g. .xml) are not counted here
            full_path = os.path.join(subdir, file)
            count = parse_txt_format(full_path, debug=DEBUG)
            if count > 0:
                res.append((lang, os.path.splitext(file)[0], count))

# Per-file table, largest counts first (the sort is stable, so ties keep the
# traversal order).
res.sort(key=lambda x: x[2], reverse=True)
df = pd.DataFrame(res, columns=["lang", "type", "count"])

# Per-language totals, largest first; `bylang` ends up as a list of
# (language name, total) pairs.
totals = {}
for lang_name, _, n_entries in res:
    totals[lang_name] = totals.get(lang_name, 0) + n_entries
bylang = sorted(totals.items(), key=lambda x: x[1], reverse=True)
df_bylang = pd.DataFrame(bylang, columns=["lang", "count"])

Chinook Jargon


In [13]:
# Interactive grid of the per-file table `df` (columns lang, type, count).
# Column widths are auto-fitted using the parameters set below.
# NOTE(review): "numCols": 2 presumably limits auto-fitting to two columns -
# confirm against the ipydatagrid documentation.
grid = DataGrid(df)
grid.auto_fit_params = {"area": "body", "padding": 80, "numCols": 2}
grid.auto_fit_columns = True
grid

DataGrid(auto_fit_columns=True, auto_fit_params={'area': 'body', 'padding': 80, 'numCols': 2}, corner_renderer…

In [14]:
# Interactive grid of the per-language totals, built with the widget's
# default settings.
DataGrid(df_bylang)

DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…

In [15]:
# Plain-text rendering of the per-language totals.
# NOTE(review): presumably kept so the numbers remain readable where the
# DataGrid widget is not rendered - confirm before replacing it.
print(df_bylang)

                lang  count
0            English    569
1             Arabic    375
2            Russian    161
3            Spanish    144
4           Japanese    113
5              Hindi    112
6             Polish     91
7            Kumaoni     84
8          Ukrainian     81
9             German     79
10            Korean     40
11  Mandarin Chinese     40
12          Gujarati      9
13           Swedish      8
14            Navajo      8
15          Croatian      6


In [16]:
# NOTE(review): scratch calculation; nothing else in the visible notebook
# uses its result - consider removing this cell.
2**7


128