In [None]:
import re
import warnings

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.text import Text

import requests

# Download the NLTK resources used by this notebook.
# Recent NLTK releases (3.9+) load the Punkt sentence tokenizer models from
# the 'punkt_tab' package instead of 'punkt'; both are requested so that
# sent_tokenize() and word_tokenize() work with old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Silence warnings whose message starts with the given text:
warnings.filterwarnings('ignore', 'This pattern has match groups')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Get and prepare the text of Magna Carta:
magna_carta_response = requests.get(
  'https://raw.githubusercontent.com/' +
  'ddmitov/magna-carta/master/magna-carta.txt',
  # Without a timeout an unresponsive server would block the cell forever:
  timeout=30
)

# Stop here on an HTTP error (404, 5xx, ...) instead of silently
# treating the body of an error page as the text of the charter:
magna_carta_response.raise_for_status()

magna_carta_text = magna_carta_response.text

# Prepare a dictionary with the number and the text of all articles
# for labeling of the concordance lines.
# Articles are separated by an empty line in the source text:
magna_carta_articles_list = magna_carta_text.split('\n\n')

# Keys are zero-padded, two-character article numbers starting at '00';
# values are the article texts with all line breaks removed:
magna_carta_articles_dict = {}
article_number = 0

for article in magna_carta_articles_list:
  article = article.replace('\n', '')

  # Skip the empty chunks so that they do not consume an article number:
  if article:
    magna_carta_articles_dict[str(article_number).zfill(2)] = article
    article_number += 1

# Split the Magna Carta text in sentences and word forms (tokens) -
# one list of tokens per sentence:
magna_carta_forms_list = [
  word_tokenize(sentence) for sentence in sent_tokenize(magna_carta_text)
]

In [None]:
#############################
#### CONCORDANCE KEYWORD ####
#############################
# Edit the two values below to look up another word.
#
# The nominative form is matched as a whole word.
# The root form is matched together with a short ending
# and is used to find the inflected forms of the keyword.

concordance_keyword_nominative_form = 'plena'
concordance_keyword_root_form = 'plen'

#############################
#### CONCORDANCE KEYWORD ####
#############################

In [None]:
# Get all available forms:
# A token is selected when it is exactly the nominative form or
# when it consists of the root followed by an ending of 1 to 4 characters.
# The keywords are escaped so that they are always treated as literal text,
# and the pattern is compiled once instead of being rebuilt for every token.
keyword_form_pattern = re.compile(
  re.escape(concordance_keyword_nominative_form) +
  '|' +
  re.escape(concordance_keyword_root_form) + '.{1,4}'
)

# fullmatch() anchors both alternatives at the start and at the end
# of the token, so the upper limit of the ending length is really enforced:
selected_forms_set = {
  form
  for sentence in magna_carta_forms_list
  for form in sentence
  if keyword_form_pattern.fullmatch(form)
}

selected_forms_sorted_list = sorted(selected_forms_set)

print('Available forms:', selected_forms_sorted_list)

Available forms: ['plenam', 'plene']


In [None]:
# Get a concordance of the selected word:
magna_carta_tokens = word_tokenize(magna_carta_text)
magna_carta_concordance_text = Text(magna_carta_tokens)

for form in selected_forms_sorted_list:
  # concordance_list() returns only the first 25 lines by default.
  # The number of tokens is an upper bound for the number of occurrences,
  # so passing it as the limit guarantees that no occurrence is cut off:
  concordance_list = magna_carta_concordance_text.concordance_list(
    form,
    lines=len(magna_carta_tokens)
  )

  for concordance_line in concordance_list:
    concordance_line_string = str(concordance_line.line)

    # Concordance lines are joined tokens, so punctuation marks are
    # surrounded by spaces. Reattach them to the preceding word to make
    # the line comparable with the article texts.
    # This does not depend on the article, so it is done once per line:
    formatted_concordance_line = (
      concordance_line_string
      .replace(' . ', '. ')
      .replace(' , ', ', ')
      .replace(' ; ', '; ')
    )

    article_found = False

    # Label the line with the number of every article containing it:
    for article_number, article_text in magna_carta_articles_dict.items():
      if formatted_concordance_line in article_text:
        print(
            str(article_number) +
            '. ' +
            concordance_line_string
        )
        article_found = True

    # Do not silently drop an occurrence whose context could not be
    # located in any single article - print it with a placeholder label:
    if not article_found:
      print('??. ' + concordance_line_string)

  print()

05. ejusdem ; et reddat heredi , cum ad plenam etatem pervenerit , terram suam tot
57. grinatione nostra , statim eis inde plenam justitiam exhibebimus , secundum le

02. uerit , et cum decesserit heres suus plene etatis fuerit et relevium debeat , h
62. os et laicos , a tempore discordie , plene omnibus remisimus et condonavimus . 
62. todecimo usque ad pacem reformatam , plene remisimus omnibus , clericis et laic
62.  laicis , et quantum ad nos pertinet plene condonavimus . Et insuper fecimus ei

