<a href="https://colab.research.google.com/github/chengolivia/lexicon-maker/blob/main/vocab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vocab List Generator

## Code
Hide and press play

The cell below installs the required packages. If any line in it has been commented out with a leading "#" and a later cell fails with an import error, remove the "#" and run the cell again.

Citation: Johnson, Kyle P., Patrick J. Burns, John Stewart, Todd Cook, Clément Besnier, and William J. B. Mattingly. "The Classical Language Toolkit: An NLP Framework for Pre-Modern Languages." In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations, pp. 20-29. 2021. 10.18653/v1/2021.acl-demo.3

In [None]:
!pip install cltk
!pip install spacy
! python -m spacy download de

Collecting cltk
  Downloading cltk-1.3.0-py3-none-any.whl (695 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m695.7/695.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting boltons<22.0.0,>=21.0.0 (from cltk)
  Downloading boltons-21.0.0-py2.py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.7/193.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4.0,>=3.0 (from cltk)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting greek-accentuation<2.0.0,>=1.2.0 (from cltk)
  Downloading greek_accentuation-1.2.0-py2.py3-none-any.whl (6.8 kB)
Collecting rapidfuzz<4.0.0,>=3.4.0 (from cltk)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m

In [None]:
import string
from os import path, sep
from nltk.stem.snowball import GermanStemmer
from cltk import NLP
import spacy


# Folder that input text files are read from and exported vocab lists are written to.
# It lives under /content/drive, the mount point used by the drive.mount cell.
DRIVE_DIRECTORY = path.join(sep, "content", "drive", "My Drive")

# Language names listed in the error message raised by split_stems for an unsupported language.
SUPPORTED_LANGUAGES = ["German", "Latin"]

def count_words_from_file(text_file, thresh, language=None, use_stems=False):
    """Count the words of a text file in DRIVE_DIRECTORY and keep the frequent ones.

    Args:
        text_file: Name (or relative path) of a UTF-8 text file inside DRIVE_DIRECTORY.
        thresh: Minimum number of occurrences an entry needs in order to be kept.
        language: Language name forwarded to split_stems; only consulted when
            use_stems is true.
        use_stems: When true, count the entries returned by split_stems instead
            of the whitespace-separated words of the cleaned text.

    Returns:
        The dict built by count_above_thresh from the counts.
    """
    source_path = path.join(DRIVE_DIRECTORY, text_file)
    with open(source_path, "r", encoding='utf-8') as handle:
        raw_text = handle.read()
    cleaned = parse_text(raw_text)
    tokens = split_stems(cleaned, language) if use_stems else cleaned.split()
    return count_above_thresh(count_words(tokens), thresh)

def parse_text(text):
  """Normalize raw text into lowercase words separated by single spaces.

  ASCII punctuation and ASCII digits are deleted, the text is lowercased, and
  every run of whitespace (spaces, tabs, line breaks) becomes one space.

  Args:
    text: The raw text to clean.

  Returns:
    The cleaned text, with no leading or trailing whitespace.
  """
  # Delete punctuation and digits in a single pass. The backslash is part of
  # string.punctuation, so literal "\u" / "\x" sequences need no separate
  # replace step: it could never match once the backslash is gone.
  text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
  text = text.lower()
  # Line breaks are whitespace and must separate words. Deleting them outright
  # glued the last word of a line to the first word of the next one.
  return " ".join(text.split())

def count_words(text_arr):
    """Tally how often each non-empty string occurs in text_arr.

    Args:
        text_arr: Iterable of words; empty strings are skipped.

    Returns:
        A dict mapping each word to its number of occurrences, with keys in
        order of first appearance.
    """
    tally = {}
    for token in text_arr:
        if token != "":
            tally[token] = tally.get(token, 0) + 1
    return tally

def count_above_thresh(word_counts, thresh):
  """Keep the entries counted at least thresh times, ordered by word.

  Args:
    word_counts: Dict mapping words to their counts.
    thresh: Minimum count (inclusive) an entry needs in order to be kept.

  Returns:
    A new dict with the surviving entries, keys in ascending sort order
    (plain comparison, so uppercase sorts before lowercase).
  """
  frequent = {word: n for word, n in word_counts.items() if n >= thresh}
  return {word: frequent[word] for word in sorted(frequent)}

def split_stems(text, language):
  """Lemmatize text and return the lemmas in document order.

  Args:
    text: The text to lemmatize.
    language: "German" or "Latin"; matched case-insensitively.

  Returns:
    For German, a list with the lemma_ of every token spaCy produces for text.
    For Latin, the lemmata attribute of the CLTK document.

  Raises:
    ValueError: If language is not a supported language name (including None).
  """
  # Non-string values such as the default None fall through to the error below
  # instead of failing on .lower().
  normalized = language.lower() if isinstance(language, str) else None
  if normalized == "german":
    german_nlp = spacy.load('de_core_news_sm')
    return [token.lemma_ for token in german_nlp(text)]
  if normalized == "latin":
    cltk_nlp = NLP(language="lat", suppress_banner=True)
    # Drop the last process of the default pipeline before analysing.
    # NOTE(review): presumably a step that lemmatization does not need - confirm.
    cltk_nlp.pipeline.processes.pop(-1)
    return cltk_nlp.analyze(text=text).lemmata
  raise ValueError(f"Please choose a supported language for using stems. These are {SUPPORTED_LANGUAGES}.")


In [None]:
# Mount Google Drive at /content/drive; DRIVE_DIRECTORY points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import pandas as pd

def export_to_excel(vocab_list, file_name):
  """Write vocab_list to an .xlsx workbook inside DRIVE_DIRECTORY.

  Args:
    vocab_list: Dict whose keys become the row index and whose values become
      the single data column.
    file_name: Target file name without extension; ".xlsx" is appended.
  """
  target = path.join(DRIVE_DIRECTORY, file_name)
  table = pd.DataFrame.from_dict(vocab_list, "index")
  table.to_excel(f"{target}.xlsx")
  print("success")



def export_to_txt(vocab_list, file_name):
  """Write vocab_list to a text file inside DRIVE_DIRECTORY, one "word:count" per line.

  Args:
    vocab_list: Dict mapping words to counts.
    file_name: Target file name without extension; ".txt" is appended.
  """
  file_name = path.join(DRIVE_DIRECTORY, f"{file_name}.txt")
  # Overwrite rather than append, so re-running the export does not duplicate
  # every entry (export_to_excel also replaces its file). UTF-8 matches the
  # encoding the input text is read with and keeps non-ASCII words intact.
  with open(file_name, 'w', encoding='utf-8') as output:
    output.writelines(f"{key}:{value}\n" for key, value in vocab_list.items())
  print("success")

## Start here

**vocab_list** = count_words_from_file("**filename**", **count_threshold**)

The file must be a plain-text (UTF-8) file stored in your Google Drive.

You may be prompted to download files needed for NLP. Type "yes" to download them, and ask Olivia if the downloads look strange.

In [None]:
  # Count every word (threshold 1) of "Immanuel Kant.txt". With use_stems=False the
  # text is split on whitespace and the language argument is not consulted.
  vocab_list = count_words_from_file("Immanuel Kant.txt", 1, language="German", use_stems=False)
  print(vocab_list)
  print(len(vocab_list))

{'aber': 12, 'abes': 1, 'abfall': 1, 'abgaben': 1, 'abgehalten': 1, 'abgeworfen': 1, 'abwürfe': 1, 'alle': 2, 'allein': 3, 'allem': 1, 'allen': 4, 'aller': 1, 'allerdings': 1, 'allgemeine': 1, 'als': 13, 'alsdann': 1, 'also': 3, 'alten': 1, 'am': 1, 'amt': 1, 'amte': 1, 'amts': 1, 'an': 2, 'anbefohlen': 1, 'andere': 1, 'anderen': 4, 'angenommen': 1, 'angesetzt': 1, 'angestellt': 1, 'angestellter': 1, 'angetroffen': 1, 'anheischig': 1, 'anmerkungen': 1, 'ansehung': 2, 'ansieht': 1, 'antworte': 1, 'anvertrauten': 1, 'art': 1, 'arzt': 1, 'auch': 6, 'aude': 1, 'auf': 4, 'auferlegten': 1, 'aufgewiegelt': 1, 'aufkläre': 1, 'aufklärung': 6, 'aufklärungaufklärung': 1, 'aufklärungfaulheit': 1, 'auflagen': 1, 'auftrag': 1, 'aufzuwerfen': 1, 'aus': 4, 'ausgang': 1, 'ausrichtet': 1, 'ausschreibungen': 1, 'außer': 2, 'bearbeitung': 1, 'bedienen': 5, 'bedient': 1, 'bedingung': 1, 'bedrückung': 1, 'befehls': 1, 'beförderlich': 1, 'bei': 1, 'beinahe': 2, 'beispiel': 1, 'bemühen': 1, 'bequem': 1, 'beru

export_to_**filetype**(**vocab_list**, "**filename** (no extension)")

The exported file will be stored in your Google Drive.

In [None]:
# Save the vocab list as Kant_Frage_vocab_list.txt in DRIVE_DIRECTORY.
export_to_txt(vocab_list, "Kant_Frage_vocab_list")

success


In [None]:
# Save the vocab list as Kant_Frage_vocab_list.xlsx in DRIVE_DIRECTORY.
export_to_excel(vocab_list, "Kant_Frage_vocab_list")

success
