# In [None]:
from google.colab import files
import PyPDF2
from docx import Document
import os
import nltk
import string
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
import matplotlib.pyplot as plt


# Handle files of different formats and convert their content to text
def downloading(filename, content):
  """Extract plain text from the raw bytes of an uploaded file.

  The parser is chosen by the file extension (case-insensitive). Documents
  are parsed directly from memory, so no copy is written to the working
  directory.

  Args:
    filename: Name of the uploaded file (.txt, .docx or .pdf).
    content: Raw bytes of the file.

  Returns:
    The extracted text, or None when the format is not supported or the
    content cannot be decoded / parsed (a message is printed in that case).
  """
  import io  # local import: only this function needs in-memory streams

  lowered = filename.lower()

  if lowered.endswith(".txt"):
    try:
      # "utf-8-sig" decodes plain UTF-8 as well and drops a leading BOM
      return content.decode("utf-8-sig")
    except UnicodeDecodeError as error:
      print(f"Error decoding file {filename} content:\n{error}")
      return None

  if lowered.endswith(".docx"):
    try:
      doc = Document(io.BytesIO(content))
      return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as error:
      # A damaged or mislabeled file must not crash the whole upload
      print(f"File {filename} could not be read as .docx:\n{error}")
      return None

  if lowered.endswith(".pdf"):
    try:
      pdfreader = PyPDF2.PdfReader(io.BytesIO(content))
      # Guard against pages for which no text is returned
      return "\n".join(page.extract_text() or "" for page in pdfreader.pages)
    except Exception as error:
      print(f"File {filename} could not be read as .pdf:\n{error}")
      return None

  print(f"File format is not supported: {filename}, try another one!")
  return None

# Strip a trailing " (n)" copy counter from a file name, keeping the extension
def _canonical_name(filename):
  """Return filename without a trailing " (<digits>)" before the extension.

  Example: "report (1).pdf" -> "report.pdf"; "notes (draft).txt" is unchanged.
  Presumably the counter is what the upload widget appends to a repeated
  name -- confirm against the Colab behaviour.
  """
  stem, extension = os.path.splitext(filename)
  head, separator, tail = stem.rpartition(" (")
  if separator and tail.endswith(")") and tail[:-1].isdigit():
    stem = head
  return stem + extension


# Add new documents to the program
def add(texts):
  """Ask the user to upload files and append their text to texts.

  A file is treated as a duplicate only when its name (ignoring a trailing
  " (n)" copy counter) equals the name of an already stored document, so
  different names that merely contain one another, or the same name with a
  different extension, are accepted.

  Args:
    texts: List of {"filename": str, "content": str} records; extended in
      place.

  Returns:
    The same texts list.
  """
  print("Attention! Supported formats: .docx / .pdf / .txt")
  print("Remark: Please, upload files with different names to avoid duplicates\nor upload files with the same names but different formats")
  uploaded = files.upload()

  if not uploaded:
    print("Files not uploaded!")
    return texts

  known = {_canonical_name(item["filename"]) for item in texts}

  for filename, content in uploaded.items():
    if not content:
      print(f"File {filename} is empty and was skipped!")
      continue

    canonical = _canonical_name(filename)
    if canonical in known:
      print(f"File ({canonical}) already exists, try another one or rename the file!")
      continue

    text = downloading(filename, content)
    if text is None:
      # downloading() has already reported the reason
      continue
    if not text:
      print(f"File {filename} contains no text and was skipped!")
      continue

    texts.append({
      "filename": filename,
      "content": text
    })
    known.add(canonical)
    print(f"File {filename} was uploaded!")
  return texts

# Remove a document from the program
def remove(texts):
  """Ask for a file name and delete the matching document from texts.

  Args:
    texts: List of {"filename": str, "content": str} records; modified in
      place.

  Returns:
    The same texts list on every path.
  """
  if not texts:
    print("No files to delete!")
    return texts

  print("\nAll documents:")
  for item in texts:
    print(item["filename"])

  try:
    filename = input("\nEnter file to delete ( example: test.txt ):\n").strip()
    # Walk backwards so the most recently added match is the one removed
    for index in range(len(texts) - 1, -1, -1):
      if texts[index]["filename"] == filename:
        deletefile = texts.pop(index)
        # Single quotes inside the f-string keep this valid before Python 3.12
        print(f"File {deletefile['filename']} deleted!")
        break
    else:
      print(f"File {filename} not exists!")
  except Exception as error:
    print(f"Error with deleting!:\n{error}")
  return texts

# Print intermediate results of data processing
def interimprint(docum):
  """Print a numbered preview of every document.

  Each entry is cut to its first 100 elements (tokens for a list,
  characters for a string); numbering starts at 1.
  """
  number = 0
  for entry in docum:
    number += 1
    preview = entry[:100]
    print(f"Document {number}: {preview}")

# Process the input texts ( tokenization, punctuation and stop-word removal )
def processing(texts, show = True):
  """Turn the stored documents into cleaned token lists.

  Stages: tokenization with nltk, removal of punctuation characters,
  removal of English stop words. The nltk tokenizer and stop-word data must
  already be available -- presumably downloaded elsewhere; this function
  does not download them.

  Args:
    texts: List of {"filename": str, "content": str} records.
    show: When true, print a preview after every stage.

  Returns:
    One token list per document, in the order of texts, or None when texts
    is empty. A document that cannot be tokenized yields an empty list so
    that the result stays aligned with texts.
  """
  if not texts:
    print("Documents not found!")
    return None

  # Tokenization
  all_tokenizedwords = []
  for item in texts:
    allwords = []
    try:
      for sentence in nltk.sent_tokenize(item["content"]):
        allwords.extend(nltk.word_tokenize(sentence))
    except Exception as error:
      # Keep going: one failing document must not discard the remaining ones
      print(f"Error with tokenization!:\n{error}")
      allwords = []
    all_tokenizedwords.append(allwords)
  if show:
    print("\nProcessing stages:\n1. Tokenization:\n")
    interimprint(all_tokenizedwords)

  # Punctuation removal
  table = str.maketrans("", "", string.punctuation + "’‘“–”")
  withoutpunctuation = []
  for words in all_tokenizedwords:
    stripped = [word.translate(table) for word in words]
    # Tokens that consisted only of punctuation are now empty and are dropped
    withoutpunctuation.append([word for word in stripped if word])
  if show:
    print("\n2. Punctuation removal:\n")
    interimprint(withoutpunctuation)

  # Stop-word removal
  stop_words = set(stopwords.words("english"))
  withoutstop = [
      [word for word in document if word.lower() not in stop_words] for document in withoutpunctuation
  ]
  if show:
    print("\n3. Stop words removal:\n")
    interimprint(withoutstop)
  return withoutstop

# Results of the most recent topicmodeling() run; all three stay None until
# a run finishes. lastldamodel is read by visualization() and save().
lastldamodel = lastcorpus = lastdictionary = None

# Topic modeling ( automatic topic detection )
def topicmodeling(texts):
  """Interactively train an LDA model on the stored documents and print its topics.

  Asks whether to show the intermediate processing stages and how many
  topics to detect, trains the model and stores the model, corpus and
  dictionary in the module-level lastldamodel / lastcorpus / lastdictionary.
  The globals are only updated after a successful run.

  Args:
    texts: List of {"filename": str, "content": str} records.

  Returns:
    None. Problems are reported with a printed message.
  """
  global lastldamodel, lastcorpus, lastdictionary

  if not texts:
    print("Documents not found or not exists!")
    return None

  print("\nDo you want to see interim result?\n*There will be such steps: tokenization, punctuation removal, stop words removal, final topics\n\nyes - ( all of this )\nno - ( only final topics )")
  while True:
    choice = input("").strip().lower()
    if choice in ["yes", "no"]:
      break
    print("Enter yes or no, please!")

  show = (choice == "yes")

  readytokens = processing(texts, show)
  # Nothing to model when processing failed or every document ended up empty
  if not readytokens or not any(readytokens):
    print("No words left after processing, topic modeling is not possible!")
    return None
  print("\nTopic Modeling:\n")

  # At least 1, so that a single document does not get "0" as recommendation
  recommended = max(1, len(texts) // 2)
  print(f"Total number of documents: {len(texts)}\n *Recommended maximum number of topics: {recommended}")
  while True:
    try:
      num_topics = int(input("\nEnter number of topics:\n"))
    except ValueError:
      print("Enter only numbers!")
      continue
    if num_topics < 1:
      print("Enter minimum 1 topic and positive number!")
      continue
    break

  try:
    dictionary = corpora.Dictionary(readytokens)
    corpus = [dictionary.doc2bow(document) for document in readytokens]
  except Exception as error:
    print(f"Error in the early stages of topic modeling!\n{error}")
    return None

  try:
    ldamodel = LdaModel(corpus = corpus, id2word = dictionary, num_topics = num_topics, random_state = 42, passes = 10)
  except Exception as error:
    print(f"Error while training the topic model!\n{error}")
    return None

  print("\nResults ( probabilities ):\n*Remark: the first words have the highest probability of being topics")
  # num_topics = -1 lists every topic; the default would stop after 20
  for topic_id, topic in ldamodel.print_topics(num_topics = -1):
    print(f"Topic {topic_id+1}: {topic}")

  lastldamodel = ldamodel
  lastcorpus = corpus
  lastdictionary = dictionary
  return None

# Visualization of the results ( word probabilities )
def visualization():
  """Plot the top words of one detected topic as a horizontal bar chart.

  Uses the model stored in the module-level lastldamodel. The user is
  prompted for a topic number and for the number of top words; both prompts
  repeat until valid positive integers are entered. A failure while drawing
  is reported once and the function returns instead of prompting again.
  """
  if lastldamodel is None:
    print("Topics not found! First of all, enter 3 in the menu for automatic topics detection")
    return
  print("\nHorizontal bar chart")

  total = lastldamodel.num_topics
  while True:
    try:
      tid = int(input(f"\nTotal number of identified topics: {total}\nEnter topic number to plot:\n"))
      if tid < 1 or tid > total:
        print(f"The topic number can be in the range (1 - {total})")
        continue
      numwords = int(input("\nEnter number of top words for show:\n"))
    except ValueError:
      print("Enter only positive numbers!")
      continue
    if numwords < 1:
      # Rejected explicitly instead of relying on a later unpacking error
      print("Enter only positive numbers!")
      continue
    break

  try:
    topic = lastldamodel.show_topic(tid-1, topn = numwords)
    words, probabilities = zip(*topic)

    plt.figure(figsize = (11, 5))
    plt.barh(words, probabilities, label = "Probability of topic", color = "#2673E7")

    # Write the numeric value at the end of every bar
    for i, probability in enumerate(probabilities):
      plt.text(probability, i, f"{probability:.3f}")
    plt.title(f"Topic {tid}", fontsize = 13, color = "#7A26E7")
    plt.xlabel("Probabilities", fontsize = 10, color = "#18829A")
    plt.ylabel("Words", fontsize = 10, color = "#18829A")

    # Most probable word at the top
    plt.gca().invert_yaxis()
    plt.legend()
    plt.show()
  except Exception as error:
    print(f"Error with visualization!\n{error}")

# Save the results to an external .txt file
def save():
  """Write the top 10 words of every detected topic to a text file.

  Uses the model stored in the module-level lastldamodel. The user is asked
  for a file name; an empty answer selects "results". The ".txt" extension
  is appended only when the name does not already end with it.
  """
  if lastldamodel is None:
    print("Topics not found! First of all, enter 3 in the menu for automatic topics detection")
    return
  filename = input("\nEnter new file name: ( or press \"Enter\" to create name: results )\n").strip()
  if not filename:
    filename = "results"

  # Avoid "name.txt.txt" when the extension was typed by the user
  if not filename.lower().endswith(".txt"):
    filename += ".txt"

  try:
    with open(filename, "w", encoding = "utf-8") as file:
      file.write("Topic Modeling results:\n")
      for topic_id in range(lastldamodel.num_topics):
        topic = lastldamodel.show_topic(topic_id, topn = 10)

        file.write(f"\nTopic {topic_id+1}:")
        for word, probability in topic:
          file.write(f"\n{word}:{probability:.4f}")
        file.write("\n")
  except Exception as error:
    # Report the cause; interrupts are no longer swallowed
    print(f"Error with file {filename} in saving!\n{error}")
    return

  print(f"File {filename} with results was created!\nAttention! Before closing the program, load the finished file from the virtual disk!\n*On the left toolbar, select the file icon")

# Initial upload of the user's documents.
# The banner is printed first, then `texts` -- the list of
# {"filename", "content"} records used by the menu below -- is created empty
# and handed to add(), which appends the uploaded documents to it in place.
print("Topic Modeling in Google Colab\n\nDownload text files to start:\n\n*If you want to first view the options menu and then upload files click \"Cancel Upload\"\n")
texts = []
add(texts)

# View the documents uploaded to the program
def viewing(texts):
  """Print every stored file name with the first 100 characters of its text.

  Args:
    texts: List of {"filename": str, "content": str} records. When it is
      empty, a hint to upload files is printed instead.
  """
  print("\nAttention, the initial part of the document content is displayed!\n\nAll uploaded files:")
  if not texts:
    print("Files not yet uploaded! Please enter 2 to upload in the menu")
    return

  for record in texts:
    name, body = record["filename"], record["content"]
    print(f"{name}: {body[:100]}...")

# Functional menu
# Reads an option number, runs the matching action and repeats until the
# user chooses 7. NOTE: option 7 deletes every regular file in the current
# working directory before leaving the loop.
while True:
  try:
    option = int(input("\nEnter option number:\n1 - Viewing all uploaded files\n2 - Upload file / files to program\n3 - Removing file from program\n4 - Automatically detect topics in uploaded documents\n5 - Visualization of word probability results\n6 - Download results to a new .txt file\n7 - Exit\n"))
  except ValueError:
    print("Enter only positive number please!")
    continue
  except (EOFError, KeyboardInterrupt):
    # No input can be read or the run was interrupted: stop instead of
    # prompting again forever
    print("Exit!")
    break

  if option < 1 or option > 7:
    print("Enter only positive number from the menu please !")
    continue

  if option == 7:
    try:
      for file in os.listdir("."):
        if os.path.isfile(file) and file != "sample_data":
          os.remove(file)
      print("\nAutomatic virtual disk cleanup before completion was successful, thank you for your trust!")
    except OSError:
      print("Error in automatic cleaning!")
    print("Exit!")
    break

  try:
    if option == 1:
      viewing(texts)
    elif option == 2:
      add(texts)
    elif option == 3:
      remove(texts)
    elif option == 4:
      topicmodeling(texts)
    elif option == 5:
      visualization()
    elif option == 6:
      save()
  except Exception as error:
    # A failure inside an action is not an input mistake: report it as such
    print(f"Unexpected error:\n{error}")