# Setup

In [0]:
import nltk
import ast
# Download NLTK's 'punkt' resource before the nltk.sent_tokenize / nltk.word_tokenize calls below.
nltk.download('punkt')

# Directory the raw input texts are read from, and directory the tokenized /
# pickled intermediates are written to.
# NOTE(review): these look like Colab Google Drive paths; the mount step is not shown here.
INPUT_FILE_PATH = '/content/drive/My Drive/RUAK/input/processed'
TOKENIZED_PATH = '/content/drive/My Drive/RUAK/output/tokenized'

# Single input file used by the word-embedding cells (tokenize, FastText, Word2Vec).
INPUT_FILE = 'pg_kant.txt'
# Input files used by the Doc2Vec cells; each file's list index becomes its document tag.
DOC2VEC_INPUT_FILES = ['pg_kant.txt']

# Prefix of the saved FastText / Word2Vec model file names.
MODEL_PREFIX = 'xxx'
# Prefix of the Doc2Vec pickle files and of the saved Doc2Vec model file name.
DOC2VEC_PREFIX = 'kant_platon'
# Vector dimensionality passed to the models (`size=` / `vector_size=`).
SIZE = 300
# Epoch count passed to FastText's train() and embedded in every model file name.
EPOCHS = 30
# Context window passed to the models (`window=`).
WINDOW = 30
# Number embedded in the output model file names only; not passed to any model.
IDENTIFIER_NO = 14
# Minimum word frequency passed to the models (`min_count=`).
MIN_COUNT = 10

# Word embedding

### Tokenize

In [0]:
# Split the input text into German sentences, then tokenize every kept sentence into words.
with open(f'{INPUT_FILE_PATH}/{INPUT_FILE}', encoding='UTF-8') as file:
  sentences = nltk.sent_tokenize(file.read(), language='german')

tokenized_text = []
for sentence in sentences:
  # Skip fragments that contain no space (single words). The previous test
  # `' ' in sentence == False` was a chained comparison, evaluated as
  # `(' ' in sentence) and (sentence == False)`, which is never true, so this
  # filter silently never fired.
  if ' ' not in sentence:
    continue
  # Skip very short sentences (20 characters or fewer).
  if len(sentence) <= 20:
    continue
  tokenized_text.append(nltk.word_tokenize(sentence, language='german'))

# Each entry of tokenized_text is one sentence as a list of word tokens.
print(f'Created {len(tokenized_text)} tokenized sentences.')
print('Preview:')
# Guard the preview so an input with fewer than two kept sentences does not raise IndexError.
if len(tokenized_text) > 1:
  print(tokenized_text[1])

# Persist one sentence per line as the repr of its token list; the loading
# cells parse each line back with ast.literal_eval.
file_name = INPUT_FILE.replace('.txt', '')
with open(f'{TOKENIZED_PATH}/_{file_name}_tokenized.txt', 'w') as outfile:
  for entry in tokenized_text:
    outfile.write(str(entry) + '\n')

## FastText

### Load tokens and build vocabulary

In [0]:
from gensim.models import FastText

# Stem of the token file, derived from the configured input file name.
file_name = INPUT_FILE.replace('.txt', '')

# Every line of the token file is the repr of one token list; parse each line
# back into a Python list.
loaded_tokenized_text = []
with open(f'{TOKENIZED_PATH}/_{file_name}_tokenized.txt', 'r') as infile:
  loaded_tokenized_text.extend(ast.literal_eval(raw_line) for raw_line in infile)

print(f'Loaded from file: {loaded_tokenized_text[:2]} ...')

# Create an untrained FastText model and build its vocabulary from the loaded sentences.
model = FastText(
  size=SIZE,
  window=WINDOW,
  min_count=MIN_COUNT,
)
model.build_vocab(sentences=loaded_tokenized_text)

### Train

In [0]:
# Train the FastText model on the loaded sentences for the configured number of epochs.
model.train(
  sentences=loaded_tokenized_text,
  total_examples=len(loaded_tokenized_text),
  epochs=EPOCHS,
)

# Save the full model, then export its word vectors in word2vec format
# (once binary, once plain text) next to it.
ft_model_output_path = '/content/drive/My Drive/RUAK/output/embedding/ft'
file_name = f'{MODEL_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-FT.model'
ft_model_file = f'{ft_model_output_path}/{file_name}'
model.save(ft_model_file)
model.wv.save_word2vec_format(f'{ft_model_file}-bin.kv', binary=True)
model.wv.save_word2vec_format(f'{ft_model_file}-txt.kv', binary=False)

## Word2Vec

### Load tokens

In [0]:
# Stem of the token file, derived from the configured input file name.
file_name = INPUT_FILE.replace('.txt', '')

# Every line of the token file is the repr of one token list; parse each line
# back into a Python list.
loaded_tokenized_text = []
with open(f'{TOKENIZED_PATH}/_{file_name}_tokenized.txt', 'r') as infile:
  loaded_tokenized_text.extend(ast.literal_eval(raw_line) for raw_line in infile)

print(f'Loaded from file: {loaded_tokenized_text[:2]} ...')

### Train

In [0]:
from gensim.models import Word2Vec
import multiprocessing

# Train Word2Vec on the tokenized sentences.
# `iter=EPOCHS` is passed explicitly: previously the epoch count was omitted, so
# the model trained for the library's default number of epochs while the saved
# file name below claimed `iter{EPOCHS}`. (`iter` is the epoch argument of the
# same gensim API generation that accepts `size=`.)
model = Word2Vec(
  loaded_tokenized_text,
  size=SIZE,
  window=WINDOW,
  min_count=MIN_COUNT,
  iter=EPOCHS,
  workers=multiprocessing.cpu_count(),
)

# Save the full model, then export its word vectors in word2vec format
# (once binary, once plain text) next to it.
w2v_model_output_path = '/content/drive/My Drive/RUAK/output/embedding/w2v'
file_name = f'{MODEL_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-W2V.model'
model.save(f'{w2v_model_output_path}/{file_name}')
model.wv.save_word2vec_format(f'{w2v_model_output_path}/{file_name}-bin.kv', binary=True)
model.wv.save_word2vec_format(f'{w2v_model_output_path}/{file_name}-txt.kv', binary=False)

# Doc2Vec

## Sentences

### Prepare sentences

In [0]:
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
import pickle

# Build one TaggedDocument per kept sentence; every sentence is tagged with the
# index of the file it came from in DOC2VEC_INPUT_FILES.
documents = []
for index, file_name in enumerate(DOC2VEC_INPUT_FILES):
  with open(f'{INPUT_FILE_PATH}/{file_name}', encoding='UTF-8') as file:
    sentences = nltk.sent_tokenize(file.read(), language='german')

  for sentence in sentences:
    # Skip fragments that contain no space (single words). The previous test
    # `' ' in sentence == False` was a chained comparison, evaluated as
    # `(' ' in sentence) and (sentence == False)`, which is never true, so this
    # filter silently never fired.
    if ' ' not in sentence:
      continue
    # Skip very short sentences (20 characters or fewer).
    if len(sentence) <= 20:
      continue
    # Drop a single leading dash before tokenizing.
    if sentence[0] == '-':
      sentence = sentence[1:]

    tagged_document = TaggedDocument(nltk.word_tokenize(sentence, language='german'), [index])
    documents.append(tagged_document)

print(f'Found {len(documents)} sentences in {len(DOC2VEC_INPUT_FILES)} files.')
print('Preview:')
print(documents[:len(DOC2VEC_INPUT_FILES)])

# Persist the tagged sentences so the matching Load cell can restore them.
with open(f'{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_sentence_tagged', 'wb') as outfile:
  pickle.dump(documents, outfile)

### Load

In [0]:
import pickle

# Restore the sentence-level tagged documents from their pickle file.
loaded_documents = []
sentence_pickle_path = f'{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_sentence_tagged'
with open(sentence_pickle_path, 'rb') as infile:
  loaded_documents = pickle.load(infile)

# Show as many entries as there are configured input files.
preview_count = len(DOC2VEC_INPUT_FILES)
print(f'Loaded from file: {loaded_documents[:preview_count]} ...')

## Documents

### Prepare documents

In [0]:
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
import pickle

# Build one TaggedDocument per input file: the word tokens of the whole file,
# tagged with the file's index in DOC2VEC_INPUT_FILES.
documents = []
for index, file_name in enumerate(DOC2VEC_INPUT_FILES):
  with open(f'{INPUT_FILE_PATH}/{file_name}', encoding='UTF-8') as file:
    document = file.read()
  document_tokens = nltk.word_tokenize(document, language='german')
  documents.append(TaggedDocument(document_tokens, [index]))

# Persist the tagged documents so the matching Load cell can restore them.
document_pickle_path = f'{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_doc_tagged'
with open(document_pickle_path, 'wb') as outfile:
  pickle.dump(documents, outfile)

### Load

In [0]:
import pickle

# Restore the document-level tagged documents from their pickle file.
loaded_documents = []
document_pickle_path = f'{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_doc_tagged'
with open(document_pickle_path, 'rb') as infile:
  loaded_documents = pickle.load(infile)

## Train

In [0]:
from gensim.models.doc2vec import Doc2Vec
import multiprocessing

# Train Doc2Vec on `loaded_documents`.
# NOTE: both the sentence-level and the document-level Load cells assign
# `loaded_documents`, so the training input is whichever of them ran last.
# `epochs=EPOCHS` is passed explicitly: previously the epoch count was omitted,
# so the model trained for the library's default number of epochs while the
# saved file name below claimed `iter{EPOCHS}`.
model = Doc2Vec(
  loaded_documents,
  vector_size=SIZE,
  window=WINDOW,
  min_count=MIN_COUNT,
  epochs=EPOCHS,
  workers=multiprocessing.cpu_count(),
)

# Save the full model, then export its document vectors in word2vec format
# (once binary, once plain text) next to it.
d2v_model_output_path = '/content/drive/My Drive/RUAK/output/embedding/d2v'
file_name = f'{DOC2VEC_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-D2V.model'
model.save(f'{d2v_model_output_path}/{file_name}')
model.docvecs.save_word2vec_format(f'{d2v_model_output_path}/{file_name}-bin.kv', binary=True)
model.docvecs.save_word2vec_format(f'{d2v_model_output_path}/{file_name}-txt.kv', binary=False)