# Setup

In [None]:
import multiprocessing, os, ast, nltk, pickle

# Fetch NLTK's "punkt" resource, which backs the nltk.sent_tokenize /
# nltk.word_tokenize calls made in the tokenization cells below.
nltk.download("punkt")

In [None]:
# NOTE(review): most of the assignments below have no value in this copy of
# the notebook; they must be filled in before the notebook can run.

# Directory containing INPUT_FILE (the word-embedding corpus).
INPUT_FILE_PATH = 
# Directory where the tokenized corpus and the pickled Doc2Vec documents are written and re-read.
TOKENIZED_PATH = 

# File name of the corpus inside INPUT_FILE_PATH; ".txt" is removed from it to build output names.
INPUT_FILE = 
# Directory whose entries are read as Doc2Vec input texts.
DOC2VEC_INPUT_FILES_PATH = 
# Output directories for the saved Doc2Vec, Word2Vec and FastText models.
D2V_MODEL_OUTPUT_PATH =
W2V_MODEL_OUTPUT_PATH = 
FT_MODEL_OUTPUT_PATH =

# Prefix of the saved FastText / Word2Vec model file names.
MODEL_PREFIX = 
# Prefix of the saved Doc2Vec model and of the pickled tagged-document files.
DOC2VEC_PREFIX = 
# Embedding dimensionality handed to the gensim models.
SIZE = 300
# Epoch count used for training and embedded in the saved model file names.
EPOCHS = 50
# Context window size handed to the gensim models.
WINDOW = 30
# Run identifier; only used as part of the saved model file names.
IDENTIFIER_NO = 1
# Minimum word frequency (gensim `min_count`) handed to the models.
MIN_COUNT = 10

# Language passed to the NLTK sentence and word tokenizers.
LANGUAGE = 'english'

# Word embedding

### Tokenize

In [None]:
# Read the corpus, split it into sentences, drop fragments, and word-tokenize
# every remaining sentence. The result is written to disk with one list per line.
with open(f"{INPUT_FILE_PATH}/{INPUT_FILE}", encoding="UTF-8") as file:
    sentences = nltk.sent_tokenize(file.read(), language=LANGUAGE)

tokenized_text = []
for sentence in sentences:
    # Drop "sentences" that contain no space (single tokens).
    # The former test `" " in sentence == False` was a chained comparison,
    # i.e. `(" " in sentence) and (sentence == False)`, which is never true,
    # so this filter silently never fired.
    if " " not in sentence:
        continue
    # Drop very short sentences (20 characters or fewer).
    if len(sentence) <= 20:
        continue
    tokenized_text.append(nltk.word_tokenize(sentence, language=LANGUAGE))

print(f"Created {len(tokenized_text)} tokens.")
print("Preview:")
# Show the second tokenized sentence as before, but do not raise IndexError
# when fewer than two sentences survived the filters.
if len(tokenized_text) > 1:
    print(tokenized_text[1])

# Persist one tokenized sentence per line as the textual form of a Python
# list, which the loading cells parse back with ast.literal_eval.
file_name = INPUT_FILE.replace(".txt", "")
with open(f"{TOKENIZED_PATH}/_{file_name}_tokenized.txt", "w") as outfile:
    for entry in tokenized_text:
        outfile.write(str(entry) + "\n")

## FastText

### Load tokens and build vocabulary

In [None]:
from gensim.models import FastText

# Locate the tokenized corpus written by the tokenization cell.
file_name = INPUT_FILE.replace(".txt", "")
tokenized_file = f"{TOKENIZED_PATH}/_{file_name}_tokenized.txt"

# Every line is parsed back into a Python object with ast.literal_eval.
with open(tokenized_file, "r") as token_lines:
    loaded_tokenized_text = [ast.literal_eval(row) for row in token_lines]

print(f"Loaded from file: {loaded_tokenized_text[:2]} ...")

# Create the (still untrained) FastText model and build its vocabulary
# from the loaded sentences.
model = FastText(size=SIZE, window=WINDOW, min_count=MIN_COUNT)
model.build_vocab(sentences=loaded_tokenized_text)

### Train

In [None]:
# Train the FastText model on the loaded corpus for EPOCHS epochs.
corpus_size = len(loaded_tokenized_text)
model.train(sentences=loaded_tokenized_text, total_examples=corpus_size, epochs=EPOCHS)

# Save the full model, then its word vectors in word2vec binary and text formats.
file_name = f"{MODEL_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-FT.model"
model.save(f"{FT_MODEL_OUTPUT_PATH}/{file_name}")

word_vectors = model.wv
word_vectors.save_word2vec_format(f"{FT_MODEL_OUTPUT_PATH}/{file_name}-bin.kv", binary=True)
word_vectors.save_word2vec_format(f"{FT_MODEL_OUTPUT_PATH}/{file_name}-txt.kv", binary=False)

## Word2Vec

### Load tokens

In [None]:
# Re-read the tokenized corpus; each line is parsed with ast.literal_eval.
file_name = INPUT_FILE.replace(".txt", "")
tokenized_file = f"{TOKENIZED_PATH}/_{file_name}_tokenized.txt"

with open(tokenized_file, "r") as token_lines:
    loaded_tokenized_text = list(map(ast.literal_eval, token_lines))

print(f"Loaded from file: {loaded_tokenized_text[:2]} ...")

### Train

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the loaded corpus (passing the corpus to the constructor
# builds the vocabulary and trains in one step).
# EPOCHS is now actually handed to the model: previously it was omitted, so the
# model trained with the library default while the saved file name still
# claimed `iter{EPOCHS}`.
# NOTE(review): `iter` is the gensim 3.x keyword and matches the `size=` keyword
# already used here; gensim 4 renamed them to `epochs` / `vector_size`.
model = Word2Vec(
    loaded_tokenized_text,
    size=SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=multiprocessing.cpu_count(),
    iter=EPOCHS,
)

# Save the full model, then its word vectors in word2vec binary and text formats.
file_name = f"{MODEL_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-W2V.model"
model.save(f"{W2V_MODEL_OUTPUT_PATH}/{file_name}")
model.wv.save_word2vec_format(
    f"{W2V_MODEL_OUTPUT_PATH}/{file_name}-bin.kv", binary=True
)
model.wv.save_word2vec_format(
    f"{W2V_MODEL_OUTPUT_PATH}/{file_name}-txt.kv", binary=False
)

# Doc2Vec

## Sentences

### Preparation

In [None]:
from gensim.models.doc2vec import TaggedDocument

# Build one TaggedDocument per kept sentence of every input file. All
# sentences of a file share the same tag: the first character of the file
# name, converted to int.
documents = []
# os.listdir returns entries in arbitrary order; sort for a reproducible corpus.
for entry_name in sorted(os.listdir(DOC2VEC_INPUT_FILES_PATH)):
    if entry_name == ".DS_Store":
        continue

    # Take the tag straight from the directory entry name. The former
    # `file.name.replace(DOC2VEC_INPUT_FILES_PATH, "")[1]` only worked when the
    # path text occurred exactly once and left exactly one leading separator.
    document_tag = int(entry_name[0])

    with open(f"{DOC2VEC_INPUT_FILES_PATH}/{entry_name}", "r", encoding="UTF-8") as file:
        sentences = nltk.sent_tokenize(file.read(), language=LANGUAGE)

    for sentence in sentences:
        # Drop "sentences" without any space. The former test
        # `" " in sentence == False` was a chained comparison that is never
        # true, so this filter silently never fired.
        if " " not in sentence:
            continue
        # Drop very short sentences (20 characters or fewer).
        if len(sentence) <= 20:
            continue
        # Remove a single leading dash.
        if sentence[0] == "-":
            sentence = sentence[1:]

        tagged_document = TaggedDocument(
            nltk.word_tokenize(sentence, language=LANGUAGE), [document_tag]
        )
        documents.append(tagged_document)

print(f"Found {len(documents)} Sentences.")

# Pickle the tagged sentences so they can be reloaded without re-tokenizing.
with open(
    f"{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_sentences_tagged", "wb"
) as outfile:
    pickle.dump(documents, outfile)

### Load

In [None]:
# Read back the pickled sentence-level tagged documents.
sentences_pickle = f"{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_sentences_tagged"

loaded_documents = []
with open(sentences_pickle, "rb") as pickled:
    loaded_documents = pickle.load(pickled)

print(f"Loaded from file: {loaded_documents[:3]} ...")

## Documents

### Preparation

In [None]:
from gensim.models.doc2vec import TaggedDocument

# Build one TaggedDocument per input file: the whole text is word-tokenized
# and tagged with the first character of the file name, converted to int.
documents = []
# os.listdir returns entries in arbitrary order; sort for a reproducible corpus.
for entry_name in sorted(os.listdir(DOC2VEC_INPUT_FILES_PATH)):
    if entry_name == ".DS_Store":
        continue

    with open(f"{DOC2VEC_INPUT_FILES_PATH}/{entry_name}", "r", encoding="UTF-8") as file:
        text = file.read()

    # Take the tag straight from the directory entry name. The former
    # `file.name.replace(DOC2VEC_INPUT_FILES_PATH, "")[1]` only worked when the
    # path text occurred exactly once and left exactly one leading separator.
    tagged_document = TaggedDocument(
        nltk.word_tokenize(text, language=LANGUAGE), [int(entry_name[0])]
    )
    documents.append(tagged_document)

print(f"Found {len(documents)} texts.")

# Pickle the tagged documents so they can be reloaded without re-tokenizing.
with open(f"{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_doc_tagged", "wb") as outfile:
    pickle.dump(documents, outfile)

### Load

In [None]:
# Read back the pickled document-level tagged documents.
documents_pickle = f"{TOKENIZED_PATH}/_{DOC2VEC_PREFIX}_doc2vec_doc_tagged"

loaded_documents = []
with open(documents_pickle, "rb") as pickled:
    loaded_documents = pickle.load(pickled)

## Train

In [None]:
from gensim.models.doc2vec import Doc2Vec

# Train Doc2Vec on whichever `loaded_documents` was loaded last (sentence-level
# or document-level). dm=0 selects the DBOW variant and dbow_words=1 trains
# word vectors alongside the document vectors.
# EPOCHS is now actually handed to the model: previously it was omitted, so the
# model trained with the library default while the saved file name still
# claimed `iter{EPOCHS}`.
model = Doc2Vec(
    loaded_documents,
    vector_size=SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=multiprocessing.cpu_count(),
    epochs=EPOCHS,
    dm=0,
    dbow_words=1,
)

# Save the full model, then the document vectors in word2vec binary and text formats.
file_name = (
    f"{DOC2VEC_PREFIX}_{SIZE}_iter{EPOCHS}_win{WINDOW}_{IDENTIFIER_NO}-D2V.model"
)
model.save(f"{D2V_MODEL_OUTPUT_PATH}/{file_name}")
model.docvecs.save_word2vec_format(
    f"{D2V_MODEL_OUTPUT_PATH}/{file_name}-bin.kv", binary=True
)
model.docvecs.save_word2vec_format(
    f"{D2V_MODEL_OUTPUT_PATH}/{file_name}-txt.kv", binary=False
)