In [9]:
import ast
import codecs
import json
import os
import re
from json import JSONDecodeError
from sys import stderr
from collections import Counter

# For every corpus file: the names of the fields that hold raw text.
TEXT_COLUMNS = {
    "all_reviews_texts.txt": [
        "description",
    ],
    "comments.json": [
        "content",
        "pos_content",
        "neg_content",
    ],
    "consumers_drugs_reviews.json": [
        "comment",
    ],
    "doctors_drugs_reviews.json": [
        "comment",
        "comment_plus",
        "comment_minus",
    ],
    "spr-ru.txt": [
        "text",
    ],
}

# Files that are parsed as one whole JSON document; every other file
# listed in TEXT_COLUMNS is parsed line by line.
_WHOLE_JSON_FILES = frozenset((
    "comments.json",
    "consumers_drugs_reviews.json",
    "doctors_drugs_reviews.json",
))

# Derived from TEXT_COLUMNS so that both mappings always share the same keys.
PROCESS_AS_JSON_DOC = {name: name in _WHOLE_JSON_FILES for name in TEXT_COLUMNS}

# Preprocessing
To obtain the files listed in `PROCESS_AS_JSON_DOC`, download the [raw part of the RuDReC corpus](https://yadi.sk/d/kCsAhkoLZUuTrQ) (see the README file).

The preprocessing is adapted from:

https://github.com/akutuzov/webvectors/blob/master/preprocessing/modular_processing/unify.py

We unify letters to decrease the size of the dictionary. We also unify punctuation marks and then remove all of them.

In [5]:
def list_replace(search, replacement, text):
    """
    Substitutes `replacement` for every symbol of `search`
    that occurs in `text` and returns the resulting string.

    Which symbols get replaced is decided once, against the text as it
    was passed in; the substitutions are then applied one after another
    in the order the symbols appear in `search`.
    """
    # Snapshot of the symbols to handle, taken before any substitution.
    present = []
    for symbol in search:
        if symbol in text:
            present.append(symbol)

    result = text
    for symbol in present:
        result = result.replace(symbol, replacement)
    return result

# Ordered (search symbols, replacement) steps, adapted from webvectors' unify.py.
# Order matters: whatever an earlier step produces is fed to the later steps
# (e.g. typographic quotes become '"', which the punctuation step turns into ' ').
_CLEAN_STEPS = (
    # Typographic quotes -> ASCII double quote
    ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022'),
    # Long dashes and overlines -> em space, two hyphens, em space
    ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003'),
    # Hyphen variants -> ASCII hyphen
    ('\u2010\u2011', '\u002D'),
    # Unicode space variants -> en space.
    # NOTE(review): the en space is not an allowed symbol, so these spaces end up
    # deleted (joining the neighbouring words) rather than turned into ' '.
    ('\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
     '\u2002'),
    # Bullets, middle dots and multiplication signs -> '.'
    ('\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
     '.'),
    ('\u2217', '\u002A'),
    ('…', '...'),
    # Letters with diaeresis and sharp s -> plain Latin letters
    ('\u00C4', 'A'),
    ('\u00E4', 'a'),
    ('\u00CB', 'E'),
    ('\u00EB', 'e'),
    ('\u1E26', 'H'),
    ('\u1E27', 'h'),
    ('\u00CF', 'I'),
    ('\u00EF', 'i'),
    ('\u00D6', 'O'),
    ('\u00F6', 'o'),
    ('\u00DC', 'U'),
    ('\u00FC', 'u'),
    ('\u0178', 'Y'),
    ('\u00FF', 'y'),
    ('\u00DF', 's'),
    ('\u1E9E', 'S'),
    # Removing punctuation (the backslash is written as a proper '\\' escape)
    (',.[]{}()=+-−*&^%$#@!~;:§/\\|?"\n', ' '),
    # Replacing all numbers with masks
    ('0123456789', 'x'),
)


def _compose_clean_table(steps):
    """Folds the ordered replacement steps into a single `str.translate` table."""
    table = {}
    for search, replacement in steps:
        # Feed everything the earlier steps produced through this step as well.
        table = {
            code: ''.join(replacement if sym in search else sym for sym in produced)
            for code, produced in table.items()
        }
        # A symbol already consumed by an earlier step never reaches this one.
        for sym in search:
            table.setdefault(ord(sym), replacement)
    return table


# Built once at definition time instead of on every clean_text call.
_CLEAN_TABLE = _compose_clean_table(_CLEAN_STEPS)

_CURRENCIES = '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
_ALPHABET = '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
# NOTE(review): '$' is listed among the currencies, yet it can never survive,
# because the punctuation step has already turned it into a space.
_ALLOWED_SYMBOLS = frozenset(_CURRENCIES + _ALPHABET)


def clean_text(text):
    """
    Unifies quotes, dashes, spaces, bullets and accented letters, replaces
    punctuation with spaces and digits with 'x', and finally drops every
    symbol that is neither a Russian/Latin letter, a space, a tab,
    a carriage return nor a currency sign.

    :param text: the raw document text (str)
    :return: the cleaned text (str); letter case is preserved
    """
    # Doubled tabs are collapsed pairwise, left to right (three tabs become two).
    # Collapsing doubled em spaces is not needed here: em spaces are not
    # allowed symbols, so all of them are dropped by the final filter anyway.
    text = re.sub('\t\t', '\t', text)

    # One pass applies all unification steps at once.
    text = text.translate(_CLEAN_TABLE)

    return ''.join(sym for sym in text if sym in _ALLOWED_SYMBOLS)



In [20]:
def _write_document_fields(doc, filename, output_file, counter, min_document_length):
    """
    Cleans every text field of one parsed document, adds the resulting
    tokens to counter and writes each text that has at least
    min_document_length tokens to output_file, one text per line.
    """
    for text_field in TEXT_COLUMNS[filename]:
        document_text = doc[text_field]
        if document_text is None:
            continue
        preprocessed_text = clean_text(document_text).lower().strip()
        preprocessed_tokens = preprocessed_text.split()
        # Tokens are counted even when the text itself is too short to be kept.
        counter.update(preprocessed_tokens)
        if len(preprocessed_tokens) >= min_document_length:
            output_file.write(f"{preprocessed_text}\n")


def process_fulldoc_as_json(input_file, filename, output_file, counter, min_document_length):
    """
    The function reads the whole input_file into memory and
    parses it as one huge JSON list of documents.

    :param input_file: opened text file holding a JSON list of documents
    :param filename: key of TEXT_COLUMNS that names the text fields to extract
    :param output_file: opened text file the preprocessed texts are written to
    :param counter: collections.Counter updated with token frequencies
    :param min_document_length: minimal number of tokens a text needs to be written
    """
    docs = json.load(input_file)
    for doc in docs:
        _write_document_fields(doc, filename, output_file, counter, min_document_length)


def process_jsondoc_linewise(input_file, filename, output_file, counter, min_document_length):
    """
    This function reads input_file line-wise and processes
    one document at a time. Lines that cannot be parsed are skipped;
    their number is reported to stderr once the file has been read.

    :param input_file: opened text file with one document per line
    :param filename: key of TEXT_COLUMNS that names the text fields to extract
    :param output_file: opened text file the preprocessed texts are written to
    :param counter: collections.Counter updated with token frequencies
    :param min_document_length: minimal number of tokens a text needs to be written
    """
    # spr-ru.txt is parsed with ast.literal_eval, every other file with json.loads.
    parse_line = ast.literal_eval if filename == "spr-ru.txt" else json.loads
    skipped = 0
    for line in input_file:
        # Drop the line terminator (including '\r' of CRLF files), surrounding
        # blanks and the comma that may separate consecutive documents.
        line = line.strip(' \t\r\n,')
        if not line:
            continue
        try:
            doc = parse_line(line)
        except (ValueError, SyntaxError):
            # json.JSONDecodeError is a ValueError; ast.literal_eval raises
            # ValueError or SyntaxError on malformed input.
            skipped += 1
            continue
        _write_document_fields(doc, filename, output_file, counter, min_document_length)
    if skipped:
        print(f"{filename}: skipped {skipped} line(s) that could not be parsed", file=stderr)

Documents shorter than `MIN_DOCUMENT_LENGTH` tokens are discarded from the preprocessed corpus. Tokens that occur fewer than `MIN_TOKEN_FREQUENCY` times in the corpus are removed as well. Each line of the file located at `output_path` corresponds to exactly one document. During preprocessing, a temporary file whose name is `output_path` with the `temp_` prefix is created. It is removed automatically at the end of preprocessing; do not remove it manually before the preprocessing has finished. The preprocessing is expected to take a few tens of minutes.

In [26]:
# Thresholds
MIN_DOCUMENT_LENGTH = 3  # texts with fewer tokens are not written to the corpus
MIN_TOKEN_FREQUENCY = 5  # tokens seen fewer times are filtered out of the corpus

# Paths
corpus_directory = "RuDReC/"  # directory holding the raw corpus files
output_path = "preprocessed_corpus.txt"  # the preprocessed corpus is written here

# Token frequencies accumulated over all corpus files
counter = Counter()

In [28]:
temp_path = f"temp_{output_path}"

# Start from empty counts so that re-running this cell does not double them.
counter.clear()

try:
    # Pass 1: clean every corpus file, count tokens, store the texts in a temp file.
    # newline="\n" writes line terminators unchanged on every platform.
    with open(temp_path, "w", encoding="utf-8", newline="\n") as output_file:
        # Sorted, because os.listdir returns entries in arbitrary order and the
        # order of the files determines the order of the corpus lines.
        for filename in sorted(os.listdir(corpus_directory)):
            if filename not in TEXT_COLUMNS:
                print(f"Skipping {filename}: it is not listed in TEXT_COLUMNS")
                continue
            print(f"Preprocessing {filename}")
            file_path = os.path.join(corpus_directory, filename)
            if PROCESS_AS_JSON_DOC[filename]:
                process_file = process_fulldoc_as_json
            else:
                process_file = process_jsondoc_linewise
            # The built-in open only treats '\n', '\r' and '\r\n' as line ends;
            # codecs readers also break lines on symbols such as U+2028 or U+0085,
            # which would cut a line-wise stored document in two.
            with open(file_path, "r", encoding="utf-8") as inp:
                process_file(input_file=inp, filename=filename, output_file=output_file,
                             min_document_length=MIN_DOCUMENT_LENGTH, counter=counter)
            print(f"Preprocessed {filename}")

    # Pass 2: drop the tokens that are rare in the whole corpus.
    print("Filtering rare tokens")
    # newline="\n": a cleaned text may still contain '\r'; with the default
    # universal-newlines mode such a text would be split into several lines.
    with open(temp_path, "r", encoding="utf-8", newline="\n") as inp_file, \
            open(output_path, "w", encoding="utf-8") as output_file:
        for line in inp_file:
            frequent_tokens = [token for token in line.split()
                               if counter[token] >= MIN_TOKEN_FREQUENCY]
            # NOTE(review): a line whose tokens are all rare is written as an empty line.
            new_line = " ".join(frequent_tokens)
            output_file.write(f"{new_line}\n")
    print("Finished filtering rare tokens")
finally:
    # Remove the temp file even when preprocessing fails half-way.
    if os.path.exists(temp_path):
        os.remove(temp_path)

Preprocessing all_reviews_texts.txt
Preprocessed all_reviews_texts.txt
Preprocessing comments.json
Preprocessed comments.json
Preprocessing consumers_drugs_reviews.json
Preprocessed consumers_drugs_reviews.json
Preprocessing doctors_drugs_reviews.json
Preprocessed doctors_drugs_reviews.json
Preprocessing spr-ru.txt
Preprocessed spr-ru.txt
Filtering rare tokens
Finished filtering rare tokens


# Training Fasttext embeddings

In [37]:
# %pip install fasttext

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [39]:
import fasttext

The full list of hyperparameters is available in the official documentation:

https://fasttext.cc/docs/en/python-module.html#usage-overview

Configuring training hyperparameters:

In [40]:
# Input and output
data_path = output_path  # the model is trained on the file located at output_path
model_path = "fasttext_model.bin"  # The trained model will be saved to this path

# Model
model_type = "skipgram"  # skipgram or cbow
dim = 200  # Embedding size

# Training
epoch = 10
thread = 6  # Number of parallel workers

Training a Fasttext model:

In [None]:
# Train unsupervised embeddings on the file at data_path
# with the hyperparameters configured in the variables used below.
model = fasttext.train_unsupervised(
    data_path,
    model=model_type,
    dim=dim,
    epoch=epoch,
    thread=thread,
)

# Store the trained model at model_path.
model.save_model(model_path)