In [None]:
# default_exp main

In [None]:
from nbdev.showdoc import *

# Main

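> These are the main functions of the library: tokenizing a corpus into keywords, training a fastText model on the result, building an Annoy index from the model, and querying that index for similar keywords.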

In [None]:
# export 
from keywords2vec.imports import *

from glob import glob
from functools import partial

import fasttext

from keywords2vec.utils import parallel, open_file, chunk_of_text, get_file_chunks
from keywords2vec.tokenizer import tokenize

In [None]:
#export

def tokenize_file(
    input_path, output_path="tokenized.txt", lang="en",
    sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
):
    """Tokenize every file matching `input_path` and write the result to `output_path`.

    Args:
        input_path: glob pattern; every matching file is processed in the
            order returned by `glob`.
        output_path: text file that receives the tokenized output. It is
            opened in write mode, so any previous content is replaced.
        lang: language code forwarded to `tokenize`.
        sample_size: forwarded to `get_file_chunks`; presumably a cap on the
            amount of text to process (-1 looks like "no cap" -- confirm in
            `keywords2vec.utils`).
        lines_chunks: forwarded to `get_file_chunks`; presumably the chunk
            size in lines -- confirm in `keywords2vec.utils`.
        n_cpus: forwarded to `parallel` as the worker count argument.
        keywords_w_stopwords: forwarded to `tokenize`.

    Returns:
        `output_path`, unchanged.
    """
    chunk_tokenizer = partial(
        tokenize,
        lang=lang,
        text_output=True,
        merge=True,
        keywords_w_stopwords=keywords_w_stopwords,
    )

    # Running counter threaded through `get_file_chunks` from one file to the next.
    running_index = 0

    with open(output_path, "wt") as out_file:
        for source_path in glob(input_path):
            print("processing file:", source_path)
            # The text is split in chunks so the work can be spread out and
            # some progress can be shown.
            running_index, chunks, stop_after_file = get_file_chunks(
                running_index, source_path, lines_chunks, sample_size
            )
            tokenized_chunks = parallel(chunk_tokenizer, chunks, n_cpus)
            joined = "\n".join(tokenized_chunks) + "\n"
            # Spaces inside the tokenizer output become "_" and "!" becomes a
            # space (presumably "!" is the keyword separator emitted by
            # `tokenize` -- confirm), yielding space-separated single tokens.
            out_file.write(joined.replace(" ", "_").replace("!", " "))
            if stop_after_file:
                break
    return output_path


def train_model(input_filename):
    """Train an unsupervised fastText model on the file at `input_filename`.

    The hyperparameters are fixed: skip-gram architecture, 100-dimensional
    vectors, a context window of 5, and `maxn=0` (maximum character n-gram
    length; per the fastText documentation this leaves subword n-grams out).

    Returns:
        Whatever `fasttext.train_unsupervised` returns (the trained model).
    """
    hyperparameters = dict(model='skipgram', maxn=0, dim=100, ws=5)
    return fasttext.train_unsupervised(input_filename, **hyperparameters)

def similars_tree_from_model(model, vector_size=100):
    """Build an Annoy index holding one vector per label of `model`.

    Args:
        model: object exposing `labels` (an indexable sequence of label
            strings) and `model[label]` returning that label's vector.
        vector_size: length of the vectors stored in the index. It must match
            the dimensionality of the vectors returned by `model[label]`
            (`train_model` in this module trains with `dim=100`, hence the
            default).

    Returns:
        A `(labels, tree)` tuple. Item `i` of the built index corresponds to
        `labels[i]`.
    """
    # Previously the dimension was hard-coded to 100 and `vector_size` was
    # silently ignored; honour the parameter instead.
    tree = AnnoyIndex(vector_size, 'angular')
    labels = model.labels
    for index, label in enumerate(labels):
        tree.add_item(index, model[label])

    tree.build(10)  # 10 trees
    return labels, tree

def get_similars(tree, labels, keyword, n_similars=10, show_score=False):
    """Return the labels nearest to `keyword` in the index `tree`.

    Args:
        tree: index exposing `get_nns_by_item(i, n=..., include_distances=True)`
            and returning a `(items, distances)` pair.
        labels: sequence of label strings; position `i` must correspond to
            item `i` of `tree`. Labels use "_" where keywords use spaces.
        keyword: keyword to look up; spaces are converted to "_" before the
            lookup in `labels`.
        n_similars: number of neighbours to request from the index.
        show_score: when true, pair each label with the distance reported by
            the index.

    Returns:
        A list of labels (underscores turned back into spaces), or a list of
        `(label, distance)` tuples when `show_score` is true. The order is the
        one returned by the index.

    Raises:
        ValueError: if the keyword is not in `labels` (raised by
            `labels.index` when `labels` is a list).
    """
    index = labels.index(keyword.replace(" ", "_"))
    # Previously `n` was hard-coded to 15 and both `n_similars` and
    # `show_score` were ignored.
    suggestions, scores = tree.get_nns_by_item(
        index, n=n_similars, include_distances=True
    )
    suggested_labels = [
        labels[suggestion].replace("_", " ")
        for suggestion in suggestions
    ]
    if show_score:
        return list(zip(suggested_labels, scores))
    return suggested_labels

def similars_tree(
    input_path, temp_tokenized_file="tmp_tokenized.txt", lang="en",
    sample_size=-1, lines_chunks=-1, n_cpus=-1, keywords_w_stopwords=False
):
    """Run the whole pipeline: tokenize, train a model, and build the index.

    The files matching `input_path` are tokenized into `temp_tokenized_file`
    (which is left on disk afterwards), a model is trained on that file with
    `train_model`, and the index is built with `similars_tree_from_model`.

    Args:
        input_path: forwarded to `tokenize_file` as its input path.
        temp_tokenized_file: path of the intermediate tokenized corpus.
        lang, sample_size, lines_chunks, n_cpus, keywords_w_stopwords:
            forwarded unchanged to `tokenize_file`.

    Returns:
        The `(labels, tree)` pair produced by `similars_tree_from_model`.
    """
    tokenizer_options = dict(
        lang=lang,
        sample_size=sample_size,
        lines_chunks=lines_chunks,
        n_cpus=n_cpus,
        keywords_w_stopwords=keywords_w_stopwords,
    )
    tokenize_file(
        input_path=input_path, output_path=temp_tokenized_file, **tokenizer_options
    )
    trained_model = train_model(temp_tokenized_file)
    found_labels, annoy_tree = similars_tree_from_model(trained_model)
    return found_labels, annoy_tree
    

In [None]:
# Sample dataset used by the example cells below; it is saved in the working directory.
data_url = "https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz"
data_filepath = "epistemonikos_data_sample.tsv.gz"
# Output path handed to `tokenize_file` in the tokenization example further down.
tokenized_filepath = "tokenized_epistemonikos_data.txt"
# IPython shell escape: `{...}` interpolates the Python variables defined above.
!wget "{data_url}" -O "{data_filepath}"

--2020-02-25 11:52:04--  https://s3.amazonaws.com/episte-labs/epistemonikos_data_sample.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.240.38
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.240.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21510551 (21M) [application/gzip]
Saving to: ‘epistemonikos_data_sample.tsv.gz’


2020-02-25 11:52:17 (1.70 MB/s) - ‘epistemonikos_data_sample.tsv.gz’ saved [21510551/21510551]



In [None]:
show_doc(tokenize_file)

<h4 id="tokenize_file" class="doc_header"><code>tokenize_file</code><a href="__main__.py#L3" class="source_link" style="float:right">[source]</a></h4>

> <code>tokenize_file</code>(**`input_path`**, **`output_path`**=*`'tokenized.txt'`*, **`lang`**=*`'en'`*, **`sample_size`**=*`-1`*, **`lines_chunks`**=*`-1`*, **`n_cpus`**=*`-1`*, **`keywords_w_stopwords`**=*`False`*)



In [None]:
tokenize_file(data_filepath, tokenized_filepath)

processing file: epistemonikos_data_sample.tsv.gz


'tokenized_epistemonikos_data.txt'

In [None]:
show_doc(train_model)

<h4 id="train_model" class="doc_header"><code>train_model</code><a href="__main__.py#L28" class="source_link" style="float:right">[source]</a></h4>

> <code>train_model</code>(**`input_filename`**)



In [None]:
model = train_model(tokenized_filepath)

In [None]:
show_doc(similars_tree_from_model)

<h4 id="similars_tree_from_model" class="doc_header"><code>similars_tree_from_model</code><a href="__main__.py#L32" class="source_link" style="float:right">[source]</a></h4>

> <code>similars_tree_from_model</code>(**`model`**, **`vector_size`**=*`100`*)



In [None]:
labels, tree = similars_tree_from_model(model)

In [None]:
show_doc(get_similars)

<h4 id="get_similars" class="doc_header"><code>get_similars</code><a href="__main__.py#L43" class="source_link" style="float:right">[source]</a></h4>

> <code>get_similars</code>(**`tree`**, **`labels`**, **`keyword`**, **`n_similars`**=*`10`*, **`show_score`**=*`False`*)



In [None]:
get_similars(tree, labels, "obesity")

['obesity',
 'overweight',
 'obese children',
 'ssbs',
 'poor sleep quality',
 'metabolic syndrome',
 'obesity among children',
 'dental caries',
 'physical inactivity',
 'obesity may',
 'sedentary behaviour',
 'food allergy',
 'sugar-sweetened beverages',
 'worldwide prevalence',
 'known risk factor']