# Phrase2Vec (using stopwords)

This notebook, tries to generate a Phrase2Vec (Word2vec but for phrases) model. 

## Imports

In [1]:
import re
import os
import json
import gzip
from itertools import islice
from collections import Counter

import gensim
import pandas as pd

## Get the data

In [2]:
!mkdir -p data/inputs
!mkdir -p data/experiments
!mkdir -p data/outputts

In [3]:
if not os.path.exists("data/inputs/episte_all.tsv.gz"):
    !wget "http://s3.amazonaws.com/episte-labs/episte_all.tsv.gz" -O "data/inputs/episte_all.tsv.gz"

## Parameters

In [4]:
INPUT_FILENAME = "data/inputs/episte_all.tsv.gz"
INPUT_SEP = "\t"
INPUT_TEXT_COLUMN = 2   # Starting from 0
LINES_CHUNKS = 10000
SAMPLE_SIZE = 50000 # -1 all
EXPERIMENT_NAME = "episte_sample_%s" % SAMPLE_SIZE
STOPWORDS = ["several", "on", "while", "than", "own", "you've", "itself", "above", "such", "over", "they're", "mainly", "because", "theirs", "too", "most", "must", "myself", "that", "why's", "it", "can't", "show", "overall", "she", "he'd", "it's", "can", "under", "no", "she'll", "should", "therefore", "his", "you", "various", "mustn't", "are", "doing", "really", "up", "they'd", "having", "these", "made", "we'll", "into", "you'll", "more", "ought", "especially", "hasn't", "seem", "nor", "shows", "here's", "here", "he's", "is", "at", "ml", "always", "nearly", "during", "ours", "this", "aren't", "rather", "being", "very", "shown", "them", "cannot", "just", "or", "where", "didn't", "another", "they'll", "shouldn't", "wasn't", "for", "when's", "in", "could", "off", "down", "further", "won't", "due", "however", "each", "i'd", "a", "that's", "where's", "enough", "neither", "its", "isn't", "any", "himself", "was", "they've", "etc", "there's", "whom", "both", "other", "by", "within", "not", "been", "below", "be", "once", "make", "does", "did", "before", "through", "shan't", "ourselves", "which", "kg", "their", "again", "thus", "about", "few", "either", "they", "do", "our", "you'd", "some", "don't", "although", "almost", "i'll", "often", "i'm", "she'd", "we'd", "yourselves", "using", "between", "if", "upon", "him", "we", "done", "as", "so", "hers", "me", "she's", "there", "and", "i've", "may", "but", "with", "how", "found", "her", "yours", "might", "then", "we've", "the", "yourself", "what's", "km", "without", "same", "those", "my", "perhaps", "all", "haven't", "of", "why", "has", "had", "regarding", "significantly", "when", "i", "until", "used", "would", "among", "what", "let's", "am", "how's", "who's", "weren't", "mm", "hadn't", "have", "mg", "wouldn't", "showed", "were", "an", "we're", "obtained", "themselves", "who", "your", "out", "to", "doesn't", "he", "herself", "pmid", "against", "use", "you're", "couldn't", "after", "he'll", "only", "also", "mostly", "quite", "seen", "since"]

WORD2VEC_SIZE=300
WORD2VEC_WINDOW=3
WORD2VEC_MIN_COUNT=5
WORD2VEC_EPOCHS=20

TOTAL_WORKERS = 4

EXPERIMENT_PATH = "data/experiments/%s" % EXPERIMENT_NAME
TOKENIZED_DATA_PATH = "%s/prepared.tsv.gz" % EXPERIMENT_PATH
COUNTER_DATA_PATH = "%s/counter.tsv.gz" % EXPERIMENT_PATH

if not os.path.exists(EXPERIMENT_PATH):
    os.makedirs(EXPERIMENT_PATH)


## Prepare the data

### Tokenize

For 50,000 documents takes 55 segs aprox

In [5]:
def get_file_chunks(filepath, lines_chunk):
    with gzip.open(filepath, "rt") as _file:
        while True:
            next_n_lines = list(islice(_file, lines_chunk))
            yield "".join(next_n_lines).lower()
            if not next_n_lines:
                break

def tokenize_text():
    """This is the parts where we are goint to separate the text, according to the following rules.
    We are goint to use the stopwords to separate the different expression, and in that way identify keywords,
    as an alternative to use ngrams. Example (! is used as separator of the keywords): 
        - input: "Timing of replacement therapy for acute renal failure after cardiac surgery"
        - output: "timing!replacement therapy!acute renal failure!cardiac surgery"
    Another example:
        - input: "Acute renal failure (ARF) following cardiac surgery remains a significant cause of mortality. The aim of this study is to compare early and intensive use of continuous veno-venous hemodiafiltration (CVVHDF) with conservative usage of CVVHDF in patients with ARF after cardiac surgery."
        - output: "acute renal failure!arf!following cardiac surgery remains!significant cause!mortality!aim!study!compare early!intensive!continuous veno-venous hemodiafiltration!cvvhdf!conservative usage!cvvhdf!patients!arf!cardiac surgery!"
    """
    index = 0
    with gzip.open(TOKENIZED_DATA_PATH, "wt") as _output:
        # We are going to split the text in chunks to show some progress.
        for text_part in get_file_chunks(INPUT_FILENAME, LINES_CHUNKS):
            # Must be executed in order
            regexs = [
                # Remove all stopwords by a !, we are searching for the stopword (bounded)
                "\\b" + "\\b|\\b".join(STOPWORDS),
                # Remove all non alpha, numeric, spaces, - or single quote
                r'([^a-z0-9\u00C0-\u1FFF\u2C00-\uD7FF \t\n\-\'])',
                # remove only words numbers
                r'\b[0-9]+\b',
                # remove spaces between !
                r' *! *',
                # remove multiple ! (!!!!)
                r'!+',
                # remove one character keyword
                r'!.!',
            ]

            for regex in regexs:
                text_part = re.sub(regex, '!', text_part)
            _output.write(text_part)
            index += 1
            print("%s lines processed" % (index * LINES_CHUNKS), end='\r')
            if SAMPLE_SIZE > 0 and index * LINES_CHUNKS >= SAMPLE_SIZE:
                break

tokenize_text()

50000 lines processed

### Read tokenized data

In [6]:
documents_keywords = []
index = 0
for line in gzip.open(TOKENIZED_DATA_PATH, "rt"):
    documents_keywords.append(line[0:-1].split(INPUT_SEP)[INPUT_TEXT_COLUMN].split("!"))
    index += 1
total_documents = len(documents_keywords)


## Generate the model

For 50,000 documents takes 2min 30seg aprox

In [7]:
model = gensim.models.Word2Vec(
    documents_keywords, size=WORD2VEC_SIZE, window=WORD2VEC_WINDOW,
    min_count=WORD2VEC_MIN_COUNT, workers=TOTAL_WORKERS
)
model.train(documents_keywords, total_examples=total_documents, epochs=WORD2VEC_EPOCHS)
word_vectors = model.wv

## Calculate words frequency

In [8]:
counter = Counter([
    keyword
    for keywords in documents_keywords
    for keyword in keywords
])
counter_frame = pd.DataFrame.from_dict(counter, orient='index').reset_index()
counter_frame = counter_frame.rename(columns={'index':'term', 0:'count'})

## Try the model

In [9]:
word_to_test = ["obesity"]
top_similars = pd.DataFrame(
    word_vectors.most_similar(positive=word_to_test, topn=25),
    columns=["term", "score"]
)
top_similars.merge(counter_frame, on='term', how='left')

  if np.issubdtype(vec.dtype, np.int):


Unnamed: 0,term,score,count
0,overweight,0.612261,598
1,obese individuals,0.578053,39
2,obese,0.555815,152
3,obese adults,0.53913,63
4,childhood obesity,0.515776,100
5,childhood overweight,0.513356,32
6,obese people,0.50779,22
7,cardiovascular diseases,0.473498,150
8,obesity prevention,0.472821,32
9,dental caries,0.470418,148


## Save the vectors

In [10]:
store_model_path = os.path.join(EXPERIMENT_PATH,  "word2vec.vec")
word_vectors.save(store_model_path)
print(EXPERIMENT_PATH)

data/experiments/episte_sample_50000


## Load the vectors

In [11]:
word_vectors = gensim.models.KeyedVectors.load(store_model_path, mmap='r')