# Learning Goal
Understand why natural language processing and text representation are important, the different ways to represent text, and how to implement a few simple textual representations.

In [63]:
# from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.summarization.textcleaner import split_sentences, tokenize_by_word
# conda install -c anaconda gensim
# pip install --upgrade gensim

In [7]:
# Extract abstracts from the original PubMed XML download into a plain-text file,
# one abstract per line. This cell is shown for reference only: the rest of the
# notebook reads the resulting text file and never touches the XML again.
n_abs = 0
# Open the XML *before* the output file. Opening the output with "w" truncates it,
# so if the XML is missing we want to fail here without wiping an existing
# data/pubmed_sample.txt.
# The encoding is named explicitly (PubMed XML is distributed as UTF-8) so the
# result does not depend on the platform's default locale encoding.
with open("data/pubmed20n0001.xml", "r", encoding="utf-8") as pubmed_file:
    with open("data/pubmed_sample.txt", "w", encoding="utf-8") as outfile:
        for line in pubmed_file:
            # Only lines containing a bare <AbstractText> opening tag are kept.
            if "<AbstractText>" in line:
                line = line.strip()
                # Drop the surrounding tags, leaving just the abstract text.
                line = line.replace("<AbstractText>", "").replace("</AbstractText>", "")
                outfile.write(line + "\n")
                n_abs += 1
print(n_abs, "abstracts processed")

In [9]:
# Load the extracted abstracts: one abstract per line, with surrounding whitespace removed.
# The encoding is named explicitly so the read does not depend on the platform default.
with open("data/pubmed_sample.txt", "r", encoding="utf-8") as abstract_file:
    abstract_list = [line.strip() for line in abstract_file]
print(len(abstract_list), "abstracts read in")
# Splitting the abstracts into sentences is done in a later cell (see sentence_list).

15437 abstracts read in


In [10]:
# Preview the first five abstracts, each followed by a row of asterisks and blank lines.
divider = "***************************************************\n\n\n"
for idx in range(5):
    print(abstract_list[idx], divider, sep="\n")

(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.
***************************************************



A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development

In [60]:
# Flatten every abstract into its individual sentences, keeping the abstracts' order.
sentence_list = [
    sentence
    for abstract in abstract_list
    for sentence in split_sentences(abstract)
]
print(len(sentence_list), "sentences extracted")

101724 sentences extracted


In [32]:
# Preview the first five sentences, each followed by a row of asterisks.
divider = "***************************************************\n"
for idx in range(5):
    print(sentence_list[idx], divider, sep="\n")

(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value.
***************************************************

The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5.
***************************************************

The antipeptic action of bisabolol only occurs in case of direct contact.
***************************************************

In case of a previous contact with the substrate, the inhibiting effect is lost.
***************************************************

A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104.
***************************************************



In [57]:
# Display the first extracted sentence as a raw string.
sentence_list[0]

'(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value.'

In [67]:
# One more step: Word2Vec expects a list of texts, where each text is itself a list of tokens (words).
# Here each abstract becomes one token list.
abstract_list_tokenized = [
    list(tokenize_by_word(abstract))
    for abstract in abstract_list
]

In [73]:
# Same tokenization as for the abstracts, but with one token list per sentence.
sentence_list_tokenized = [
    list(tokenize_by_word(sentence))
    for sentence in sentence_list
]

In [70]:
# Display the first abstract as raw text, for comparison with its tokenized form.
abstract_list[0]

'(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.'

In [71]:
# Display the token list produced by tokenize_by_word for the first abstract.
abstract_list_tokenized[0]

['alpha',
 'bisabolol',
 'has',
 'a',
 'primary',
 'antipeptic',
 'action',
 'depending',
 'on',
 'dosage',
 'which',
 'is',
 'not',
 'caused',
 'by',
 'an',
 'alteration',
 'of',
 'the',
 'ph',
 'value',
 'the',
 'proteolytic',
 'activity',
 'of',
 'pepsin',
 'is',
 'reduced',
 'by',
 'percent',
 'through',
 'addition',
 'of',
 'bisabolol',
 'in',
 'the',
 'ratio',
 'of',
 'the',
 'antipeptic',
 'action',
 'of',
 'bisabolol',
 'only',
 'occurs',
 'in',
 'case',
 'of',
 'direct',
 'contact',
 'in',
 'case',
 'of',
 'a',
 'previous',
 'contact',
 'with',
 'the',
 'substrate',
 'the',
 'inhibiting',
 'effect',
 'is',
 'lost']

In [126]:
# Train word embeddings using whole abstracts as the training texts.
# NOTE: `size` and `iter` are the gensim 3.x parameter names
# (renamed to `vector_size` and `epochs` in gensim 4).
model_abstract = Word2Vec(
    sentences=abstract_list_tokenized,  # training corpus: one token list per abstract
    size=100,     # dimensionality of each word vector
    window=5,     # max distance between the current word and the predicted word
    min_count=1,  # minimum number of occurrences for a word to be learned
    iter=5,       # number of passes over the corpus
    workers=6,    # number of training threads
)

In [None]:
# Train word embeddings using individual sentences as the training texts.
# NOTE: `size` and `iter` are the gensim 3.x parameter names
# (renamed to `vector_size` and `epochs` in gensim 4).
model_sentence = Word2Vec(
    sentences=sentence_list_tokenized,  # training corpus: one token list per sentence
    size=100,     # dimensionality of each word vector
    window=5,     # max distance between the current word and the predicted word
    min_count=5,  # minimum number of occurrences for a word to be learned
    iter=5,       # number of passes over the corpus
    workers=6,    # number of training threads
)

In [115]:
# Take the word vectors (.wv) of the abstract-trained model; the lookups below query these.
embeddings = model_abstract.wv

In [116]:
# Shape of the embedding matrix; presumably (number of learned words, vector size).
embeddings.vectors.shape

(49667, 100)

In [108]:
# Words most similar to "dosage" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("dosage")

[('doses', 0.7100783586502075),
 ('dosages', 0.6892666816711426),
 ('dose', 0.6707139015197754),
 ('cost', 0.6541118621826172),
 ('daily', 0.646964430809021),
 ('therapeutic', 0.6453495025634766),
 ('therapy', 0.6448436975479126),
 ('medication', 0.6346715688705444),
 ('lorazepam', 0.6234850883483887),
 ('diazepam', 0.6096578240394592)]

In [117]:
# Words most similar to "lower" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("lower")

[('higher', 0.9615294337272644),
 ('larger', 0.8482336401939392),
 ('smaller', 0.8306859731674194),
 ('greater', 0.8293726444244385),
 ('faster', 0.778285026550293),
 ('slower', 0.7706201076507568),
 ('less', 0.7581825256347656),
 ('hotter', 0.7533649206161499),
 ('weaker', 0.7118234634399414),
 ('shorter', 0.7113118767738342)]

In [119]:
# Words most similar to "mouse" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("mouse")

[('chick', 0.8806798458099365),
 ('embryo', 0.8421338796615601),
 ('hamster', 0.8413895964622498),
 ('spleen', 0.830163836479187),
 ('embryonic', 0.8244233131408691),
 ('fibroblasts', 0.8156878352165222),
 ('thymus', 0.8152838945388794),
 ('chicken', 0.8092774152755737),
 ('leukocytes', 0.7919324636459351),
 ('ascites', 0.791671872138977)]

In [121]:
# Words most similar to "doctor" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("doctor")

[('faculty', 0.9428253173828125),
 ('planning', 0.9331449866294861),
 ('continuing', 0.9328274726867676),
 ('rehabilitation', 0.9313790798187256),
 ('item', 0.9312576055526733),
 ('nurses', 0.9282447099685669),
 ('professional', 0.9276617765426636),
 ('team', 0.9270298480987549),
 ('offering', 0.9269712567329407),
 ('programme', 0.9234572649002075)]

In [122]:
# Words most similar to "patient" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("patient")

[('child', 0.869658350944519),
 ('children', 0.8294508457183838),
 ('disease', 0.8209226727485657),
 ('patients', 0.8181960582733154),
 ('syndrome', 0.8059149980545044),
 ('symptoms', 0.7952113747596741),
 ('illness', 0.7797609567642212),
 ('woman', 0.7770554423332214),
 ('complication', 0.7749495506286621),
 ('physician', 0.7711752653121948)]

In [123]:
# Words most similar to "man" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("man")

[('humans', 0.7934281826019287),
 ('testes', 0.7546995878219604),
 ('quiescence', 0.7312958240509033),
 ('persons', 0.7307960987091064),
 ('dog', 0.727771520614624),
 ('neonate', 0.7184747457504272),
 ('testicular', 0.7175263166427612),
 ('animal', 0.7013948559761047),
 ('cryptorchism', 0.700571596622467),
 ('boys', 0.6866430044174194)]

In [124]:
# Words most similar to "woman" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("woman")

[('girl', 0.9541659355163574),
 ('boy', 0.937836229801178),
 ('child', 0.9281169176101685),
 ('phobia', 0.8958944082260132),
 ('undescended', 0.8769272565841675),
 ('infant', 0.8769197463989258),
 ('recurrent', 0.8762044906616211),
 ('arthritis', 0.8761972188949585),
 ('febrile', 0.872424840927124),
 ('survivors', 0.870413064956665)]

In [125]:
# Words most similar to "dna" in the abstract-trained embeddings, with similarity scores.
embeddings.most_similar("dna")

[('rna', 0.9284805059432983),
 ('collagen', 0.7629274129867554),
 ('protein', 0.7380354404449463),
 ('phage', 0.7110795974731445),
 ('polymerase', 0.708604097366333),
 ('chromatin', 0.6936800479888916),
 ('stranded', 0.6845724582672119),
 ('actin', 0.6819810271263123),
 ('peptidoglycan', 0.6817463636398315),
 ('ribosomal', 0.6722898483276367)]