In [None]:
# @author Alec Chapman
# @date 1/19/2018
# @author Olga Patterson
# @date 10/01/2018

In [None]:
import os, glob
import LexDiscover as ld
from gensim.models import Word2Vec
from helpers import *

# Introduction
This notebook is meant to be used as a simple interface for using the `LexDiscover` toolkit.

## Setup

## Step 1: Add Data

Point to the data.

### Option A: Read from a file/directory
This can be either a directory, in which case all .txt files will be read in, or a single file.

In [None]:
# DATA_PATH: Change this to a path to either a single file or a directory containing .txt files
# If it's a directory, all .txt files in that directory will be read in
# The result is stored in `text`, which is later handed to the discovery step as `text=text`.
# NOTE(review): `read_text_from_file` is not defined in this notebook (presumably it comes from
# `helpers`); `lowercase=True` presumably lowercases the text as it is read — confirm there.

DATA_PATH = 'data'
text = read_text_from_file(DATA_PATH, lowercase=True)

### Option B: Read from a SQL database
This option needs a database name, the SQL query to run, and the name of the column that holds the text.

In [None]:
# Settings for reading the corpus from a SQL database (Option B).
# Leave `database` and `query` empty to skip this cell and keep any `text`
# that was already loaded from DATA_PATH.
database = ''   # name of the database to read from
query = ''      # SQL query that selects the documents
col_name = 'text'  # column of the query result that holds the text

# Only call the reader once both required settings have been filled in.
# Without this guard, running every cell top to bottom would call
# read_SQL_server with an empty database name and an empty query, and the
# assignment would replace any `text` loaded by the file-based option.
if database and query:
    text = read_SQL_server(database=database, query=query, col_name=col_name, lowercase=True)
else:
    print("Option B skipped: set both `database` and `query` to read text from SQL.")

## Step 2: Add Base Lexicon of Terms and SNOMED-CT codes.
These are the terms and SNOMED-CT codes that will be used for seeding the algorithms. You can either:
1. Type them in manually by going to the line that starts with `BASE_LEX = [` and typing comma-separated terms, each enclosed in double quotes, between the brackets (e.g., `BASE_LEX = ["cardiac arrest", "heart attack"]`)
2. Uncomment the line that says `#BASE_LEX = read_base_lex(path='example_lex.txt', sep='\n')` and change the path to the location of your lexicon file. This will read in your lexicon from that file. If the terms are separated by commas, change to `sep=','`. The default separator is a new line.

For now, SNOMED-CT can only be utilized by entering codes as ints. I suggest you find some and copy and paste them from [the SNOMED browser](http://browser.ihtsdotools.org/?). Future support could automatically map terms in the base lexicon to SNOMED-CT codes.

In [None]:
# Seed terms. Edit this list by hand, or uncomment the loader line below it
# to read the terms from a file instead.
BASE_LEX = [
    "cardiac arrest",
    "heart attack",
]
#BASE_LEX = read_base_lex(path='example_lex.txt', sep='\n')

# Seed SNOMED-CT codes, entered as ints.
# Labels given by the original author for this list: Heart disease, Cardiac arrest, Angina (disorder)
# NOTE(review): confirm each code/label pairing in the SNOMED browser.
BASE_CODES = [
    56265001,
    194828000,
    410429000,
]
#BASE_CODES = read_base_codes(path='example_codes.txt', sep='\n')

# Echo both seed lists so the values in use are visible in the cell output.
for seed_values in (BASE_LEX, BASE_CODES):
    print(seed_values)

## Step 3: Set Configurations

Set the configurations for the models, or leave the values as they are to use the defaults. Here is a description of each parameter:
- `MIN_COUNT`: the number of times a word must occur to be considered. A higher number will yield more precise results. Default is 1 (any word in the corpus is considered).
- `SIM_THRESH`: the minimum similarity between two words to be added by the machine learning algorithm. A higher threshold will yield more precise results. Default is 0.5.
- `EDIT_DIST`: the number of steps of edits for the linguistic algorithm to generate misspellings. A lower number will yield more precise results. Default is 2.
- `PARENTS`: whether to search for synonyms of parent concepts in SNOMED-CT. Default `True`.
- `CHILDREN`: whether to search for synonyms of child concepts in SNOMED-CT. Default `True`.
- `MODELS`: which models you want to use.
    - `'word2vec'`: a machine learning model that will find new terms using vector similarity. Slightly noisy but has the potential to come up with good words.
    - `'ont'`: uses SNOMED codes to find parent/child concepts. The most likely to find synonyms but will not find misspellings or abbreviations.
    - `'wnling'`: linguistic variations, finds misspellings and abbreviations. This is likely to be noisy and may find words that are completely unrelated, but it can also find good terms that aren't in normal vocabularies.

In [None]:
# Models to run. Delete any strings you don't want to use.
MODELS = [
    'word2vec',
    'ont',
    'wnling',
]

# Ontology options: whether to search parent / child concepts.
CHILDREN = True
PARENTS = True

# Thresholds (see the parameter descriptions in the text above this cell).
EDIT_DIST = 2     # edit steps used when generating misspellings
SIM_THRESH = 0.5  # minimum similarity for a word to be added
MIN_COUNT = 1     # minimum number of occurrences for a word to be considered

## Step 4: Initialize model and run discovery algorithm
This will return a list, `new_lex`, that contains all new words that have been added to the vocabulary. You can then save the expanded lexicon to a file using the next cell with `write_lex(filename)`

In [None]:
# Load a previously saved word2vec model from the working directory.
w2v = Word2Vec.load("word2vec.model")

# NOTE(review): this rebinds MODELS. The list of model names chosen in the
# configuration cell is replaced by a dict that holds only the loaded word2vec
# model, so 'ont' and 'wnling' are no longer listed. Confirm this is the form
# AggregateLexDiscover expects for its `models` argument.
MODELS = {'word2vec': w2v}
print("Loaded word2vec model: ", w2v)

In [None]:
# Build the aggregate discoverer from the corpus, the seed lexicon/codes and
# the configuration values, then run discovery.
# This will take a minute, especially if a large lexicon was passed in.
discover_settings = {
    'text': text,
    'base_lex': BASE_LEX,
    'base_codes': BASE_CODES,
    'min_count': MIN_COUNT,
    'edit_dist': EDIT_DIST,
    'sim_thresh': SIM_THRESH,
    'models': MODELS,
    'parents': PARENTS,
    'children': CHILDREN,
}
discover = ld.AggregateLexDiscover(**discover_settings)

# `new_lex` holds the result of the discovery run; report how many items it has.
new_lex = discover.discover_lex()
print("{} new terms added to lexicon".format(len(new_lex)))

In [None]:
# Write the lexicon to a file, one entry per line.
# Change LEX_OUTPUT_PATH to save it somewhere else.
LEX_OUTPUT_PATH = 'expanded_lex.txt'
discover.write_lex(LEX_OUTPUT_PATH, sep='\n')

## Step 5: Explore results

In [None]:
# Sort new terms by similarity
# This call is the cell's final expression, so any value it returns is shown as the cell output.
# NOTE(review): not visible from this notebook whether the call sorts in place or
# returns a new ordering — confirm in LexDiscover.
discover.sort_by_sim()

In [None]:
# Print every newly discovered term next to its frequency in the corpus
all_new_terms = discover.get_discovered_terms()
for term in all_new_terms:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Restrict the listing to terms found by the machine learning model ('word2vec')
word2vec_terms = discover.get_discovered_terms('word2vec')
for term in word2vec_terms:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Restrict the listing to terms found through linguistic variations ('wnling')
wnling_terms = discover.get_discovered_terms('wnling')
for term in wnling_terms:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Restrict the listing to terms found using SNOMED ('ont')
ont_terms = discover.get_discovered_terms('ont')
for term in ont_terms:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Terms that two models both discovered: linguistic variations and word2vec
shared_by_two = discover.get_intersect('wnling', 'word2vec')
for term in shared_by_two:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Terms that all three models discovered
shared_by_all = discover.get_intersect('word2vec', 'wnling', 'ont')
for term in shared_by_all:
    term_count = discover.get_count(term)
    print(term, term_count)

In [None]:
# Show the contexts (surrounding words) of a single term.
# Each printed item is a context together with its probability
# (frequency of context / frequency of term).
#   'PHI'   = before sentence
#   'OMEGA' = after sentence
term = 'angina'
remove_stopwords = True

# The window is given as (words before the term, words after the term).
num_words_before = 1
num_words_after = 0
window = (num_words_before, num_words_after)

term_contexts = discover.get_context(term, window=window, remove_stopwords=remove_stopwords)
for context in term_contexts:
    print(context)

In [None]:
# Search the sentences for a word and print each one that is returned,
# with a blank line in front of it.
term_to_find = 'cardiovascular'
num_sentences_to_see = 10

matching_sentences = discover.search_in_sentences(term_to_find, num_sentences_to_see)
for sentence in matching_sentences:
    print('\n', sentence)