# Using the Gensim Phrases Module

In [None]:
%matplotlib inline

In [None]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob
import re
from gensim.models import phrases
from IPython.display import clear_output
import pickle
import gzip

## Select Some Text from the MIMIC2 Database

In [None]:
# Load subject_id, hadm_id and text for up to 10000 radiology reports into a DataFrame.
# NOTE(review): `conn` is not created in any cell shown here; presumably it is an
# open pymysql connection to the MIMIC2 database -- confirm it exists before running.
# NOTE(review): LIMIT is used without ORDER BY, so which rows come back is up to
# the database.
rad_data = \
pd.read_sql("""SELECT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000""",conn)
# Preview the first five rows.
rad_data.head(5)

## We need to get all the reports into a single string
#### This is a great application for list comprehension

* Remember that the ``join`` method of a string ``a`` joins a list of strings, separating them with the value of ``a``. For example, ``"\n".join(["1","2","3"])``

In [None]:
# Demo: join three strings using a newline as the separator.
"\n".join(["1","2","3"])

In [None]:
# Concatenate every report into one space-separated string. Joining the "text"
# column directly avoids iterrows(), which builds a full Series for every row
# only to read a single field from it.
big_string = " ".join(rad_data["text"])
# Lower-case the whole corpus before handing it to TextBlob for tokenization.
blob = TextBlob(big_string.lower())

In addition to splitting the text into words (and tokens), the TextBlob object also splits the text into sentences using standard English rules. There will be lots of mistakes.

``blob.sentences`` will be a list of sentence objects

In [None]:
# Sentence segmentation of the whole lower-cased corpus, as computed by TextBlob.
sentences = blob.sentences

#### Sentence objects have attributes such as ``words``, ``tokens``, ``word_counts``, etc.

In [None]:
# Take the first sentence and list the attributes and methods it exposes.
s = sentences[0]
dir(s)

### Phrase detection is done at the sentence level
``phrases.Phrases`` needs a list of lists of words

In [None]:
# One word list per sentence: the list-of-lists shape that phrases.Phrases is fed below.
sentences2 = [sentence.words for sentence in sentences]

### We build our phrase detectors recursively
* We first detect two-word phrases
* We then pass the output of the two-word phrase detector to detect three-word phrases, and so on

In [None]:
# Learn two-word phrases from the per-sentence word lists (default Phrases settings).
bigram_generator = phrases.Phrases(sentences2)

In [None]:
# Second pass: train on the bigram-merged sentences so that an already-detected
# bigram can join with a neighbouring word to form a longer phrase.
trigram_generator = phrases.Phrases(bigram_generator[sentences2])


### ``Phrases`` takes keyword arguments

#### The one we might be most interested in is

* ``min_count`` with default of 5: The minimum number of observations in this corpus to be considered a pattern

In [None]:
# Print the built-in documentation for Phrases, including its keyword arguments.
help(phrases.Phrases)

## Create a Report Browser

In [None]:
import re
# Pattern matching a single digit; the report browser below uses it to replace
# every digit in a report with the letter "d".
rd = re.compile(r"\d")

In [None]:
# Interactive report browser: ask for a row index, then print that report with
# the detected phrases joined by underscores. Any non-integer input, or an index
# outside the valid range, ends the loop.
num_reports = rad_data.shape[0]
# Valid indices are 0 .. num_reports - 1, so the prompt advertises that upper bound.
prompt = "Enter a number between 0 and %d (anything else to quit): " % (num_reports - 1)
while True:
    # Only the integer parse is guarded: a ValueError raised later (e.g. inside
    # TextBlob or gensim) should surface instead of silently ending the loop.
    try:
        i = int(input(prompt))
    except ValueError:
        break
    clear_output()

    if not 0 <= i < num_reports:
        break
    # Normalize the report the same way on every pass: trim, lower-case, and
    # replace each digit with "d".
    report = rad_data.iloc[i]['text'].strip().lower()
    txt = TextBlob(rd.sub("d", report))
    # Apply the bigram model first, then the trigram model on its output.
    print(" ".join(trigram_generator[bigram_generator[txt.tokens]]))


### Look at what phrases were detected

## Wrangling Doesn't Always Do What You Want

>technique : multiplanar_td and td-weighted_images of the brain with gadolinium_according to standard departmental protocol .

In [None]:
# Run the full word list of the corpus through both phrase models and keep only
# the merged tokens, i.e. those containing the "_" joiner.
found_phrases = [
    token
    for token in trigram_generator[bigram_generator[blob.words]]
    if "_" in token
]