# Using the Gensim Phrases Module

In [1]:
# Render matplotlib figures inside the notebook output.
# NOTE(review): no plotting call appears in the cells visible here — confirm this is still needed.
%matplotlib inline

In [2]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob
import re
from gensim.models import phrases
from IPython.display import clear_output
import pickle
import gzip

## Select Some Text from the MIMIC2 Database

In [3]:
# Connect to the mimic2 database. The password is requested interactively
# through getpass, so it is never written into the notebook source.
conn = pymysql.connect(
    host="mysql",
    port=3306,
    user="jovyan",
    passwd=getpass.getpass("Enter MySQL passwd for jovyan"),
    db='mimic2',
)


Enter MySQL passwd for jovyan········


In [4]:
# Fetch up to 10000 radiology reports, keeping the subject and admission ids
# alongside each report text, then preview the first rows.
radiology_query = """SELECT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000"""
rad_data = pd.read_sql(radiology_query, conn)
rad_data.head()

Unnamed: 0,subject_id,hadm_id,text
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...
3,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...
4,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...


## We need to get all the reports into a single string
#### This is a great application for a list comprehension

* Remember that the ``join`` method of a string ``a`` joins a list of strings, placing the value of ``a`` between them. For example, ``"\n".join(["1","2","3"])``:

In [5]:
# The separator ("\n") goes between the items only; nothing follows the last one.
"\n".join(["1","2","3"])

'1\n2\n3'

In [6]:
# Iterate over the "text" column directly rather than calling iterrows(),
# which constructs a full Series for every row just to read one field.
# The isinstance filter drops rows whose text is missing (None/NaN), which
# would otherwise make str.join raise a TypeError.
big_string = " ".join(
    [report for report in rad_data["text"] if isinstance(report, str)]
)
# Lower-case the combined text before wrapping it in a TextBlob.
blob = TextBlob(big_string.lower())

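In addition to splitting the text into words (and tokens), the TextBlob object also splits the text into sentences using standard English rules. There will be lots of mistakes, because radiology reports are full of headers, abbreviations and fragments rather than standard English prose.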

``blob.sentences`` will be a list of sentence objects

In [7]:
# Keep the sentence list under its own name; later cells index and iterate over it.
sentences = blob.sentences

#### Sentence objects have attributes such as ``words``, ``tokens``, ``word_counts``, etc.

In [8]:
# Take the first sentence as a sample and list every attribute name it exposes
# (dunder and private names included).
s = sentences[0]
dir(s)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cmpkey',
 '_compare',
 '_strkey',
 'analyzer',
 'classifier',
 'classify',
 'correct',
 'detect_language',
 'dict',
 'end',
 'end_index',
 'ends_with',
 'endswith',
 'find',
 'format',
 'index',
 'join',
 'lower',
 'ngrams',
 'noun_phrases',
 'np_counts',
 'np_extractor',
 'parse',
 'parser',
 'polarity',
 'pos_tagger',
 'pos_tags',
 'raw',
 'replace',
 'rfind',
 'rindex',
 'sentiment',
 'sentiment_assessments',
 'split',
 'start',
 'start_index',
 'starts_with',
 'startswith',
 'string',
 'strip',
 'stripped',
 'subjectivity',
 'tags',
 'title',
 'to

### Phrase detection is done at the sentence level
``phrases.Phrases`` needs a list of lists of words

In [9]:
# Reduce each sentence object to its word list: one inner list per sentence.
sentences2 = [sentence.words for sentence in sentences]

### We build our phrase detectors recursively
* We first detect two-word phrases
* We then pass the output of the two-word phrase detector to detect three-word phrases, and so on

In [10]:
# First pass: learn two-word phrases from the per-sentence word lists,
# using the default keyword arguments.
bigram_generator = phrases.Phrases(sentences2)

In [11]:
# Second pass: indexing the bigram model with the corpus yields the sentences
# with detected bigrams already joined; training on that lets a joined bigram
# pair up with a neighbouring word to form a longer phrase.
bigram_sentences = bigram_generator[sentences2]
trigram_generator = phrases.Phrases(bigram_sentences)




### ``Phrases`` takes keyword arguments

#### The one we might be most interested in is

* ``min_count`` (default 5): the minimum number of times a word sequence must be observed in the corpus to be considered a phrase

In [12]:
# Show the built-in documentation for the Phrases class.
help(phrases.Phrases)

Help on class Phrases in module gensim.models.phrases:

class Phrases(SentenceAnalyzer, PhrasesTransformation)
 |  Detect phrases based on collocation counts.
 |  
 |  Method resolution order:
 |      Phrases
 |      SentenceAnalyzer
 |      PhrasesTransformation
 |      gensim.interfaces.TransformationABC
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, sentence)
 |      Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter.
 |      
 |      If `sentence` is an entire corpus (iterable of sentences rather than a single
 |      sentence), return an iterable that converts each of the corpus' sentences
 |      into phrases on the fly, one after another.
 |      
 |      Parameters
 |      ----------
 |      sentence : {list of str, iterable of list of str}
 |          Sentence or text corpus.
 |      
 |      Returns
 |      -------
 |      {list of str, :class:`gensim.interf

## Create a Report Browser

In [13]:
# `re` is already imported in the import cell at the top of the notebook.
# The pattern matches any single digit; the report browser below replaces each
# match with the letter "d" so that numbers collapse to a common shape
# (a date such as 2644-1-17 becomes dddd-d-dd).
rd = re.compile(r"\d")

In [None]:
# Interactive browser: ask for a report index, then print that report with
# digits masked and detected phrases joined by underscores. Any non-integer
# entry, or an index outside the valid range, ends the loop.
num_reports = rad_data.shape[0]
while True:
    # Guard only the parsing of the user's entry. A ValueError raised later,
    # while a report is being processed, should surface as an error instead
    # of silently ending the browser.
    try:
        # The last valid index is num_reports - 1, so that is what the prompt shows.
        i = int(input("Enter a report number between 0 and %d (anything else quits): "
                      % (num_reports - 1)))
    except ValueError:
        break
    clear_output()

    if i < 0 or i >= num_reports:
        break
    # Mask every digit with "d" before tokenizing the lower-cased report.
    txt = TextBlob(rd.sub("d", rad_data["text"].iloc[i].strip().lower()))
    # The whole report is passed as a single token list: bigrams are joined
    # first, then the trigram model runs over that output.
    print(" ".join(trigram_generator[bigram_generator[txt.tokens]]))


date : [ **dddd-dd-dd** ] dd : dd am_perc_g/g-j tube_plmt_clip # [ **clip number ( radiology ) ddddd** ] reason : perm_gi_access through_pej_tube contrast : conray_amt : dd ********************************* cpt_codes ******************************** * ddddd perc_plcmt_gastromy_tube ddddd plct_gj_tube * * -dd distinct_procedural_service ddddd perc_plcmt_entroclysis_tube * * ddddd iv_conscioutious sedation_pro * **************************************************************************** ______________________________________________________________________________ underlying_medical_condition : dd year_old_man with hypoxia and b/l pna trach_'d and needing more perm feeding access . please_place_pej_not peg_tube in this patient . reason for this_examination : perm_gi_access through_pej_tube ______________________________________________________________________________ final_report_indication : dd-year-old man with hypoxia and bilateral pneumonia , status_post tracheotomy and respiratory_

### Look at what phrases were detected

## Wrangling Doesn't Always Do What You Want

>technique : multiplanar_td and td-weighted_images of the brain with gadolinium_according to standard departmental protocol .

In [None]:
# Apply the detectors sentence by sentence, the same way they were trained.
# Passing blob.words would present the whole corpus as one giant sentence, so
# a phrase could be stitched together across a sentence boundary.
# Joined phrases are recognised by the underscore between their words.
found_phrases = [
    token
    for sentence in trigram_generator[bigram_generator[sentences2]]
    for token in sentence
    if "_" in token
]