In [None]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob
import re

In [None]:
# Connect to the `mimic2` database on the "mysql" host as user "jovyan".
# The password is prompted for interactively so it never lands in the notebook.
conn = pymysql.connect(
    host="mysql",
    port=3306,
    user="jovyan",
    passwd=getpass.getpass("Enter MySQL passwd for jovyan"),
    db='mimic2',
)

# Cursor for issuing ad-hoc statements over the same connection.
cursor = conn.cursor()

### Use Pandas and SQL to create a dataframe with the following:
* subject_id
* hospital admission id
* text of the radiology report
* Limit the number of reports to 10000

In [None]:
# Pull subject id, admission id and report text for up to 10000 radiology notes.
radiology_query = """SELECT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000"""
rad_data = pd.read_sql(radiology_query, conn)

# Preview the first five rows.
rad_data.head(5)

In [None]:
# Show the (rows, columns) dimensions of the query result.
rad_data.shape

In [None]:
# Print the raw text of the report at position 20 to inspect its layout.
print(rad_data.iloc[20]["text"])

### Write a function that returns true or false depending on whether the report contains an impression section

#### Hints

* Not every report will have an impression section
* "INTERPRETATION" and "CONCLUSIONS" might be synonyms for "IMPRESSION"

In [None]:
def count_impression(report):
    """Report whether ``report`` contains an impression section.

    A section counts as an impression when an upper-case ``IMPRESSION``,
    ``INTERPRETATION`` or ``CONCLUSION`` header (optionally plural) is
    followed by a colon. Matching is case-sensitive so that ordinary prose
    such as "clinical impression" is not mistaken for a section header.

    Parameters
    ----------
    report : str or None
        Full text of one radiology report.

    Returns
    -------
    bool
        ``True`` if a header is found, ``False`` otherwise (including for
        empty or missing text). Booleans sum as 0/1, so the result can be
        passed straight to ``sum`` to count reports.
    """
    if not report:
        return False
    header = re.search(
        r"\b(?:IMPRESSIONS?|INTERPRETATION|CONCLUSIONS?)\b\s*:", report
    )
    return header is not None


In [None]:
# Count the reports for which count_impression returns a truthy value.
sum(count_impression(report_text) for report_text in rad_data["text"])

### Write a function that returns the impression section of a report



In [None]:
def get_impression(report):
    """Return the impression section of a radiology report.

    The section starts right after the first upper-case ``IMPRESSION``,
    ``INTERPRETATION`` or ``CONCLUSION`` header (optionally plural, followed
    by a colon) and runs to the end of the report.

    Parameters
    ----------
    report : str or None
        Full text of one radiology report.

    Returns
    -------
    str
        The section text with surrounding whitespace stripped, or ``""``
        when the report is empty/missing or has no such header. A string is
        always returned so callers can chain string operations safely.
    """
    if not report:
        return ""
    header = re.search(
        r"\b(?:IMPRESSIONS?|INTERPRETATION|CONCLUSIONS?)\b\s*:", report
    )
    if header is None:
        return ""
    # Everything after the header is treated as the impression body.
    return report[header.end():].strip()


### Define Regular expressions for data cleansing

We have a lot of patterns that are unique and don't convey meaningful information

* De-identified names, dates, etc. 
    * `[**Clip Number (Radiology) 12569**]`
    * `[**Hospital 12568**]`
    * `[**Last Name (NamePattern4) 337**]` 
    * `[**First Name8 (NamePattern2) 12565**]` 
    * `[**Last Name (NamePattern1) 12566**]`
* Separators  (e.g. `__________________`)

### Split into groups and write and test regular expressions to capture these patterns
* Write a regular expression to replace dates in the reports with ``[**DATE**]``
* Write a regular expression to replace times in the reports with ``[**TIME**]``
* Write a regular expression to replace digits with "d" (e.g. "43 cc" would become "dd cc")

In [None]:
# Month name (optionally abbreviated with a trailing ".") followed by a day number,
# e.g. "Jan 12" or "Sept. 3".
date = re.compile(r"""((?P<month>[A-Z][a-z]{2,}(\.)?) (?P<day>[0-9]{1,2}))""")

# De-identified last-name tags such as "[**Last Name (NamePattern4) 337**]".
# The trailing numeric id is optional so tags with and without one both match.
last_name = re.compile(r"""\[\*\*Last Name \(NamePattern\d+\) \d*\*\*\]""")

# Ages written like "74-year-old", "74 years old" or "74 y.o."; the number is
# captured in the named group "age".
age2 = re.compile(r"""(?P<age>[0-9]+)(-|\s)y(ear(s)?|\.)(-|\s)?o(ld|\.)""")

# Ages written like "age 74" or "aged 74"; the number is captured in "age".
age3 = re.compile(r"""\bage(d)? (?P<age>[0-9]+)""")

# Any single decimal digit.
digits = re.compile(r"""\d""")
def age_in_decades(m):
    """Build a de-identified age tag from a regex match.

    ``m`` must expose a named group ``"age"`` holding a run of digits. The
    age is rounded down to a multiple of ten and returned in the form
    ``"[** Age in 70s**]"``.
    """
    years = int(m.group("age"))
    decade = int(years / 10) * 10
    return "[** Age in %ss**]" % (decade,)

# Quick check of the replacement callback on a single hand-written example.
age_in_decades(next(age2.finditer("74-year-old")))

# `report_txt` was never assigned anywhere in the notebook, so this cell failed
# with a NameError on a fresh kernel; use the first loaded report as the sample.
report_txt = rad_data.iloc[0]["text"]

# Replace "age(d) NN" mentions first, then "NN-year-old" style mentions.
tmp = re.sub(age2, age_in_decades, re.sub(age3, age_in_decades, report_txt))

#### Hints

* Look at some sample reports to see what dates and times look like in the reports
* What order would you need to apply the regular expressions?

In [None]:
# Print the first ten reports, each followed by a line of 42 asterisks and
# blank lines so the report boundaries are easy to spot.
separator = "*" * 42
for i in range(10):
    report_text = rad_data.iloc[i]['text']
    print(report_text)
    print(separator, "\n\n")

### You can use the Pandas `iloc` method to grab specific reports

In [None]:
# Print the full text of the report at position 0.
print(rad_data.iloc[0]["text"])

In [None]:
# Print the full text of the report at position 0.
# NOTE(review): this repeats the preceding cell verbatim; change the index to
# look at a different report, or drop one of the two cells.
print(rad_data.iloc[0]["text"])

### Create a new column named "impression" for storing just the impression portion of the report

In [None]:
def _clean_impression(row):
    """Extract a row's impression, mask each digit with "d", and lower-case it."""
    impression = get_impression(row["text"])
    masked = digits.sub("d", impression)
    return masked.lower()


# Apply the cleaning step row by row and store the result in a new column.
rad_data["impression"] = rad_data.apply(_clean_impression, axis=1)

In [None]:
# Preview the first rows of the frame.
rad_data.head()

In [None]:
# Print the value stored in the "impression" column for the report at position 20.
print(rad_data.iloc[20]["impression"])

### How many unique words occur in the corpus?

#### Hint

1. Use TextBlob
1. Put all the reports into a single string

#### I got 8658

In [None]:
#" ".join(rad_data["impression"])

In [None]:
# Join every impression into one string, let TextBlob tokenise it, and keep
# the distinct tokens.
all_impressions = " ".join(rad_data["impression"])
unique_impression_words = set(TextBlob(all_impressions).words)

# Number of distinct tokens in the corpus.
len(unique_impression_words)



In [None]:
#unique_impression_words

In [None]:
from gensim.parsing.preprocessing import STOPWORDS
# Display the stop-word collection imported from gensim.
STOPWORDS

In [None]:
# Hand-picked stop words to drop from impressions; kept deliberately short.
my_stop_words = frozenset(
    (
        "a",
        "am",
        "an",
        "and",
        "are",
        "as",
        "at",
        "be",
        "for",
        "is",
        "the",
        "of",
        "which",
    )
)

## Create a single string with all the reports

#### Hints, etc.
* Use List Comprehension
* Use string joins
* Iterate over the rows of the data frame

### Define a vector space for the radiology corpus

#### Hints

1. How would you build a corpus from only the words occurring more than N times?

### Create a new column named `"impression no stops"` where [stop words](https://en.wikipedia.org/wiki/Stop_words) have been dropped from the impression

* The gensim package has stop words defined (``from gensim.parsing.preprocessing import STOPWORDS``)

#### Hints
1. Do you agree with dropping all the stop words?
1. How could we create a new stopwords frozen set without the terms we want to keep (double negative?)
1. You could use a regular expression substitution, or tokenize the report first and operate on the list of words.

In [None]:
# Tokenise each impression with TextBlob, drop every token found in
# `my_stop_words`, and re-join the survivors with single spaces. The short
# custom list is used instead of gensim's STOPWORDS so that only the words
# listed there are removed.
rad_data["impression no stops"] = rad_data.apply(
    lambda row: " ".join(
        word
        for word in TextBlob(row["impression"]).words
        if word not in my_stop_words
    ),
    axis=1,
)

In [None]:
# Show the "impression" value for the report at position 0.
rad_data.iloc[0]["impression"]

In [None]:
# Show the "impression no stops" value for the report at position 0.
rad_data.iloc[0]["impression no stops"]

### What are the unique words in our vocabulary?

In [None]:
# Distinct tokens across all stop-word-filtered impressions. Tokens are
# converted to plain `str` and sorted so that any index assigned from this
# sequence is the same on every run (set iteration order is not stable).
unique_impression_words = sorted(
    {
        str(word)
        for word in TextBlob(" ".join(rad_data["impression no stops"])).words
    }
)


### We'll create a vocabulary with `zip` and `dict`

In [None]:
# Assign each vocabulary word a consecutive integer id, starting at 0.
word_map = {word: index for index, word in enumerate(unique_impression_words)}

In [None]:
# Number of entries in the vocabulary mapping.
len(word_map)

In [None]:
# Display the word -> index mapping.
# NOTE(review): this renders every entry of the dict; consider showing only a
# small sample if the vocabulary is large.
word_map

### Save for use in our next notebook

In [None]:
import gzip
import pickle

# The target name ends in ".gz", so write the pickle through gzip; a plain
# open() produced an uncompressed file with a misleading extension.
# Readers must load it with gzip.open("rad_data.pickle.gz", "rb") + pickle.load.
with gzip.open("rad_data.pickle.gz", "wb") as f0:
    pickle.dump(rad_data, f0)

In [None]:
# The target name ends in ".gz", so write the pickle through gzip; a plain
# open() produced an uncompressed file with a misleading extension.
# Readers must load it with gzip.open("rad_vocabulary.pickle.gz", "rb") + pickle.load.
with gzip.open("rad_vocabulary.pickle.gz", "wb") as f0:
    pickle.dump(word_map, f0)