In [None]:
using MySQL
using DataFrames
using TextAnalysis
# Open the MySQL connection used by the queries below.
# NOTE(review): arguments are presumably (host, user, password, database) per MySQL.jl;
# the credentials are hard-coded here — confirm this is acceptable outside a sandbox.
conn = mysql_connect("mysql", "jovyan", "jovyan", "mimic2")

### Use DataFrames and SQL to create a data frame with the following:
* subject_id
* hospital admission id
* text of the radiology report
* Limit the number of reports to 10000

In [None]:
# Fetch up to 10000 distinct (subject_id, hadm_id, text) rows for radiology reports
# from the noteevents table, then preview the first rows.
rad_data = mysql_execute(conn, """SELECT DISTINCT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000""")
head(rad_data)

In [None]:
size(rad_data)

### Write a function that returns the impression section of a report

#### Hints

* Not every report will have an impression section
* "INTERPRETATION" and "CONCLUSIONS" might be synonyms for "IMPRESSION"

In [None]:
"""
    get_impression(report)

Return the impression section of `report`.

The report is searched, in order, for the headers `"IMPRESSION:"`,
`"INTERPRETATION:"` and `"CONCLUSIONS:"`. For the first header that is present,
everything after its first occurrence is returned. If none of the headers occurs,
the whole report is returned unchanged.
"""
function get_impression(report)
    headers = ("IMPRESSION:", "INTERPRETATION:", "CONCLUSIONS:")
    for h in headers
        if contains(report, h)
            # limit=2 splits only at the first occurrence, so text that follows a
            # repeated header is kept instead of being silently dropped.
            return split(report, h; limit=2)[2]
        end
    end
    return report
end
        


In [None]:
println(rad_data[:text][21])

#### In how many reports did we find an impression section?

In [None]:
"""
    count_impression(report)

Return `true` when `report` contains at least one of the section headers
`"IMPRESSION:"`, `"INTERPRETATION:"` or `"CONCLUSIONS:"`, and `false` otherwise.
"""
function count_impression(report)
    markers = ("IMPRESSION:", "INTERPRETATION:", "CONCLUSIONS:")
    return any(marker -> contains(report, marker), markers)
end


In [None]:
# Number of reports in which at least one impression-style header was found.
count(count_impression, rad_data[:text])

### Define Regular expressions for data cleansing

* Write a regular expression to replace dates in the reports with ``[**DATE**]``
* Write a regular expression to replace times in the reports with ``[**TIME**]``
* Write a regular expression to replace digits with "d" (e.g. "43 cc" would become "dd cc")

#### Hints

* Look at some sample reports to see what dates and times look like in the reports
* What order would you need to apply the regular expressions?

In [None]:
println(rad_data[1,:text])

In [None]:
# Print the first ten report texts, each followed by a 42-asterisk rule and a blank line.
for row in 1:10
    println(rad_data[row, :text])
    println("*"^42)
    println()
end

In [None]:
# Case-insensitive pattern for a timestamp header of the form
# "DATE: [**YYYY-M-D**] H:MM AM|PM": a 4-digit year, then 1-2 digits each for
# month, day, hour and minute. The outer group captures the whole header.
date = r"""(DATE: \[\*\*\d{4,4}-\d{1,2}-\d{1,2}\*\*\] \d{1,2}:\d{1,2} (AM|PM))"""i

In [None]:
println(rad_data[1,:text])

In [None]:
matchall(date, rad_data[1,:text])

In [None]:
# Matches a single decimal digit; used below to replace every digit with "d".
rd = r"""\d"""

In [None]:
println(replace(rad_data[1, :text], rd, "d"))

In [None]:
# Add an :impression column holding the impression section extracted from each report.
rad_data[:impression] = map(get_impression, rad_data[:text]);

In [None]:
head(rad_data)

### How many unique words occur in the corpus?

#### Hint

1. Use TextAnalysis (the Julia counterpart to Python's TextBlob used in the original exercise)
1. Put all the reports into a single string

In [None]:
# Join all impression sections into one string, tokenize it with TextAnalysis and
# keep the distinct tokens. (The previous cell body was Python/TextBlob syntax and
# could not run in this Julia notebook.)
unique_impression_words = Set(tokens(StringDocument(join(rad_data[:impression], " "))))
length(unique_impression_words)

In [None]:
using Languages

In [None]:
# Print each English stop word on its own line.
foreach(println, stopwords(EnglishLanguage))

In [None]:
indefinite_articles(EnglishLanguage)

In [None]:
prepositions(EnglishLanguage)

## Create a single string with all the reports

#### Hints, etc.
* Use List Comprehension
* Use string joins
* Iterate over the rows of the data frame

In [None]:
# Concatenate every impression section into one space-separated string
# (the trailing semicolon suppresses the notebook output).
reports_txt = join(rad_data[:impression], " ");

### Define a vector space for the radiology corpus

#### Hints

1. How would you build a corpus from words only occurring more than N times?

### Write a function to drop [stop words](https://en.wikipedia.org/wiki/Stop_words) from a report

* In Julia, the Languages package has stop words defined (``stopwords(EnglishLanguage)``); in Python, the gensim package provides them (``from gensim.parsing.preprocessing import STOPWORDS``)

#### Hints
1. Do you agree with dropping all the stop words?
1. How could we create a new stopwords frozen set absent the terms we want to keep (double negative?)
1. You could use a regular expressions substitution or token the report first and operate on the list of words.

In [None]:
"""
    remove_stop(txt)

Wrap `txt` in a `StringDocument`, apply `remove_stop_words!` to it in place and
return the resulting document (call `text` on it to get the string back).
"""
function remove_stop(txt)
    doc = StringDocument(txt)
    remove_stop_words!(doc)
    return doc
end

In [None]:
text(remove_stop("hello in there"))

In [None]:
# Build the stop-word-free column from the extracted impression sections, not from
# the full report text, so it matches its name and the side-by-side comparison below.
# String(x) materializes the SubString returned by get_impression before it is
# wrapped in a StringDocument.
rad_data[:impression_no_stops] = map(x -> text(remove_stop(String(x))), rad_data[:impression]);

In [None]:
println(rad_data[1,:impression])

In [None]:
println(rad_data[1,:impression_no_stops])

In [None]:
"""
    get_words(txt)

Wrap `txt` in a `StringDocument`, strip punctuation from it with
`remove_punctuation!` and return the document's tokens.
"""
function get_words(txt)
    doc = StringDocument(txt)
    remove_punctuation!(doc)
    words = tokens(doc)
    return words
end

In [None]:
# Tokenize the first stop-word-free entry as a quick check of get_words.
# (The original line had an unmatched ")" and did not parse.)
get_words(rad_data[1,:impression_no_stops])

In [None]:
# Vocabulary of the stop-word-free column: join the entries, lowercase, replace
# every digit with 'd', tokenize and keep the distinct tokens.
# The entries are joined with a space so that the last word of one report is not
# fused with the first word of the next one.
unique_impression_words = Set(get_words(replace(lowercase(join(rad_data[:impression_no_stops], " ")),rd,'d')))


In [None]:
length(unique_impression_words)

In [None]:
# Map each distinct word to an integer index 1..N, where N is the vocabulary size.
# Indices follow the iteration order of the Set (output suppressed by the semicolon).
word_map = Dict(zip(unique_impression_words,1:length(unique_impression_words)));

In [None]:
word_map["brain"]

In [None]:
# Build a TokenDocument from a short two-sentence sample and show its tokens.
td = TokenDocument("This is sentence 1. This is sentence 2, isn't it?")
tokens(td)

In [None]:
# Build an NGramDocument from all impression sections joined with spaces.
ngblob = NGramDocument(join(rad_data[:impression], " "))

In [None]:
ngrams(ngblob, 2)

In [None]:
# Wrap the space-joined impression sections in a StringDocument and pass it to `sentences`.
# NOTE(review): `sentences` is not defined in this notebook — presumably provided by
# TextAnalysis; confirm it exists in the installed version and what it returns.
sblob = sentences(StringDocument(join(rad_data[:impression], " ")))

In [None]:
ngrams(sblob,2)