## Read documents and print their contents
* Read a set of documents whose filenames have the form `doc<number>.txt` (here `doc0.txt` to `doc3.txt`)

In [25]:
# Print the lower-cased contents of doc0.txt ... doc3.txt, one document at a time.
for i in range(4):
    filename = "doc" + str(i) + ".txt"

    # The context manager closes the file even if reading or decoding fails.
    with open(filename, "r", encoding='UTF-8') as doc:
        text_string = doc.read().lower()

    # The extra '\n' leaves a blank line between consecutive documents.
    print(text_string + '\n')

background: a group of mirnas can regulate a biological process by targeting genes involved in the process. the unbiased mirna functional enrichment analysis is the most precise in silico approach to predict the biological processes that may be regulated by a given mirna group. however, it is computationally intensive and significantly more expensive than its alternatives.
results: we introduce bufet, a new approach to significantly reduce the time required for the execution of the unbiased mirna functional enrichment analysis. it derives its strength from the utilization of efficient bitset-based methods and parallel computation techniques.
conclusions: bufet outperforms the state-of-the-art implementation, in regard to computational efficiency, in all scenarios (both single- and multi-core), being, in some cases, more than one order of magnitude faster.

diana-tarbase v8 (http://www.microrna.gr/tarbase) is a reference database devoted to the indexing of experimentally supported micro

## Read documents and identify words
* Regular expressions: https://docs.python.org/3/library/re.html
* https://www.w3schools.com/python/python_regex.asp

In [27]:
import re

def word_break(text_br):
    """Return the word-like tokens found in ``text_br``.

    A token is a run of 5 to 20 characters drawn from lowercase ``a``-``z``,
    the apostrophe and the hyphen, with a regex word boundary on each side.
    Uppercase letters are not part of the character class, so callers that
    want case-insensitive matching must lower-case the text first.

    Parameters
    ----------
    text_br : str
        The text to scan.

    Returns
    -------
    list of str
        The matched tokens, in order of appearance (duplicates included).
    """
    token_pattern = re.compile(r"\b[a-z\'\-]{5,20}\b")
    return token_pattern.findall(text_br)

In [32]:
# Quick sanity check of word_break: in this sentence only "regular" and
# "expression" reach the 5-character minimum, so only they are returned.
w = word_break("this is a regular expression test")
print(w)

['regular', 'expression']


In [33]:
# Print the list of words extracted from each of doc0.txt ... doc3.txt.
for i in range(4):
    filename = "doc" + str(i) + ".txt"

    # The context manager closes the file even if reading or decoding fails.
    with open(filename, "r", encoding='utf-8') as doc:
        text_string = doc.read().lower()

    # word_break only matches lowercase letters, hence the lower() above.
    match_word = word_break(text_string)

    print(match_word)
    print()



['background', 'group', 'mirnas', 'regulate', 'biological', 'process', 'targeting', 'genes', 'involved', 'process', 'unbiased', 'mirna', 'functional', 'enrichment', 'analysis', 'precise', 'silico', 'approach', 'predict', 'biological', 'processes', 'regulated', 'given', 'mirna', 'group', 'however', 'computationally', 'intensive', 'significantly', 'expensive', 'alternatives', 'results', 'introduce', 'bufet', 'approach', 'significantly', 'reduce', 'required', 'execution', 'unbiased', 'mirna', 'functional', 'enrichment', 'analysis', 'derives', 'strength', 'utilization', 'efficient', 'bitset-based', 'methods', 'parallel', 'computation', 'techniques', 'conclusions', 'bufet', 'outperforms', 'state-of-the-art', 'implementation', 'regard', 'computational', 'efficiency', 'scenarios', 'single', 'multi-core', 'being', 'cases', 'order', 'magnitude', 'faster']

['diana-tarbase', 'microrna', 'tarbase', 'reference', 'database', 'devoted', 'indexing', 'experimentally', 'supported', 'microrna', 'mirna',

## Calculate the number of appearances (total_word_freq) of each word in all documents 

In [34]:
# Total number of appearances of each word across all documents.
# Key: word, value: number of times the word appears in doc0.txt ... doc3.txt combined
total_word_freq = {}

for i in range(4):
    filename = "doc" + str(i) + ".txt"

    # Read with an explicit UTF-8 encoding, like the other cells, so the counts
    # do not depend on the platform default encoding. The context manager
    # guarantees the file is closed once it has been read.
    with open(filename, "r", encoding='utf-8') as doc:
        text_string = doc.read().lower()

    # match_word is a list containing every word found in the current document
    match_word = word_break(text_string)

    for word in match_word:
        # get() supplies 0 the first time a word is seen
        total_word_freq[word] = total_word_freq.get(word, 0) + 1

print(total_word_freq)

{'background': 1, 'group': 2, 'mirnas': 9, 'regulate': 1, 'biological': 2, 'process': 2, 'targeting': 1, 'genes': 2, 'involved': 1, 'unbiased': 3, 'mirna': 11, 'functional': 10, 'enrichment': 7, 'analysis': 4, 'precise': 1, 'silico': 3, 'approach': 3, 'predict': 2, 'processes': 1, 'regulated': 1, 'given': 2, 'however': 1, 'computationally': 1, 'intensive': 1, 'significantly': 6, 'expensive': 1, 'alternatives': 1, 'results': 3, 'introduce': 1, 'bufet': 2, 'reduce': 1, 'required': 1, 'execution': 1, 'derives': 1, 'strength': 1, 'utilization': 1, 'efficient': 1, 'bitset-based': 1, 'methods': 1, 'parallel': 1, 'computation': 1, 'techniques': 1, 'conclusions': 1, 'outperforms': 1, 'state-of-the-art': 1, 'implementation': 1, 'regard': 1, 'computational': 1, 'efficiency': 1, 'scenarios': 1, 'single': 1, 'multi-core': 1, 'being': 1, 'cases': 1, 'order': 1, 'magnitude': 1, 'faster': 1, 'diana-tarbase': 2, 'microrna': 3, 'tarbase': 2, 'reference': 1, 'database': 4, 'devoted': 1, 'indexing': 2, '

## For each word, calculate the frequency of occurrence in each document.

In [35]:
# Per-document frequency of every word.
# Key: word, value: [(document1,frequency1),(document2,frequency2),...]
word_freq = {}

for i in range(4):
    filename = "doc" + str(i) + ".txt"

    # The context manager closes the file even if reading or decoding fails.
    with open(filename, "r", encoding='utf-8') as doc:
        text_string = doc.read().lower()

    # Frequencies of each word in this specific file.
    # Key: word, value: frequency of the word in the current document
    file_word_freq = {}

    # get all words from file i
    match_word = word_break(text_string)

    for word in match_word:
        # get() supplies 0 the first time a word is seen in this document
        file_word_freq[word] = file_word_freq.get(word, 0) + 1

    # For each word of the current document, add the respective document and
    # frequency to the global dictionary. setdefault() creates the list the
    # first time a word is encountered in any document.
    for word in file_word_freq:
        word_freq.setdefault(word, []).append((i, file_word_freq[word]))

print(word_freq)

{'background': [(0, 1)], 'group': [(0, 2)], 'mirnas': [(0, 1), (2, 3), (3, 5)], 'regulate': [(0, 1)], 'biological': [(0, 2)], 'process': [(0, 2)], 'targeting': [(0, 1)], 'genes': [(0, 1), (3, 1)], 'involved': [(0, 1)], 'unbiased': [(0, 2), (2, 1)], 'mirna': [(0, 3), (1, 3), (2, 2), (3, 3)], 'functional': [(0, 2), (2, 2), (3, 6)], 'enrichment': [(0, 2), (3, 5)], 'analysis': [(0, 2), (3, 2)], 'precise': [(0, 1)], 'silico': [(0, 1), (2, 2)], 'approach': [(0, 2), (3, 1)], 'predict': [(0, 1), (3, 1)], 'processes': [(0, 1)], 'regulated': [(0, 1)], 'given': [(0, 1), (3, 1)], 'however': [(0, 1)], 'computationally': [(0, 1)], 'intensive': [(0, 1)], 'significantly': [(0, 2), (2, 2), (3, 2)], 'expensive': [(0, 1)], 'alternatives': [(0, 1)], 'results': [(0, 1), (3, 2)], 'introduce': [(0, 1)], 'bufet': [(0, 2)], 'reduce': [(0, 1)], 'required': [(0, 1)], 'execution': [(0, 1)], 'derives': [(0, 1)], 'strength': [(0, 1)], 'utilization': [(0, 1)], 'efficient': [(0, 1)], 'bitset-based': [(0, 1)], 'method