Basic monroe engine:
- Takes raw text
- Tokenises
- Stems words using Porter Stemmer (removes 'ing', 's'...
- Sees how many match the (stemmed) most common 1000 words
- Returns a float percentage score

List of common words from http://www.ef.co.uk/english-resources/english-vocabulary/top-1000-words/

In [155]:
# Must run this cell first - downloads the specific nltk packages needed (the ones that don't get downloaded with pip)
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Emma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [186]:
import re
import nltk
    
def tokenise(text):
    '''
    Takes raw text and returns a list of tokens
    
    At present, tokens are:
    - 2+ characters long
    - Alphabetic (numbers are automatically excluded)
    '''
    text = text.encode('ascii', 'ignore').decode('utf-8')
    tokens = re.findall("[A-Za-z]{2,}",text)
    tokens = [t for t in tokens]
    return tokens

def tag_proper_nouns(text):
    '''
    Takes raw text, tags it using a part-of-speech tagger and returns a list of the proper nouns
    
    Will be slow for large pieces of text!
    
    Requires averaged_perceptron_tagger from nltk (not all nltk packages are downloaded via pip)
    can get this package by using nltk.download() in python shell
    '''
    proper_nouns = set()
    sentences = nltk.tokenize.sent_tokenize(text)
    for sent in sentences:
        tagged_sent = nltk.tag.pos_tag(tokenise(sent))
        proper_nouns.update([word for word, pos in tagged_sent if pos == 'NNP'])
    return proper_nouns

def lowercase(tokens):
    '''
    Takes a list of tokens, makes them all lowercase and returns them
    '''
    return [t.lower() for t in tokens]

def stem(tokens):
    '''
    Takes a list of tokens, applies a stemming algorithm (returns standardised forms of words - removes
    'ing', 's'...) and returns a list of stemmed words
    
    At present:
    - We use the Porter stemmer (least aggressive form of stemming - alternates are snowball and lancaster)
    '''
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(t) for t in tokens]

def get_common():  
    '''
    Opens the text file containing the list of 1000 most common words found at
    http://www.ef.co.uk/english-resources/english-vocabulary/top-1000-words/
    removes the newlines and returns them as a list.
    '''
    text = []
    with open('1000common.txt', 'r') as f:
        for line in f:
            if line.endswith('\n'):
                text.append(line[0:-1])
            else:
                text.append(line)
    return text

def read_file(filename):
    '''
    Open a file, read text, return a string
    '''
    text = ''
    with open(filename, 'r') as f:
        for line in f:
            if line.endswith('\n'):
                text+=line[0:-1]
            else:
                text+=line
    return text


def munroe_score(text, exclusions='', verbose=True):
    '''
    Takes raw text, tokenises and stems it, and compares the stems to the set of the stemmed 1000 most common words
    Returns the percentage of words that were in the list of common words
    
    e.g. if output is 0.61, 61% of words were in the list of the 1000 most common. 
    '''
    
    # Process exclusions
    if exclusions != '':
        exclusions = lowercase(re.findall('\w+', exclusions))
    else:
        exclusions = []
    
    # Find all words - alphanumeric strings not separated by punctuation of 1+ length
    words = re.findall('\w+', text)
    
    # Keep a record of how we tagged each item
    tags = ['' for w in words]
    
    # Identify proper nouns
    proper_nouns = tag_proper_nouns(text)
    
    # Tokenise and stem the words. Mark proper nouns and non-alphabetic words in the tag list.
    tokens = []
    for i, word in enumerate(words):
        # Check if the word is a proper noun. If it is, mark it and put an empty string in the list of tokens
        if word in proper_nouns:
            tokens.append('')
            tags[i] = 'proper noun' 
        elif word.lower() in exclusions:
            tokens.append('')
            tags[i] = 'excluded'             
        else:
            token = tokenise(word)
            # If there is more than one token, it means the word was broken by a number, In this case, ignore it
            # If there are no tokens, it means that there were no alphabetic characters in the token. Ignore it
            if len(token) != 1:
                tokens.append('')
                tags[i] = 'not alphabetic'
            else:
                # Stem the word
                tokens.append(stem(lowercase(token))[0])
            
    # Get the most common 1000 w~ords from the file
    common = get_common()
    
    # Stem the words so that they match the form of our tokens
    stemmed_common = set(stem(common))
    
    # Count the number of tokens that are in the most common 1000 words
    munroe = 0
    for i, t in enumerate(tokens):
        if t != '':
            if t in stemmed_common:
                munroe+=1
                tags[i] = 'common'
            else:
                tags[i] = 'not common'
    
    
    # If verbose, return some printed output
    if verbose:
        print('You have '+ str(len(stems)) + ' words in your document')
        print('Of these, '+str(munroe)+' are in the most common 1000 words!')
        print('Score: '+str(100*munroe/len([t for t in tokens if t != '']))+'%')
        
    return_dict = {
        'score': munroe/len([t for t in tokens if t != '']),
        'tagged_words': list(zip(words,tags))
    }
    return return_dict



In [187]:
text= """The bioethics of human embryonic stem cell research (hESR) is controversial, including in Asia. After the 2001 US-moratorium on the federal funding of hESR, some Asian countries jumped into the 'bioethical vacuum', claiming that Asian countries do not suffer from Western religious scruples about using human embryos in research. Nevertheless, controversies around the donation of oocytes, the trade and barter of embryos, stem cell research trials, and human embryonic cloning in Asia have attracted global media attention. International guidelines are being adopted into diverging economic, political and socio-cultural contexts in Asia.


This comparative research asks on what basis these guidelines are adopted in a socialist developing country such as China (PRC) and in a wealthy, democratic bureaucracy such as Japan. It investigates the formulation and implementation of regulations by visiting laboratories and clinics, interviewing donors of embryos and oocytes, observing scientists that handle the ‘materials’ and analysing public debates. Studying how bioethics guidelines created by governments, medical associations and private companies impact research and international research cooperation, the research expects to provide insights into how scientists, publics and governments deal with regulatory and bioethical problems in very different economic, political and cultural contexts."""
print(text)

The bioethics of human embryonic stem cell research (hESR) is controversial, including in Asia. After the 2001 US-moratorium on the federal funding of hESR, some Asian countries jumped into the 'bioethical vacuum', claiming that Asian countries do not suffer from Western religious scruples about using human embryos in research. Nevertheless, controversies around the donation of oocytes, the trade and barter of embryos, stem cell research trials, and human embryonic cloning in Asia have attracted global media attention. International guidelines are being adopted into diverging economic, political and socio-cultural contexts in Asia.


This comparative research asks on what basis these guidelines are adopted in a socialist developing country such as China (PRC) and in a wealthy, democratic bureaucracy such as Japan. It investigates the formulation and implementation of regulations by visiting laboratories and clinics, interviewing donors of embryos and oocytes, observing scientists that 

In [191]:
score_dict = munroe_score(text, exclusions='investigates, Socialist, federal', verbose=True)

You have 184 words in your document
Of these, 124 are in the most common 1000 words!
Score: 68.50828729281768%


In [53]:
import os 
tests = os.listdir('../Tests')
for test in tests:
    print(test)
    test_text = read_file(test)
    munroe_score(test_text, verbose=True)
    print('-'*50)
    

In [192]:
score_dict['tagged_words']

[('The', 'common'),
 ('bioethics', 'not common'),
 ('of', 'common'),
 ('human', 'common'),
 ('embryonic', 'not common'),
 ('stem', 'not common'),
 ('cell', 'common'),
 ('research', 'common'),
 ('hESR', 'not common'),
 ('is', 'not common'),
 ('controversial', 'not common'),
 ('including', 'common'),
 ('in', 'common'),
 ('Asia', 'proper noun'),
 ('After', 'common'),
 ('the', 'common'),
 ('2001', 'not alphabetic'),
 ('US', 'proper noun'),
 ('moratorium', 'not common'),
 ('on', 'common'),
 ('the', 'common'),
 ('federal', 'excluded'),
 ('funding', 'common'),
 ('of', 'common'),
 ('hESR', 'not common'),
 ('some', 'common'),
 ('Asian', 'not common'),
 ('countries', 'common'),
 ('jumped', 'not common'),
 ('into', 'common'),
 ('the', 'common'),
 ('bioethical', 'not common'),
 ('vacuum', 'not common'),
 ('claiming', 'common'),
 ('that', 'common'),
 ('Asian', 'not common'),
 ('countries', 'common'),
 ('do', 'common'),
 ('not', 'common'),
 ('suffer', 'common'),
 ('from', 'common'),
 ('Western', 'co