In [6]:
from nltk import sent_tokenize, TreebankWordTokenizer, ngrams, WhitespaceTokenizer
from itertools import accumulate, tee, chain
from collections import Counter
import itertools
from datetime import datetime, date, time
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict, OrderedDict
import os, os.path
import re
import string
from decimal import Decimal
from decimal import *
getcontext().prec = 6
import statistics
from IPython.display import display, Markdown, Latex
import pymongo
from pymongo import MongoClient
from tqdm import tqdm_notebook

# Overall Settings

In [11]:
# Notebook-wide configuration read by the cells below.
tokenization_language = 'portuguese'  # language handed to NLTK's sentence tokenizer; check NLTK for the available languages
non_latin_alphabet = False # if your language uses a non-Latin alphabet, change to True
files_dir_or_wiki = 'wiki' # change to 'list' if you have a list of filenames
                           # or to 'dir' if you have a specific directory
token_database = 'portuguese-tokens' # name used for both the MongoDB database and its token collection
exclude_numbers = True # True / False -- drop numeric tokens from the 1-gram counts
exclude_numbers_for_ngrams = False # True / False -- drop numeric tokens before building longer n-grams
number_of_ngrams = 1 # choose up to x number of n-grams to extract (minimum = 1)
threads = 4 # size of the thread pool used for DP lookups; decrease on an old/low core processor (minimum = 1)
chosen_encoding = 'utf-8-sig' # encoding for reading texts and writing exports; utf-8-sig is preferred over utf-8 here

In [12]:
# n-gram sizes to extract: just [1] when only single tokens are requested,
# otherwise every size from 1 up to and including number_of_ngrams.
ngramrange = [1] if number_of_ngrams <= 1 else range(1, number_of_ngrams + 1)

In [13]:
# Pick the word tokenizer used on every sentence (see gather_tokens).
if tokenization_language == 'japanese':
    # tinysegmenter is only imported when Japanese is selected, so it is an
    # optional dependency for every other language.
    import tinysegmenter
    tokenizer = tinysegmenter.TinySegmenter()
elif non_latin_alphabet == True:
    # Other non-Latin alphabets: split on whitespace only.
    tokenizer = WhitespaceTokenizer()
else:
    # Default for Latin-alphabet languages.
    tokenizer = TreebankWordTokenizer()

In [14]:
# Token statistics
# Connect to MongoDB using the client's default connection settings.
# The database and the collection inside it share the name `token_database`.
client = MongoClient()
db = client[token_database]

token_stats = db[token_database]
# NOTE(review): this looks like a plain attribute assignment on the Python
# collection object; allowDiskUse is normally passed per aggregate() call --
# confirm this line has any effect.
token_stats.allowDiskUse=True

# Corpus-wide statistics are stored in the document flagged '_text-stats': True.
# If no such document exists, find_one() yields None and the subscript below
# raises TypeError.
text_stats = token_stats.find_one({'_text-stats': True})['text_stats']

# Prerequisite functions

In [15]:
def is_number_repl_isdigit(s):
    '''Tell whether a string is a number once separators are ignored.

    Every dot and comma is removed from ``s``; the result is True only
    if what remains is non-empty and consists solely of digits
    (so '1.234,56' is a number, while 'abc', '-5' and '.' are not).'''

    without_separators = s.replace('.', '').replace(',', '')
    return without_separators.isdigit()

In [16]:
if non_latin_alphabet == False:

    def cleantokensfromsentence(tokens, bar_numbers):
        '''Filter a token list for Latin-alphabet text.

        Numeric tokens (as judged by is_number_repl_isdigit) are kept
        only when bar_numbers is False. Every other token is kept as
        long as it contains at least one alphanumeric character, which
        discards pure punctuation and empty strings.

        Returns a new list; the input list is left untouched.'''

        kept = []

        for tok in tokens:

            if is_number_repl_isdigit(tok) == True:
                # Numbers survive only when the caller explicitly allows them
                if bar_numbers == False:
                    kept.append(tok)
                continue

            # A token made up solely of non-alphanumeric characters is noise
            if any(ch.isalnum() for ch in tok):
                kept.append(tok)

        return kept

In [17]:
if non_latin_alphabet == True:
    
    def cleantokensfromsentence(tokens, bar_numbers):
        '''Filter a token list for non-Latin-alphabet text.

        Only numerals are considered: a numeric token (as judged by
        is_number_repl_isdigit) is kept when bar_numbers is False and
        dropped otherwise; all other tokens pass through unchanged.'''

        return [tok for tok in tokens
                if not is_number_repl_isdigit(tok) == True or bar_numbers == False]

In [18]:
def getmultitokens(sentence, n_of_ngrams):
    '''Build every n-gram of size 2..n_of_ngrams from a list of tokens.

    Each n-gram is returned as a single string with its tokens joined
    by one space; shorter n-grams come before longer ones.'''

    joined = []

    for size in range(2, n_of_ngrams + 1):
        joined += [' '.join(gram) for gram in ngrams(sentence, size)]

    return joined

In [19]:
def opentextfile(text_filename):
    '''Read a whole text file and return its contents as one string.

    The file is decoded with the notebook-wide chosen_encoding.'''
    with open(text_filename, mode='r', encoding=chosen_encoding) as source:
        contents = source.read()
    return contents

In [20]:
def calc_dp(token):
    '''Return the DP value of a token, computing and caching it if needed.

    The token's document is fetched once from the token_stats collection:

    * no document -> the token is unknown and 1.0 is returned;
    * document already has a 'dp' field -> that stored value is returned;
    * otherwise DP is computed as half the sum, over every file listed in
      text_stats['total']['<len>-grams'], of the absolute difference
      between the file's expected share ('relfreq') and the share of the
      token's occurrences observed in that file. The result is written
      back to the document and returned as a float.

    A TypeError raised while computing (e.g. a malformed document) is
    treated like an unknown token and yields 1.0, as before. A missing
    field in the document or in text_stats still raises KeyError.
    '''

    document = token_stats.find_one({'token': token})

    if document is None:
        # Token not found in DB
        return float(1)

    if 'dp' in document:
        # DP has already been calculated
        return document['dp']

    try:
        token_length = document['len']
        token_freq = Decimal(document['freq'])

        # Share of the token's total occurrences that falls in each file.
        # Decimal is kept so results follow the notebook's decimal context.
        observed = {file_id: Decimal(freq) / token_freq
                    for file_id, freq in zip(document['occurred_in'],
                                             document['freq_occurred_in'])}

        expected = text_stats['total'][str(token_length) + '-grams']

        differences = float(0)

        for file_id in expected:
            expected_percentage = abs(float(expected[file_id]['relfreq']))
            # Files where the token never occurs contribute an observed share of 0
            observed_percentage = abs(float(observed.get(file_id, float(0))))
            differences += abs(expected_percentage - observed_percentage)

        dp = float(Decimal(differences) / Decimal(2))

        # Cache the value so the next lookup is a plain read
        token_stats.update_one({'token': token},
                               {'$set': {'dp': dp}})

        return dp

    except TypeError:
        # Document could not be processed; treat the token as unknown
        return float(1)

# The Text Difficulty Analyzer

In [21]:
def gather_tokens(text_as_string):
    '''Count the tokens and n-grams that occur in a text.

    The text is lowercased, split into sentences and tokenized with the
    notebook-wide tokenizer. Cleaned tokens feed the '1-grams' bucket;
    when more than 1-grams are configured, longer n-grams are built per
    sentence and filed under '<n>-grams'.

    Returns a dict mapping each '<n>-grams' label to a plain dict of
    {token: occurrences in the text}.'''

    buckets = {str(size) + '-grams': [] for size in ngramrange}
    wants_longer_ngrams = ngramrange != [1]

    lowered = text_as_string.lower()

    for sentence in sent_tokenize(lowered, language=tokenization_language):

        raw_tokens = tokenizer.tokenize(sentence)

        buckets['1-grams'] += cleantokensfromsentence(raw_tokens, exclude_numbers)

        if not wants_longer_ngrams:
            continue

        # n-grams have their own number-exclusion setting
        ngram_source = cleantokensfromsentence(raw_tokens, exclude_numbers_for_ngrams)

        for phrase in getmultitokens(ngram_source, number_of_ngrams):
            # The n-gram size is recovered from the number of space-separated parts
            buckets[str(len(phrase.split(' '))) + '-grams'].append(phrase)

    # Collapse each list of occurrences into {token: count}
    return {label: dict(Counter(found)) for label, found in buckets.items()}

In [22]:
def get_dp(counter):
    '''Look up the DP value for a single-entry {token: text frequency} dict.

    Only the first entry of ``counter`` is used. Returns a dict with the
    keys 'token', 'dp' (from calc_dp) and 'textfreq'.'''

    token, textfreq = list(counter.items())[0]

    return {'token': token,
            'dp': calc_dp(token),
            'textfreq': textfreq}

In [23]:
def retrieve_dp_values(gathered_tokens):
    '''Attach a DP value to every token counted by gather_tokens.

    For each '<n>-grams' bucket the tokens are looked up concurrently
    with get_dp on a pool of ``threads`` worker threads, and each
    ``{token: count}`` entry is replaced in place by
    ``{token: {'textfreq': count, 'dp': value}}``.

    Returns the same (mutated) gathered_tokens dict.'''

    for ngram_len, counter_dict in gathered_tokens.items():

        counter_list = [{token: freq} for token, freq in counter_dict.items()]

        pool = ThreadPool(threads)
        try:
            # imap yields results in input order as they complete, so the
            # progress bar advances during the lookups instead of wrapping
            # an already finished list.
            results = list(tqdm_notebook(pool.imap(get_dp, counter_list),
                                         total=len(counter_list)))
        finally:
            # Always release the worker threads, even if a lookup failed
            pool.close()
            pool.join()

        for result in results:
            counter_dict[result['token']] = {'textfreq': result['textfreq'],
                                             'dp': result['dp']}

    return gathered_tokens

In [24]:
def calc_tds(text_as_string):
    '''Compute per-token DP values and the overall difficulty score of a text.

    Tokens are gathered and their DP values retrieved; then, for every
    '<n>-grams' bucket holding at least two DP values, a '_stats_' entry
    is added with the median and mean of

    * 'total_dp_values'  -- each DP repeated once per occurrence in the text
    * 'unique_dp_values' -- each DP counted once per distinct token

    The result also gets a 'tds' key: the mean of the two 1-gram medians.
    When the 1-gram statistics could not be computed (fewer than two DP
    values, e.g. an empty or one-word text) 'tds' is None instead of the
    function failing with KeyError('_stats_').

    Returns the dict of buckets, extended with '_stats_' and 'tds'.'''

    retrieved_tokens = retrieve_dp_values(gather_tokens(text_as_string))

    one_gram_stats = None

    for ngram_len in retrieved_tokens:

        token_data = retrieved_tokens[ngram_len]

        dp_values = []
        unique_dp_values = []

        for token in token_data:

            entry = token_data[token]

            try:
                dp_value = float(entry['dp'])
                repeated = [dp_value] * entry['textfreq']
            except TypeError:
                # No usable DP for this token (not in the DB); leave it out
                continue

            unique_dp_values.append(dp_value)
            dp_values.extend(repeated)

        if len(dp_values) >= 2:
            stats = {'total_dp_values':
                     {'median': statistics.median(dp_values),
                      'mean': statistics.mean(dp_values)},

                     'unique_dp_values':
                     {'median': statistics.median(unique_dp_values),
                      'mean': statistics.mean(unique_dp_values)}}

            token_data['_stats_'] = stats

            if ngram_len == '1-grams':
                one_gram_stats = stats
        else:
            print(datetime.now(), 'statistics not computed')

    # The calculation of the TDS value below only considers 1-grams
    # since they're much more important than longer n-grams
    # but you can adapt it to include 2-grams and 3-grams

    if one_gram_stats is None:
        retrieved_tokens['tds'] = None
    else:
        median_total = one_gram_stats['total_dp_values']['median']
        median_unique = one_gram_stats['unique_dp_values']['median']
        retrieved_tokens['tds'] = statistics.mean([median_total, median_unique])

    return retrieved_tokens

## Difficulty Highlighter (HTML)

Exports a difficulty-highlighted version of the text as an HTML file.<br>
The file is written to the current working directory.

In [25]:
def difficulty_highlighter(text_filename, show_dp=False):
    '''Export a difficulty-highlighted HTML rendering of a text file.

    The text is read with opentextfile, scored with calc_tds, and every
    1-gram whose DP value falls in one of the difficulty ranges is wrapped
    in a styled <font> tag. The page also contains the overall difficulty
    value, a legend and a word list grouped by difficulty.

    Parameters:
        text_filename -- path of the text file to analyse.
        show_dp       -- when True, each highlighted word is followed by
                         its DP value (first five characters) as a
                         superscript. Defaults to False.

    The page is written to '<name>_highlighted.html' in the current
    working directory, where <name> is the input file name without its
    directory and extension. Returns None.'''

    # Imported locally so this cell does not rely on an edited import cell
    from html import escape

    parameter = 'markdown_colors'

    def as_html(fragment):
        '''Escape &, < and > so raw text cannot break the generated markup.'''
        return escape(fragment, quote=False)

    def token_pattern(token):
        '''Regex source matching one token literally as a whole word.

        A word boundary is only required on a side where the token itself
        starts/ends with a word character; \\b next to punctuation (as in
        "c++") would otherwise never match.'''
        pattern = re.escape(token)
        if re.match(r'\w', token[0]):
            pattern = r'\b' + pattern
        if re.match(r'\w', token[-1]):
            pattern = pattern + r'\b'
        return pattern

    text_as_string = opentextfile(text_filename)

    # Normalise curly quotes to their straight equivalents
    text_as_string = re.sub('[“”]', '"', text_as_string)
    text_as_string = re.sub('[‘’]', "'", text_as_string)

    print(datetime.now(), 'text file opened')

    retrieved_tokens = calc_tds(text_as_string)

    print(datetime.now(), 'tokens retrieved')

    tds = retrieved_tokens.pop('tds', None)

    # Only 1-grams are highlighted. Their '_stats_' entry is not a token,
    # and the other buckets are left alone so their '_stats_' entries
    # cannot be mistaken for tokens either.
    one_grams = retrieved_tokens['1-grams']
    one_grams.pop('_stats_', None)

    # Each range is (lower bound exclusive, upper bound inclusive)
    difficulty_ranges = OrderedDict([('very easy', [0, 0.7]),
                                     ('easy', [0.7, 0.80]),
                                     ('average', [0.80, 0.90]),
                                     ('slightly difficult', [0.90, 0.95]),
                                     ('difficult', [0.95, 0.99]),
                                     ('very difficult', [0.99, 1.00])])

    parameters = OrderedDict([
        ('html_colors', OrderedDict([
            ('very easy', 'color:LightGrey;'),
            ('easy', 'color:green;'),
            ('average', 'color:blue;'),
            ('slightly difficult', 'color:black;'),
            ('difficult', 'color:black;'),
            ('very difficult', 'color:Crimson;')])),

        ('markdown', OrderedDict([
            ('very easy', 'font-style: normal;'),
            ('easy', 'font-style: normal;'),
            # italics are a font-style, not a text-decoration
            ('average', 'font-style: italic;'),
            ('slightly difficult', 'color:black;'),
            ('difficult', 'font-weight: bold;'),
            ('very difficult', 'font-variant: small-caps;letter-spacing: 1.5px')])),

        ('markdown_colors', OrderedDict([
            ('very easy', 'font-style: normal;'),
            ('easy', 'font-style: italic;'),
            ('average', 'font-weight: 600;'),
            ('slightly difficult', 'font-weight: bold;color:navy;'),
            ('difficult', 'font-variant: small-caps;letter-spacing: 1.5px'),
            ('very difficult', 'font-variant: small-caps;font-weight:bold;letter-spacing: 1.5px;color:Crimson')])),
    ])

    styles = parameters[parameter]

    # Sort every token into its difficulty range
    token_ranges = {x: [] for x in difficulty_ranges}
    entries = []   # (token, difficulty, dp) for every token that gets highlighted

    for token, data in one_grams.items():

        if not token:
            # An empty token would match everywhere
            continue

        token_dp = data['dp']

        for difficulty, (lower, upper) in difficulty_ranges.items():
            if token_dp > lower and token_dp <= upper:
                entries.append((token, difficulty, token_dp))
                token_ranges[difficulty].append(token)
                break

    # Longest tokens first: the regex takes the first alternative that
    # matches, so a short token must not pre-empt a longer one that starts
    # with it (e.g. a hyphenated compound).
    entries.sort(key=lambda entry: len(entry[0]), reverse=True)

    colors = [styles[difficulty] for _, difficulty, _ in entries]
    dp_labels = [str(token_dp)[:5] for _, _, token_dp in entries]

    # One capturing group per token; m.lastindex then identifies the token
    regex = None
    if entries:
        regex = re.compile('|'.join('(' + token_pattern(token) + ')'
                                    for token, _, _ in entries), re.I)

    # Word list grouped by difficulty
    tokens_found = '<hr><h3>Wordlist:</h3><ol>'
    for token_range in token_ranges:
        tokens_found += "".join(["<li>",
                                 "<font style='font-size:small;%s'>" % styles[token_range],
                                 token_range.capitalize(),
                                 ":</font>",
                                 " <font style='font-size:small;font-style:italic;'>",
                                 ' | '.join(as_html(t) for t in sorted(token_ranges[token_range])),
                                 '</font></li>',
                                 ])
    tokens_found += '</ol>'

    # Page header
    output = "".join([
        '<!DOCTYPE html>',
        '<html>',
        '<head>',
        '<link href="https://fonts.googleapis.com/css?family=Vollkorn:400,400i,600,600i,700,700i,900,900i&amp;subset=latin-ext" rel="stylesheet">',
        '<style>',
        "body {font-family: 'Vollkorn', 'Georgia', serif;}",
        '</style>',
        '</head>',
        '<body>',
        '<h1>Text Difficulty Analyzer v. 1.00</h1>',
        '<h3><i>Difficulty Highlighter</i></h3>',
        'Filename: ',
    ])

    output += as_html(text_filename) + '<br>Text Difficulty Value: <b>' + str(tds)[:8] + '</b><hr>'

    legend = '<b>LEGEND:</b> '

    for x in styles:
        legend += "".join(["<font style='",
                           styles[x],
                           "'>",
                           x.capitalize(),
                           '</font> | '])

    output += legend + "<hr><h3><font style='font-variant:small-caps;letter-spacing: 1.5px'>Highlighted Text:</font></h3>"

    print(datetime.now(), 'initiating difficulty highlighter')

    # Walk the matches, copying the plain text in between unchanged
    pieces = []
    cursor = 0

    if regex is not None:
        for m in regex.finditer(text_as_string):

            index = m.lastindex - 1

            pieces.append(as_html(text_as_string[cursor:m.start()]))
            pieces.append("<font style='%s'>" % colors[index])
            pieces.append(as_html(m.group(0)))
            if show_dp:
                pieces.append("</font><font style='vertical-align:super;font-size:8pt;color:LightGrey'>%s" % dp_labels[index])
            pieces.append("</font>")

            cursor = m.end()

    # Whatever follows the last match (the whole text when nothing matched)
    pieces.append(as_html(text_as_string[cursor:]))

    page = "".join([output, "".join(pieces), tokens_found, "</body></html>"])

    page = page.replace('\n', '<br>')

    # Strip any directory (either separator style) and the extension
    base_name = re.split(r'[\\/]', text_filename)[-1]
    output_filename = os.path.splitext(base_name)[0] + '_highlighted.html'

    with open(output_filename, 'w', encoding=chosen_encoding) as o:
        o.write(page)

    print(datetime.now(), 'exported as file',output_filename, ' -- Process complete')

In [26]:
def difficulty_highlighter_latex(text_filename):
    '''Export a difficulty-highlighted XeLaTeX rendering of a text file.

    The text is read with opentextfile, scored with calc_tds, and every
    1-gram whose DP value falls in one of the difficulty ranges is wrapped
    in the matching LaTeX macro and followed by its DP value (first five
    characters) in a \\dpsuper superscript. The document also contains
    the overall difficulty value, a legend and a word list grouped by
    difficulty.

    The document is written to '<name>_highlighted.tex' in the current
    working directory, where <name> is the input file name without its
    directory and extension. Returns None.'''

    parameter = 'latex'

    # Characters with a special meaning in LaTeX and their printable form
    latex_specials = {'\\': r'\textbackslash{}',
                      '{': r'\{',
                      '}': r'\}',
                      '$': r'\$',
                      '&': r'\&',
                      '#': r'\#',
                      '%': r'\%',
                      '_': r'\_',
                      '~': r'\textasciitilde{}',
                      '^': r'\textasciicircum{}'}

    def as_latex(fragment):
        '''Escape LaTeX special characters, one character at a time.'''
        return ''.join(latex_specials.get(ch, ch) for ch in fragment)

    def token_pattern(token):
        '''Regex source matching one token literally as a whole word.

        A word boundary is only required on a side where the token itself
        starts/ends with a word character; \\b next to punctuation (as in
        "c++") would otherwise never match.'''
        pattern = re.escape(token)
        if re.match(r'\w', token[0]):
            pattern = r'\b' + pattern
        if re.match(r'\w', token[-1]):
            pattern = pattern + r'\b'
        return pattern

    text_as_string = opentextfile(text_filename)

    # Normalise curly quotes; double the newlines so each line is a paragraph
    text_as_string = re.sub('[“”]', '"', text_as_string)
    text_as_string = re.sub('[‘’]', "'", text_as_string)
    text_as_string = re.sub('\n', "\n\n", text_as_string)

    print(datetime.now(), 'text file opened')

    retrieved_tokens = calc_tds(text_as_string)

    print(datetime.now(), 'tokens retrieved')

    tds = retrieved_tokens.pop('tds', None)

    # Only 1-grams are highlighted. Their '_stats_' entry is not a token,
    # and the other buckets are left alone so their '_stats_' entries
    # cannot be mistaken for tokens either.
    one_grams = retrieved_tokens['1-grams']
    one_grams.pop('_stats_', None)

    # Each range is (lower bound exclusive, upper bound inclusive)
    difficulty_ranges = OrderedDict([('very easy', [0, 0.7]),
                                     ('easy', [0.7, 0.80]),
                                     ('average', [0.80, 0.90]),
                                     ('slightly difficult', [0.90, 0.95]),
                                     ('difficult', [0.95, 0.99]),
                                     ('very difficult', [0.99, 1.00])])

    # Opening part of the macro call used for each difficulty
    parameters = OrderedDict([
        ('latex', OrderedDict([
            ('very easy', '\\veryeasy{'),
            ('easy', '\\emph{'),
            ('average', '\\average{'),
            ('slightly difficult', '\\slightlydifficult{'),
            ('difficult', '\\difficult{'),
            ('very difficult', '\\verydifficult{')])),

        ('latex_black', OrderedDict([
            ('very easy', '\\textrm{'),
            ('easy', '\\emph{'),
            ('average', '\\underline{'),
            ('slightly difficult', '\\slightlydifficult{'),
            ('difficult', '\\difficult{'),
            ('very difficult', '\\verydifficult{')])),
    ])

    styles = parameters[parameter]

    # Sort every token into its difficulty range
    token_ranges = {x: [] for x in difficulty_ranges}
    entries = []   # (token, difficulty, dp) for every token that gets highlighted

    for token, data in one_grams.items():

        if not token:
            # An empty token would match everywhere
            continue

        token_dp = data['dp']

        for difficulty, (lower, upper) in difficulty_ranges.items():
            if token_dp > lower and token_dp <= upper:
                entries.append((token, difficulty, token_dp))
                token_ranges[difficulty].append(token)
                break

    # Longest tokens first: the regex takes the first alternative that
    # matches, so a short token must not pre-empt a longer one that starts
    # with it (e.g. a hyphenated compound).
    entries.sort(key=lambda entry: len(entry[0]), reverse=True)

    colors = [styles[difficulty] for _, difficulty, _ in entries]
    dp_labels = [str(token_dp)[:5] for _, _, token_dp in entries]

    # One capturing group per token; m.lastindex then identifies the token
    regex = None
    if entries:
        regex = re.compile('|'.join('(' + token_pattern(token) + ')'
                                    for token, _, _ in entries), re.I)

    # Word list grouped by difficulty
    tokens_found = '\n\n\\section{Wordlist}\n\n\\begin{itemize}'
    for token_range in token_ranges:
        tokens_found += "".join(["\n\\item ",
                                 "{%s" % styles[token_range],
                                 token_range.capitalize(),
                                 "}:",
                                 " {\\emph{",
                                 ' | '.join(as_latex(t) for t in sorted(token_ranges[token_range])),
                                 '}}}',
                                 ])
    tokens_found += '\\end{itemize}\n'

    # Preamble. Raw strings throughout, so backslashes reach the file as typed.
    output = r"""% !TEX TS-program = xelatex
    % !TEX encoding = UTF-8

    % This is a simple template for a XeLaTeX document using the "article" class,
    % with the fontspec package to easily select fonts.

    \documentclass[11pt]{article} % use larger type; default would be 10pt

    \usepackage{fontspec} % Font selection for XeLaTeX; see fontspec.pdf for documentation
    \defaultfontfeatures{Mapping=tex-text} % to support TeX conventions like ``---''
    \usepackage{xunicode} % Unicode support for LaTeX character names (accents, European chars, etc)
    \usepackage{xltxtra} % Extra customizations for XeLaTeX
    \usepackage[usenames, dvipsnames]{color}"""

    if parameter == 'latex_black':
        # Black-and-white variant: the accent colours collapse to black
        output += r"""
        \definecolor{crimson}{RGB}{0,0,0}
        \definecolor{navy}{RGB}{0,0,0}
        \definecolor{lightgray}{RGB}{192,192,192}
        """

    else:
        output += r"""
        \definecolor{crimson}{RGB}{255,0,64}
        \definecolor{navy}{RGB}{64,0,255}
        \definecolor{lightgray}{RGB}{192,192,192}
        """

    output += r"""
    
    \setmainfont{GaramondPremrPro-Med}[
    Extension = .otf,
    BoldFont={GaramondPremrPro-Bd},
    ItalicFont={GaramondPremrPro-MedIt},
    BoldItalicFont={GaramondPremrPro-BdIt},
    Numbers=OldStyle,
    ]
    
    
    \newfontfamily\semibold{GaramondPremrPro-Smbd}

    \newfontfamily\lightfont{GaramondPremrPro}

    \newcommand{\verydifficult}[1] {\begingroup\textsc{\textbf{\textcolor{crimson}{#1}}}\endgroup}

    \newcommand{\difficult}[1] {\begingroup
\lightfont{\textsc{#1}}\endgroup}

    \newcommand{\slightlydifficult}[1] {\begingroup \textcolor{navy}{\textbf{#1}}\endgroup}
    
    \newcommand{\average}[1]{\begingroup \semibold{#1}\endgroup}
    
    \newcommand{\veryeasy}[1]{{#1}}
    
    \newcommand{\dpsuper}[1] {
    \lightfont{\textcolor{lightgray}{\textsuperscript{#1}}}
    }
    

    % other LaTeX packages.....
    \usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
    \geometry{a4paper} % or letterpaper (US) or a5paper or....
    %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent

    \usepackage{graphicx} % support the \includegraphics command and options

    \title{Brief Article}
    \author{The Author}
    %\date{} % Activate to display a given date or no date (if empty),
             % otherwise the current date is printed 

    \begin{document}
    \maketitle
    """

    # Strip any directory (either separator style) from the displayed name
    base_name = re.split(r'[\\/]', text_filename)[-1]

    output += as_latex(base_name) + '\n\nText Difficulty Scale: \\textbf{'+ str(tds)[:8] + '}\n'

    legend = '\n\\textbf{LEGEND ---} '

    for x in styles:
        legend += "".join([styles[x],
                           x.capitalize(),
                           '} | '])

    output += "\n\\section{Highlighted Text}" + '\\bigskip \n\n' + legend + '\\bigskip \n\n'

    print(datetime.now(), 'initiating difficulty highlighter')

    # Walk the matches, copying the plain text in between unchanged
    pieces = []
    cursor = 0

    if regex is not None:
        for m in regex.finditer(text_as_string):

            index = m.lastindex - 1

            pieces.append(as_latex(text_as_string[cursor:m.start()]))
            pieces.append(colors[index])
            pieces.append(as_latex(m.group(0)))
            # Close the difficulty macro, then add the DP value as superscript
            pieces.append("}\\dpsuper{%s" % dp_labels[index])
            pieces.append("}")

            cursor = m.end()

    # Whatever follows the last match (the whole text when nothing matched)
    pieces.append(as_latex(text_as_string[cursor:]))

    document = "".join([output, "".join(pieces), tokens_found, "\\end{document}"])

    output_filename = os.path.splitext(base_name)[0] + '_highlighted.tex'

    with open(output_filename, 'w', encoding=chosen_encoding) as o:
        o.write(document)

    print(datetime.now(), 'writing to file', output_filename, ' -- Process complete')

In [27]:
# Example run: analyse 'bach.txt' and export its highlighted HTML version
# (the log below reports the file written as bach_highlighted.html).
difficulty_highlighter('bach.txt')

2018-09-10 15:07:40.386867 text file opened



2018-09-10 15:07:47.379357 tokens retrieved
2018-09-10 15:07:49.117736 initiating difficulty highlighter
2018-09-10 15:07:50.551873 exported as file bach_highlighted.html  -- Process complete
