# Corpus builder
Here, we will build a corpus using MongoDB (accessed through PyMongo) as the database.
Any missing libraries will be installed automatically.

You only need to run this notebook once.

In [1]:
from nltk import sent_tokenize, TreebankWordTokenizer, ngrams, WhitespaceTokenizer
from itertools import accumulate, tee, chain
from collections import Counter
import itertools
from datetime import datetime, date, time
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict, OrderedDict
import os, os.path
import re
import string
from decimal import Decimal
from decimal import *
getcontext().prec = 6
import statistics
from IPython.display import display, Markdown, Latex

In [2]:
# Install necessary libraries if they're not available
import sys

try:
    from tqdm import tqdm_notebook
except:
    !conda install --yes --prefix {sys.prefix} tqdm
    from tqdm import tqdm_notebook

try:
    import pymongo
    from pymongo import MongoClient
except:
    !conda install --yes --prefix {sys.prefix} pymongo
    import pymongo
    from pymongo import MongoClient

# Overall Settings

In [3]:
# Notebook-wide settings; the cells and functions below read these as globals.
tokenization_language = 'english'  # check the NLTK sentence tokenizers for available languages
non_latin_alphabet = False # if your language uses a non-Latin alphabet, change to True
files_dir_or_wiki = 'wiki' # change to 'list' if you have a list of filenames
                           # or to 'dir' if you have a specific directory
token_database = 'tokens' # used both as the MongoDB database name and as the collection name
exclude_numbers = True # True / False: drop numeric tokens when counting single tokens
exclude_numbers_for_ngrams = False # True / False: drop numeric tokens before n-grams are built
number_of_ngrams = 1 # choose up to x number of n-grams to extract (minimum = 1)
threads = 4 # decrease this number if you have an old/low core processor (minimum = 1)
chosen_encoding = 'utf-8-sig' # better utf-8-sig than utf-8, saves trouble

In [4]:
# Pick the word tokenizer once for the whole notebook:
# TinySegmenter for Japanese, whitespace splitting for other non-Latin
# alphabets, and the Treebank tokenizer for everything else.
if tokenization_language == 'japanese':
    import tinysegmenter
    tokenizer = tinysegmenter.TinySegmenter()
else:
    tokenizer = WhitespaceTokenizer() if non_latin_alphabet == True else TreebankWordTokenizer()

Start the MongoDB server (`mongod`) before running the next cell.

In [8]:
# Token statistics
# Connect with MongoClient's default arguments and select the database
# named by token_database.
client = MongoClient()
db = client[token_database]

# A single collection (same name as the database) stores the per-token
# documents as well as the '_text-stats' document written further below.
token_stats = db[token_database]
# NOTE(review): this presumably just sets a Python attribute on the collection
# object; allowDiskUse is normally an option passed to aggregate() -- confirm
# whether this line has any effect.
token_stats.allowDiskUse=True

In [9]:
# Try getting our text data (if it already exists)
try:
    text_stats = token_stats.find_one({'_text-stats': True})['text_stats']
except (TypeError, KeyError):
    # TypeError: find_one returned None, i.e. no stats document is stored yet.
    # KeyError: the document exists but carries no 'text_stats' field.
    # Anything else (e.g. the server being unreachable) is left to surface.
    text_stats = {}

In [10]:
# Delete DB and Indexes, if anything goes wrong
#token_stats.drop()

In [11]:
# Create indexes for faster updates / retrievals
# One ascending single-field index per field listed here.
for value in ['token', 'disp', 'freq', 'len', 'occurred_in', 'dp', '_text-stats']:
    index = token_stats.create_index([(value, pymongo.ASCENDING)])

# Compound index on (occurred_in, len): get_text_freq filters on both
# fields together in its $match stage.
index_occurred_len = token_stats.create_index([('occurred_in', pymongo.ASCENDING),
                                              ('len', pymongo.ASCENDING)])

# Prerequisite functions

In [12]:
def is_number_repl_isdigit(s):
    '''Tell whether a string looks like a number.

    Every dot and comma is stripped first; the result is True only
    when what remains is non-empty and consists solely of digits.'''

    without_separators = s.replace('.', '').replace(',', '')
    return without_separators.isdigit()

In [13]:
if non_latin_alphabet == False:

    def cleantokensfromsentence(tokens, bar_numbers):
        '''Filter a list of tokens for Latin-alphabet text.

        Numeric tokens are kept only when bar_numbers is False.
        Every other token is kept as long as it contains at least one
        alphanumeric character, so empty and pure-punctuation tokens
        are discarded.'''

        kept = []

        for tok in tokens:

            if is_number_repl_isdigit(tok):
                # Numbers survive only when they are explicitly not barred
                if bar_numbers == False:
                    kept.append(tok)
                continue

            # No letter or digit anywhere means the token is only symbols
            if any(character.isalnum() for character in tok):
                kept.append(tok)

        return kept

In [14]:
if non_latin_alphabet == True:

    def cleantokensfromsentence(tokens, bar_numbers):
        '''Filter a list of tokens for non-Latin alphabets.

        Only numerals are considered: a numeric token is kept when
        bar_numbers is False, and every non-numeric token is kept
        unchanged.'''

        kept = []

        for tok in tokens:

            if not is_number_repl_isdigit(tok):
                kept.append(tok)
            elif bar_numbers == False:
                # Numbers survive only when they are explicitly not barred
                kept.append(tok)

        return kept

In [15]:
def getmultitokens(sentence, n_of_ngrams):
    '''Build every n-gram of size 2 up to n_of_ngrams from a token list.

    Each n-gram is returned as a single string, its tokens joined by one
    space. Sizes are emitted in increasing order; an n_of_ngrams below 2
    yields an empty list.'''

    return [' '.join(gram)
            for size in range(2, n_of_ngrams + 1)
            for gram in ngrams(sentence, size)]

In [16]:
def opentextfile(text_filename):
    '''Read a whole text file and return its contents as one string,
    decoded with the notebook-wide chosen_encoding.'''
    with open(text_filename, mode='r', encoding=chosen_encoding) as handle:
        contents = handle.read()
    return contents

In [17]:
def process_sentence(sentence):
    '''Count the tokens (and, optionally, the n-grams) of one sentence.

    The sentence is split with the notebook-wide `tokenizer`. Single
    tokens are cleaned according to exclude_numbers; when
    number_of_ngrams is greater than 1, n-grams are built from the tokens
    cleaned according to exclude_numbers_for_ngrams.

    Returns a dict mapping each token / n-gram string to
    {'token': <the string>, 'freq': <occurrences in this sentence>}.'''

    tempdict = {}

    # The tokenizer cell stores the TinySegmenter instance under the name
    # `tokenizer` as well, so Japanese needs no separate branch here. The
    # previous branch referenced an undefined `segmenter` global and raised
    # NameError for Japanese text.
    original_tokens = tokenizer.tokenize(sentence)

    # Barring numbers on single tokens
    for token in cleantokensfromsentence(original_tokens, exclude_numbers):
        addtodict(token, {'token': token,
                          'freq': 1}, tempdict)

    if number_of_ngrams > 1:
        # n-grams have their own number-barring switch
        ngram_source = cleantokensfromsentence(original_tokens, exclude_numbers_for_ngrams)

        for token in getmultitokens(ngram_source, number_of_ngrams):
            addtodict(token, {'token': token,
                              'freq': 1}, tempdict)

    return tempdict

In [18]:
def addtodict(key, value, data_dict):
    '''Merge a key/value pair into data_dict in place.

    How the value is combined with an existing entry depends on its type:
      * int / float: added to the existing value (missing counts as 0);
      * str: the existing value wins, otherwise the new one is stored;
      * dict: merged recursively, key by key, into the existing dict
        (a new dict is created when the key is missing).

    Raises TypeError for any other value type. Previously such a value
    fell through every branch and failed with an UnboundLocalError.'''

    # bool is a subclass of int but is not a count, so it is rejected below
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        # Sums existing value to current value
        new_value = data_dict.get(key, 0) + value

    elif isinstance(value, str):
        # Keep existing value
        new_value = data_dict.get(key, value)

    elif isinstance(value, dict):
        new_value = data_dict.get(key, {})

        for d1_key in value:
            addtodict(d1_key, value[d1_key], new_value)

    else:
        raise TypeError('addtodict cannot combine values of type '
                        + type(value).__name__)

    data_dict[key] = new_value

In [19]:
def textdata(text_filename):
    '''Tokenize one text file and return its per-token counts.

    The file is read, lowercased and split into sentences with NLTK's
    sent_tokenize for tokenization_language. The counts of every sentence
    are merged into one table. Returns the table's values as a list of
    {'token': ..., 'freq': ...} dicts, one per distinct token / n-gram.'''

    # Convert to lowercase
    textlc = opentextfile(text_filename).lower()

    # Uncomment this next line of code
    # if you are working with a Wikipedia dump file

    #textlc = textlc.split('">\n')[1]

    token_data = {}

    for sentence in sent_tokenize(textlc, language=tokenization_language):
        sentence_counts = process_sentence(sentence)

        # Fold this sentence's counts into the file-level table
        for key, entry in sentence_counts.items():
            addtodict(key, entry, token_data)

    return list(token_data.values())

In [20]:
def update_token(token_entry):
    '''Upsert one token's counts for one file into token_stats.

    token_entry must provide 'token', 'freq' and 'file_id'. The token's
    overall 'freq' grows by the file frequency and its 'disp' by 1. The
    file id (as a string) and the file frequency are appended to the
    parallel arrays 'occurred_in' and 'freq_occurred_in'. 'len', the
    number of space-separated parts, is only set when the document is
    first created.'''

    token = token_entry['token']
    file_id = str(token_entry['file_id'])
    freq_in_file = token_entry['freq']

    selector = {'token': token}
    changes = {
        '$inc': {'freq': freq_in_file,
                 'disp': 1},
        '$setOnInsert': {'len': len(token.split(' '))},
        '$push': {'occurred_in': file_id,
                  'freq_occurred_in': freq_in_file},
    }

    token_stats.update_one(selector, changes, upsert=True)

In [21]:
def update_db(file_id):
    '''Tokenize one file and write all of its token counts to the database.

    file_id is a key of file_dict. The file's token entries are tagged
    with that id and sent to update_token on a pool of `threads` worker
    threads.'''

    text_filename = file_dict[file_id]

    token_entries = textdata(text_filename)

    for token_entry in token_entries:
        token_entry['file_id'] = file_id

    pool = ThreadPool(threads)
    try:
        pool.map(update_token, token_entries)
    finally:
        # Always release the worker threads, also when an update raises;
        # previously the pool was left open in that case and never joined.
        pool.close()
        pool.join()

In [22]:
def get_text_freq(file_id):
    '''Store the total token frequency of one file, per n-gram length.

    For every length in ngramrange, sums the frequency recorded for
    file_id over all token documents of that length and writes it to
    text_stats['total']['<n>-grams'][file_id]. Nothing is stored for a
    length that has no matching documents.'''

    # update_token pushes to 'occurred_in' and 'freq_occurred_in' together,
    # so the frequency for this file sits at the same position as its id.
    freq_in_this_file = {'$arrayElemAt':
                         ['$freq_occurred_in',
                          {'$indexOfArray':
                           ['$occurred_in', file_id]}]}

    for n_of_ngrams in ngramrange:

        pipeline = [
            {'$match': {'occurred_in': file_id,
                        'len': n_of_ngrams}},
            {'$group': {'_id': None,
                        'freq': {'$sum': freq_in_this_file}}},
        ]

        for result in token_stats.aggregate(pipeline):
            text_stats['total'][str(n_of_ngrams)+'-grams'][file_id] = result['freq']

#     n_of_text_files += 1
    
#     if n_of_text_files % step == 0:
#         print(datetime.now(), n_of_text_files, 'text files processed')

<h1>Text file location</h1>

In [23]:
# Collect the paths of the text files to process, according to
# files_dir_or_wiki ('list', 'dir' or 'wiki').
if files_dir_or_wiki == 'list':
    # Change to the list of filenames of your choice
    list_of_files_path = 'chosen_articles.txt'
    # The with-block closes the list file. Only the line terminator is
    # stripped, so a last line without a trailing newline keeps its final
    # character; blank lines are skipped.
    with open(list_of_files_path, 'r', encoding=chosen_encoding) as text_files:
        text_filenames = [line.rstrip('\r\n') for line in text_files if line.strip()]

elif files_dir_or_wiki == 'dir':
    # Change to the text file directory of your choice
    # (built with os.path.join so it also resolves outside Windows)
    text_file_directory = os.path.join('.', 'chosen_articles')
    text_filenames = [os.path.join(text_file_directory,f.name) for f in os.scandir(text_file_directory) if f.is_file()]

elif files_dir_or_wiki == 'wiki':
    # Point to Wikipedia extracted article directory
    text_file_directory = os.path.join('.', 'output_dir')
    text_filenames = [os.path.join(text_file_directory,f.name) for f in os.scandir(text_file_directory) if f.is_file()]

else:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError("files_dir_or_wiki must be 'list', 'dir' or 'wiki', got "
                     + repr(files_dir_or_wiki))

In [24]:
# Create a dictionary of filenames
# in order to generate number-filename "code"

# Each file is keyed by its position in the listing, stored as a string
file_dict = {str(position): filename
             for position, filename in enumerate(text_filenames)}
f_n = len(file_dict)  # number of files that received an id
del text_filenames

In [25]:
# Calculating corpus parts

# Token lengths to gather statistics for: only unigrams when at most one
# n-gram size was requested, otherwise every size from 1 to number_of_ngrams.
ngramrange = [1] if number_of_ngrams <= 1 else range(1, number_of_ngrams + 1)

Here, you will build your corpus by extracting basic data
(frequency, dispersion, the frequency of tokens in a given text, etc.)

In [26]:
# File ids that already appear in at least one token document
already_processed = token_stats.distinct('occurred_in')
print(len(already_processed), 'files already processed')

# pop() with a default instead of del: the cell can be re-run, and ids stored
# in the database that are not in the current listing no longer raise KeyError.
for file_id in already_processed:
    file_dict.pop(file_id, None)

n_files = 0
step = ((len(file_dict)) // 100) + 1
half = len(file_dict) // 2
print(len(file_dict), 'files remaining with a step of', step, ', half:', half)

0 files already processed
100 files remaining with a step of 2 , half: 50


In [27]:
# Process the remaining files one after another, with a progress bar.
# update_db spreads the database writes of each file over worker threads.
print(datetime.now(), 'started')
for file_id in tqdm_notebook(list(file_dict.keys())):
    update_db(file_id)
print(datetime.now(), 'ended')

2018-09-10 14:17:31.329844 started



2018-09-10 14:17:58.357932 ended


<h1>Obtain Text Data</h1>

A prerequisite for calculating DP in the next Part.

In [28]:
# Prepare one empty per-file frequency table for each n-gram length

# Ids of every file that appears in at least one token document
file_ids = token_stats.distinct('occurred_in')

# Shape: text_stats['total']['<n>-grams'][file_id], filled in by get_text_freq
text_stats = {'total': {str(n_of_ngrams)+'-grams': {} for n_of_ngrams in ngramrange}}




In [29]:
n_of_text_files = 0
step = len(file_ids) // 1000
half = len(file_ids) // 2

print(datetime.now(), 'started calculating overall frequencies')

# get_text_freq fills text_stats in place, one call per file id
pool = ThreadPool(threads)
try:
    results = pool.map(get_text_freq, tqdm_notebook(file_ids))
finally:
    # Always release the worker threads, also when a worker raises
    pool.close()
    pool.join()

print(datetime.now(), 'ended calculating overall frequencies')

2018-09-10 14:18:27.037231 started calculating overall frequencies



2018-09-10 14:18:33.353892 ended calculating overall frequencies


In [30]:
# Calculating normalized frequencies
# Every per-file entry is replaced by {'absfreq': ..., 'relfreq': ...}, where
# relfreq is the file's share of all tokens of that n-gram length.
# The cell can be re-run: entries converted by an earlier run are read back
# through their 'absfreq' instead of failing inside sum().

for token_length, per_file in text_stats['total'].items():
    absolute = {file_id: (entry['absfreq'] if isinstance(entry, dict) else entry)
                for file_id, entry in per_file.items()}
    overall_freq = sum(absolute.values())

    for file_id, absfreq in absolute.items():
        # Guard against an all-zero table so the division cannot fail
        relfreq = Decimal(absfreq) / Decimal(overall_freq) if overall_freq else Decimal(0)
        per_file[file_id] = {'absfreq': absfreq,
                             'relfreq': float(relfreq)}

In [31]:
# Store the text statistics as the single '_text-stats' document.
# replace_one with upsert=True overwrites the document left by an earlier run;
# insert_one added a new one each time, so find_one could return a stale copy.
text_stats_insert = token_stats.replace_one({'_text-stats': True},
                                            {'_text-stats': True,
                                             'text_stats': text_stats},
                                            upsert=True)

In [32]:
# Read the text statistics back from the database.
# NOTE(review): find_one returns a single matching document; if several
# '_text-stats' documents exist it is not evident from here which one is used.
text_stats = token_stats.find_one({'_text-stats': True})['text_stats']

# Part III: Calculate DP by Gries

Calculate Stefan Th. Gries's (2008, 2010) "deviation of proportions" (DP), in order to see how evenly tokens are dispersed across the texts of your corpus.

In [33]:
def calc_dp(token):
    '''Return the "deviation of proportions" (DP) of a token.

    DP is half the sum, over every file in text_stats, of the absolute
    difference between the file's share of the corpus ('relfreq') and the
    share of the token's occurrences found in that file.

    Returns 1.0 when the token is not in the database. A DP already
    stored on the token document is returned as is. Otherwise DP is
    computed, saved in the document's 'dp' field and returned as a float.

    The document is fetched once. A missing token is detected with an
    explicit None check rather than by catching TypeError, so unrelated
    errors (for instance text_stats not being normalized yet) are no
    longer silently turned into a DP of 1.0.'''

    document = token_stats.find_one({'token': token})

    if document is None:
        # Token not found in DB
        return float(1)

    if 'dp' in document:
        # DP has already been calculated
        return document['dp']

    token_freq = Decimal(document['freq'])
    expected_by_file = text_stats['total'][str(document['len'])+'-grams']

    # Share of this token's occurrences that falls in each file it occurs in
    observed_by_file = {
        file_id: Decimal(freq_in_file) / token_freq
        for file_id, freq_in_file in zip(document['occurred_in'],
                                         document['freq_occurred_in'])
    }

    differences = float(0)

    for file_id, file_stats in expected_by_file.items():
        expected_percentage = abs(float(file_stats['relfreq']))
        # Files in which the token never occurs contribute an observed share of 0
        observed_percentage = abs(float(observed_by_file.get(file_id, float(0))))
        differences += abs(expected_percentage - observed_percentage)

    dp = float(Decimal(differences) / Decimal(2))

    # Before returning DP, let us insert it to the DB
    token_stats.update_one({'token': token},
                           {'$set': {'dp': dp}})

    return dp

In [36]:
# If this operation fails,
# try dividing the MongoDB request more.
# For instance:
# first you run this cell for tokens with more than 1,000 occurrences
# all_tokens = token_stats.distinct('token', {'freq': {'$gt': 1000}})
# then you run this cell for tokens with 1,000 occurrences or fewer
# all_tokens = token_stats.distinct('token', {'freq': {'$lte': 1000}})
# It will all depend on the size of your data.

In [35]:
# Only tokens that do not have a 'dp' field yet are processed
all_tokens = token_stats.distinct('token', {'dp': {'$exists': False}})
tokens_processed = 0

print(datetime.now(), 'adding DP values for', len(all_tokens), 'tokens')
pool = ThreadPool(threads)
try:
    results = pool.map(calc_dp, tqdm_notebook(all_tokens))
finally:
    # Always release the worker threads, also when a worker raises
    pool.close()
    pool.join()
print(datetime.now(), 'finished adding DP values for', len(all_tokens), 'tokens')

2018-09-10 14:18:55.107166 adding DP values for 9661 tokens



2018-09-10 14:19:08.619131 finished adding DP values for 9661 tokens
