# Corpus builder
Here, we will build a corpus stored in a MongoDB database (accessed through PyMongo).
The notebook will automatically install any missing libraries.

You only need to run this notebook once.

In [1]:
from nltk import sent_tokenize, TreebankWordTokenizer, ngrams, WhitespaceTokenizer
from itertools import accumulate, tee, chain
from collections import Counter
import itertools
from datetime import datetime, date, time
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict, OrderedDict
import os, os.path
import re
import string
from decimal import Decimal
from decimal import *
getcontext().prec = 6
import statistics
from IPython.display import display, Markdown, Latex

In [2]:
# Install necessary libraries if they're not available
import subprocess
import sys


def _install_package(package_name):
    '''Install a package into the environment of the running kernel.

    conda is tried first (targeting sys.prefix); if conda is missing or
    exits with a non-zero status, pip of the current interpreter is used.
    Raises subprocess.CalledProcessError if pip fails as well.'''

    try:
        conda_status = subprocess.call(['conda', 'install', '--yes',
                                        '--prefix', sys.prefix, package_name])
    except OSError:
        # The conda executable could not be started (e.g. not installed)
        conda_status = 1

    if conda_status != 0:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package_name])


try:
    from tqdm import tqdm_notebook
except ImportError:
    _install_package('tqdm')
    # Retry, otherwise tqdm_notebook stays undefined for the rest of the notebook
    from tqdm import tqdm_notebook

try:
    import pymongo
    from pymongo import MongoClient
except ImportError:
    _install_package('pymongo')
    # Retry, otherwise pymongo / MongoClient stay undefined
    import pymongo
    from pymongo import MongoClient


# Overall Settings

In [3]:
# Language of the corpus. The name is looked up in the `langs` table below and,
# when non_latin_alphabet is False, it is also passed to NLTK's sent_tokenize.
tokenization_language = 'persian'  # check the NLTK sentence tokenizers for available languages
# True -> polyglot word/sentence tokenizers; False -> NLTK Treebank + sent_tokenize
non_latin_alphabet = True # if your language uses a non-Latin alphabet, change to True
# Where the text files come from ('wiki', 'list' or 'dir')
files_dir_or_wiki = 'wiki' # change to 'list' if you have a list of filenames
                           # or to 'dir' if you have a specific directory
# Used both as the MongoDB database name and as the collection name
token_database = 'tokens-persian'
# Drop numeric tokens from the single-token counts
exclude_numbers = True # True / False
# Drop numeric tokens before building n-grams
exclude_numbers_for_ngrams = False # True / False
number_of_ngrams = 1 # choose up to x number of n-grams to extract (minimum = 1)
# Size of the thread pools used for the database updates and statistics
threads = 4 # decrease this number if you have an old/low core processor (minimum = 1)
# Encoding used to read every text file (and the file list, if any)
chosen_encoding = 'utf-8-sig' # better utf-8-sig than utf-8, saves trouble

In [4]:
# Choose the word tokenizer backend.
# Latin alphabets use NLTK's Treebank tokenizer; otherwise the polyglot
# stack is checked and pip-installed if an import fails.
if non_latin_alphabet == False:
    tokenizer = TreebankWordTokenizer()

elif non_latin_alphabet == True:
    
    # NOTE: after a pip install the failed import is not retried in this cell;
    # polyglot.downloader is imported again in the next cell.
    try:
        from polyglot.downloader import downloader
    except ModuleNotFoundError:
        !{sys.executable} -m pip install polyglot
    
    # NOTE(review): presumably required by polyglot itself - confirm
    try:
        import PyICU, icu
    except ModuleNotFoundError:
        !{sys.executable} -m pip install PyICU
    
    # NOTE(review): presumably polyglot's language detector backend - confirm
    try:
        import pycld2
    except ModuleNotFoundError:
        !{sys.executable} -m pip install pycld2 

  if sys.path[0] == '':


In [5]:
# Installing language-specific module

from polyglot.downloader import downloader

In [6]:
# List of ICU-supported languages
#
# Maps a lowercase language name to a language code. The code is used
# further down to request the polyglot models ('LANG:' + code) and is
# assigned to Text.language by the polyglot-based tokenizers.
# NOTE(review): the table looks like polyglot's supported-language list - confirm.

langs = {'afrikaans': 'af',
 'alemannic': 'als',
 'amharic': 'am',
 'aragonese': 'an',
 'arabic': 'ar',
 'egyptian arabic': 'arz',
 'assamese': 'as',
 'asturian': 'ast',
 'azerbaijani': 'az',
 'bashkir': 'ba',
 'bavarian': 'bar',
 'belarusian': 'be',
 'bulgarian': 'bg',
 'bangla': 'bn',
 'tibetan': 'bo',
 'bishnupriya': 'bpy',
 'breton': 'br',
 'bosnian': 'bs',
 'catalan': 'ca',
 'chechen': 'ce',
 'cebuano': 'ceb',
 'czech': 'cs',
 'chuvash': 'cv',
 'welsh': 'cy',
 'danish': 'da',
 'german': 'de',
 'zazaki': 'diq',
 'divehi': 'dv',
 'greek': 'el',
 'english': 'en',
 'esperanto': 'eo',
 'spanish': 'es',
 'estonian': 'et',
 'basque': 'eu',
 'persian': 'fa',
 'finnish': 'fi',
 'faroese': 'fo',
 'french': 'fr',
 'western frisian': 'fy',
 'irish': 'ga',
 'gan chinese': 'gan',
 'scottish gaelic': 'gd',
 'galician': 'gl',
 'gujarati': 'gu',
 'manx': 'gv',
 'hebrew': 'he',
 'hindi': 'hi',
 'fiji hindi': 'hif',
 'croatian': 'hr',
 'upper sorbian': 'hsb',
 'haitian creole': 'ht',
 'hungarian': 'hu',
 'armenian': 'hy',
 'interlingua': 'ia',
 'indonesian': 'id',
 'iloko': 'ilo',
 'ido': 'io',
 'icelandic': 'is',
 'italian': 'it',
 'japanese': 'ja',
 'javanese': 'jv',
 'georgian': 'ka',
 'kazakh': 'kk',
 'khmer': 'km',
 'kannada': 'kn',
 'korean': 'ko',
 'kurdish': 'ku',
 'kyrgyz': 'ky',
 'latin': 'la',
 'luxembourgish': 'lb',
 'limburgish': 'li',
 'lombard': 'lmo',
 'lithuanian': 'lt',
 'latvian': 'lv',
 'malagasy': 'mg',
 'macedonian': 'mk',
 'malayalam': 'ml',
 'mongolian': 'mn',
 'marathi': 'mr',
 'malay': 'ms',
 'maltese': 'mt',
 'burmese': 'my',
 'nepali': 'ne',
 'dutch': 'nl',
 'norwegian nynorsk': 'nn',
 'norwegian': 'no',
 'occitan': 'oc',
 'odia': 'or',
 'ossetic': 'os',
 'punjabi': 'pa',
 'pampanga': 'pam',
 'polish': 'pl',
 'piedmontese': 'pms',
 'pashto': 'ps',
 'portuguese': 'pt',
 'quechua': 'qu',
 'romansh': 'rm',
 'romanian': 'ro',
 'russian': 'ru',
 'sanskrit': 'sa',
 'sakha': 'sah',
 'sicilian': 'scn',
 'scots': 'sco',
 'northern sami': 'se',
 'serbo-croatian': 'sh',
 'sinhala': 'si',
 'slovak': 'sk',
 'slovenian': 'sl',
 'albanian': 'sq',
 'serbian': 'sr',
 'sundanese': 'su',
 'swedish': 'sv',
 'swahili': 'sw',
 'silesian': 'szl',
 'tamil': 'ta',
 'telugu': 'te',
 'tajik': 'tg',
 'thai': 'th',
 'turkmen': 'tk',
 'tagalog': 'tl',
 'turkish': 'tr',
 'tatar': 'tt',
 'uyghur': 'ug',
 'ukrainian': 'uk',
 'urdu': 'ur',
 'uzbek': 'uz',
 'venetian': 'vec',
 'vietnamese': 'vi',
 'west flemish': 'vls',
 'volapük': 'vo',
 'walloon': 'wa',
 'waray': 'war',
 'yiddish': 'yi',
 'yoruba': 'yo',
 'chinese': 'zh',
 'chinese character': 'zhc',
 'chinese word': 'zhw'}

In [7]:
# Resolve the language code(s) whose polyglot models will be downloaded.
# The lookup is case-insensitive throughout, so 'Persian' behaves like 'persian'.
language_key = tokenization_language.lower()

if language_key in langs:

    # The exact match always comes first: the polyglot tokenizers below use
    # token_models[0] as the language of every text. With plain substring
    # matching, 'chinese' would otherwise resolve to 'gan' (gan chinese).
    token_models = [langs[language_key]]

    # Related varieties whose name contains the chosen language
    # (e.g. 'egyptian arabic' for 'arabic') are kept as additional models.
    for lang in langs:
        code = langs[lang]

        if language_key in lang and lang != language_key:
            token_models.append(code)

    print(tokenization_language.capitalize(), 'supported with models:', token_models)

else:
    print(tokenization_language.capitalize(), 'not supported. Reverting to Treebank')
    tokenizer = TreebankWordTokenizer()
    # Nothing to download; defined so the download cell does not hit a NameError
    token_models = []
    # Make the "reverting" effective: the processing functions pick the
    # tokenizer by looking at this flag, not at `tokenizer`
    non_latin_alphabet = False
    

Persian supported with models: ['fa']


In [8]:
# Fetch the polyglot model collection of every language code selected above.
# Each collection is addressed as 'LANG:<code>'.

for model in token_models:
    collection_name = 'LANG:{}'.format(model)
    downloader.download(collection_name)

[polyglot_data] Downloading collection 'LANG:fa'
[polyglot_data]    | 
[polyglot_data]    | Downloading package sgns2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package unipos.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package ner2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package counts2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package transliteration2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package embeddings2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package uniemb.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package sentiment2.fa to
[polyglot_data]    |     /home/filipe/polyglot_data...
[polyglot_data]    | Downloading package

In [9]:
# polyglot.text is imported inside try/except so that a missing module
# triggers a pip install of morfessor before the import is retried below.
# NOTE(review): assumes the missing module is morfessor - confirm.
try:
    from polyglot.text import Text
except ModuleNotFoundError:
    !{sys.executable} -m pip install morfessor

# Retried unconditionally so that Text is defined after a fresh install
from polyglot.text import Text

In [10]:
def icu_tokenizer(text_as_string):
    '''Split a string into word tokens with polyglot.

    The language of the text is set to the first entry of the
    module-level token_models list before the words are read.
    Returns a list of plain strings.'''

    parsed = Text(text_as_string)
    parsed.language = token_models[0]

    words = []
    for word in parsed.words:
        words.append(str(word))

    return words

In [11]:
def icu_sentence_tokenizer(text_as_string):
    '''Split a string into sentences with polyglot.

    The language of the text is set to the first entry of the
    module-level token_models list before the sentences are read.
    Returns a list of plain strings.'''

    parsed = Text(text_as_string)
    parsed.language = token_models[0]

    sentences = []
    for sentence in parsed.sentences:
        sentences.append(str(sentence))

    return sentences

Start the MongoDB server (run `mongod`) before running the next cell.

In [12]:
# Token statistics
# Connect to MongoDB with the client defaults; the database and the
# collection inside it share the name configured in token_database.
client = MongoClient()
db = client[token_database]

token_stats = db[token_database]
# NOTE(review): this only sets an attribute on the Python collection object.
# allowDiskUse is normally an option passed to aggregate() - confirm whether
# this line has any effect.
token_stats.allowDiskUse=True

In [13]:
# Try getting our text data (if it already exists)
try:
    text_stats = token_stats.find_one({'_text-stats': True})['text_stats']
except (TypeError, KeyError):
    # TypeError: no such document yet (find_one returned None)
    # KeyError: the document exists but holds no 'text_stats' field
    # Anything else (e.g. the server not running) is left to surface
    text_stats = {}

In [14]:
# Delete DB and Indexes, if anything goes wrong
#token_stats.drop()

In [15]:
# Create indexes for faster updates / retrievals
# One ascending single-field index per queried field...
for value in ('token', 'disp', 'freq', 'len', 'occurred_in', 'dp', '_text-stats'):
    index = token_stats.create_index([(value, pymongo.ASCENDING)])

# ...plus a compound index for the (file, n-gram length) lookups
index_occurred_len = token_stats.create_index(
    [('occurred_in', pymongo.ASCENDING), ('len', pymongo.ASCENDING)]
)

# Prerequisite functions

In [16]:
def is_number_repl_isdigit(s):
    '''Returns True if the string is a number, i.e. if only
    digits remain once every dot and comma has been removed.
    A string made of dots/commas only is not a number.'''

    without_separators = s.replace('.', '').replace(',', '')
    return without_separators.isdigit()

In [17]:
def cleantokensfromsentence(tokens, bar_numbers):
    '''Filter a list of tokens, dropping punctuation-only tokens
    and, optionally, numerals.
    Bar_numbers: if True, numbers will be deleted;
    if False, numbers are kept.
    Returns a new list, the input is left untouched.'''

    kept = []

    for candidate in tokens:

        if is_number_repl_isdigit(candidate):
            # Numbers survive only when they are explicitly not barred
            if bar_numbers == False:
                kept.append(candidate)
            continue

        # A token is kept as soon as it has one letter or digit;
        # tokens made only of other characters (and empty tokens) are dropped
        if any(symbol.isalnum() for symbol in candidate):
            kept.append(candidate)

    return kept

In [18]:
def getmultitokens(sentence, n_of_ngrams):
    '''From a list of tokens, build every n-gram from size 2 up to
    n_of_ngrams. Each n-gram is returned as one space-joined string;
    shorter sizes come first.'''

    return [' '.join(gram)
            for size in range(2, n_of_ngrams + 1)
            for gram in ngrams(sentence, size)]

In [19]:
def opentextfile(text_filename):
    '''Return the whole content of a text file as a string,
    decoded with the notebook-wide chosen_encoding.'''

    with open(text_filename, mode='r', encoding=chosen_encoding) as handle:
        contents = handle.read()

    return contents

In [20]:
def addtodict(key, value, data_dict):
    '''Update a dictionary with a key and value.
    If key exists, combines the values:

    - numbers (int / float) are added to the existing value;
    - strings keep the existing value (the new one is only used
      when the key is absent);
    - dictionaries are merged recursively with these same rules.

    data_dict is modified in place; nothing is returned.
    Raises TypeError for any other kind of value.'''

    if isinstance(value, dict):
        new_value = data_dict.get(key, {})

        for d1_key, d1_value in value.items():
            addtodict(d1_key, d1_value, new_value)

    elif isinstance(value, str):
        # Keep existing value
        new_value = data_dict.get(key, value)

    elif isinstance(value, (int, float)):
        # Sums existing value to current value
        new_value = data_dict.get(key, 0) + value

    else:
        # Fail with a clear message instead of an unbound local variable
        raise TypeError('addtodict cannot combine values of type '
                        + type(value).__name__)

    data_dict[key] = new_value

In [21]:
def process_sentence(sentence):
    '''Count the tokens (and, if requested, the n-grams) of one sentence.

    Returns a dictionary keyed by token / n-gram string whose values
    are {'token': <string>, 'freq': <occurrences in the sentence>}.'''

    # Word tokenizer chosen according to the alphabet setting
    if non_latin_alphabet == False:
        raw_tokens = tokenizer.tokenize(sentence)
    elif non_latin_alphabet == True:
        raw_tokens = icu_tokenizer(sentence)

    counts = {}

    # Single tokens, with numbers barred according to exclude_numbers
    single_tokens = cleantokensfromsentence(raw_tokens, exclude_numbers)
    for single_token in single_tokens:
        addtodict(single_token, {'token': single_token, 'freq': 1}, counts)

    # N-grams are built from a separately cleaned token list,
    # which has its own setting for numbers
    if number_of_ngrams > 1:
        ngram_tokens = cleantokensfromsentence(raw_tokens, exclude_numbers_for_ngrams)
        for multi_token in getmultitokens(ngram_tokens, number_of_ngrams):
            addtodict(multi_token, {'token': multi_token, 'freq': 1}, counts)

    return counts

In [22]:
def textdata(text_filename):
    '''Read one text file and count its tokens.

    The text is lowercased, split into sentences, and every sentence is
    counted with process_sentence. Returns a list with one
    {'token': ..., 'freq': ...} dictionary per distinct token / n-gram.'''

    # Convert to lowercase
    lowered = opentextfile(text_filename).lower()

    # Uncomment this next line of code
    # if you are working with a Wikipedia dump file

    #lowered = lowered.split('">\n')[1]

    # Sentence splitter chosen according to the alphabet setting
    if non_latin_alphabet == False:
        sentences = sent_tokenize(lowered, language=tokenization_language)
    elif non_latin_alphabet == True:
        sentences = icu_sentence_tokenizer(lowered)

    # The full text is no longer needed once it has been split
    del lowered

    per_sentence_counts = [process_sentence(sentence) for sentence in sentences]

    # Merge the per-sentence counts into one dictionary for the whole file
    merged = {}
    for sentence_counts in per_sentence_counts:
        for key, entry in sentence_counts.items():
            addtodict(key, entry, merged)

    return list(merged.values())

In [23]:
def update_token(token_entry):
    '''Add the counts of one token in one file to the token collection.

    token_entry: dictionary with the keys 'token', 'freq' and 'file_id'.

    The document of the token is created if it does not exist (upsert).
    Its overall frequency grows by the file frequency, its dispersion
    (number of files) grows by one, and the file id and the file
    frequency are appended to two parallel arrays.
    NOTE: calling this twice for the same token and file counts it twice.'''

    token = token_entry['token']
    file_id = str(token_entry['file_id'])
    freq = token_entry['freq']

    token_stats.update_one(
        {'token': token},
        {'$inc': {'freq': freq,
                  'disp': 1},
         # Number of words in the token (1 for single tokens, n for n-grams)
         '$setOnInsert': {'len': len(token.split(' '))},
         # Parallel arrays: position i of both refers to the same file
         '$push': {'occurred_in': file_id,
                   'freq_occurred_in': freq}},
        upsert=True)

In [24]:
def update_db(file_id):
    '''Count the tokens of one file and store them in the database.

    file_id: key of the file in file_dict.

    The database updates (one per distinct token) are spread over a
    pool of `threads` worker threads.'''

    text_filename = file_dict[file_id]

    token_entries = textdata(text_filename)

    for token_entry in token_entries:
        token_entry['file_id'] = file_id

    # The context manager releases the pool even if an update raises
    with ThreadPool(threads) as pool:
        pool.map(update_token, token_entries)
In [25]:
def get_text_freq(file_id):
    '''Store the total number of tokens of one file in text_stats.

    For every n-gram length in ngramrange, the frequencies that the
    tokens of that length have in the file are summed in the database
    and the total is written to
    text_stats['total']['<n>-grams'][file_id].
    Nothing is written for a length with no tokens in the file.'''

    for n_of_ngrams in ngramrange:

        # Tokens of this length that occur in the file
        match_stage = {'$match': {'occurred_in': file_id,
                                  'len': n_of_ngrams}}

        # Frequency of a token in this file: the element of freq_occurred_in
        # at the position the file id has in occurred_in
        freq_in_this_file = {'$arrayElemAt':
                             ['$freq_occurred_in',
                              {'$indexOfArray': ['$occurred_in', file_id]}]}

        group_stage = {'$group': {'_id': None,
                                  'freq': {'$sum': freq_in_this_file}}}

        text_freq = token_stats.aggregate([match_stage, group_stage])

        for result in text_freq:
            text_stats['total'][str(n_of_ngrams)+'-grams'][file_id] = result['freq']

#     n_of_text_files += 1
    
#     if n_of_text_files % step == 0:
#         print(datetime.now(), n_of_text_files, 'text files processed')

<h1>Text file location</h1>

In [26]:
# Build text_filenames, the list of files that make up the corpus.
# NOTE: file ids are later assigned by position in this list, and the
# order returned by os.scandir is arbitrary; keep the directory unchanged
# between runs if you intend to resume an interrupted build.
if files_dir_or_wiki == 'list':
    # Change to the list of filenames of your choice
    list_of_files_path = 'chosen_articles.txt'
    # One filename per line; the file is closed as soon as it has been read
    with open(list_of_files_path, 'r', encoding=chosen_encoding) as text_files:
        # Only the line break is stripped, so a last line without one stays
        # intact; blank lines are skipped
        text_filenames = [x.rstrip('\n') for x in text_files if x.strip()]

elif files_dir_or_wiki == 'dir':
    # Change to the text file directory of your choice
    text_file_directory = './/chosen_articles//'
    text_filenames = [os.path.join(text_file_directory,f.name) for f in os.scandir(text_file_directory) if f.is_file()]

elif files_dir_or_wiki == 'wiki':
    # Point to Wikipedia extracted article directory
    text_file_directory = './/output_dir//'
    text_filenames = [os.path.join(text_file_directory,f.name) for f in os.scandir(text_file_directory) if f.is_file()]

In [27]:
# Give every file a short id: its position in the list, as a string.
# file_dict maps that id to the filename; the id is what gets stored
# in the database instead of the full path.

file_dict = {}
f_n = 0
for f_n, filename in enumerate(text_filenames):
    file_dict[str(f_n)] = filename
# f_n ends up holding the number of files
f_n = len(file_dict)
del text_filenames

In [28]:
# Token lengths handled by the corpus: 1 (single tokens) up to number_of_ngrams

ngramrange = [1] if number_of_ngrams <= 1 else range(1, number_of_ngrams + 1)

Here, you will build your corpus by extracting basic data
(frequency, dispersion, the frequency of tokens in a given text, etc.)

In [29]:
# Files whose id already appears in the database are skipped,
# so an interrupted build can be resumed.
already_processed = token_stats.distinct('occurred_in')
print(len(already_processed), 'files already processed')

for file_id in already_processed:
    # pop() with a default: an id stored in the database but absent from
    # the current file list must not abort the cell with a KeyError
    file_dict.pop(file_id, None)

n_files = 0
step = ((len(file_dict)) // 100) + 1
half = len(file_dict) // 2
print(len(file_dict), 'files remaining with a step of', step, ', half:', half)

0 files already processed
2627 files remaining with a step of 27 , half: 1313


In [30]:
# Process every remaining file, one after the other
try:
    print(datetime.now(), 'started')
    for file_id in tqdm_notebook(list(file_dict.keys())):
        update_db(file_id)
    print(datetime.now(), 'ended')
except LookupError as error:
    # KeyError and IndexError are subclasses of LookupError: only a plain
    # LookupError (raised by NLTK for a missing resource) is handled here,
    # anything else is a real failure and must stay visible
    if type(error) is not LookupError:
        raise
    import nltk
    nltk.download('punkt')
    print('NLTK "punkt" data downloaded: run this cell again to process the files')

2018-09-13 07:23:04.148056 started


HBox(children=(IntProgress(value=0, max=2627), HTML(value='')))

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete


2018-09-13 07:31:20.707760 ended


<h1>Obtain Text Data</h1>

A prerequisite for calculating DP in the next Part.

In [31]:
# Prepare the (still empty) per-file totals:
# text_stats['total']['<n>-grams'] will map a file id to its token count

file_ids = token_stats.distinct('occurred_in')

text_stats = {'total': {}}

for key in tqdm_notebook(text_stats):

    for n_of_ngrams in tqdm_notebook(ngramrange):
        text_stats[key]['{}-grams'.format(n_of_ngrams)] = {}

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [32]:
# Total number of tokens of every file, computed in parallel
n_of_text_files = 0
step = len(file_ids) // 1000
half = len(file_ids) // 2

print(datetime.now(), 'started calculating overall frequencies')

# The context manager releases the pool even if a worker raises
with ThreadPool(threads) as pool:
    results = pool.map(get_text_freq, tqdm_notebook(file_ids))

print(datetime.now(), 'ended calculating overall frequencies')

2018-09-13 07:50:06.649081 started calculating overall frequencies


HBox(children=(IntProgress(value=0, max=2627), HTML(value='')))


2018-09-13 07:50:11.435217 ended calculating overall frequencies


In [33]:
# Calculating normalized frequencies
# Every per-file total is replaced by a dictionary holding the absolute
# count and the share of the corpus the file represents.
# NOTE: run this cell only once per build; it rewrites the values in place.

for token_length, per_file in text_stats['total'].items():
    overall_freq = sum(per_file.values())

    for file_id in per_file:
        absfreq = per_file[file_id]
        # Decimal division, rounded by the precision set in the import cell
        relfreq = Decimal(absfreq) / Decimal(overall_freq)
        per_file[file_id] = {'absfreq': absfreq,
                             'relfreq': float(relfreq)}

In [34]:
# Store the text statistics as a single document of the collection.
# replace_one with upsert keeps exactly one such document: a plain insert
# would add a new copy on every run while find_one keeps returning an old one.
text_stats_insert = token_stats.replace_one({'_text-stats': True},
                                            {'_text-stats': True,
                                             'text_stats': text_stats},
                                            upsert=True)

In [35]:
# Read the text statistics back from the database.
# NOTE(review): find_one returns a single matching document; if several
# '_text-stats' documents exist this may not be the newest one - confirm.
text_stats = token_stats.find_one({'_text-stats': True})['text_stats']

# Part III: Calculate DP by Gries

Calculate Stephan Th. Gries's (2008, 2010) "deviation of proportions", in order to see how well dispersed tokens are in your corpus

In [51]:
def calc_dp(token):
    '''Return the "deviation of proportions" (DP) of a token.

    If the token document already holds a 'dp' value, it is returned
    as is. Otherwise DP is calculated, saved to the document and
    returned as a float.

    DP is half the sum, over every file listed in text_stats for the
    token's length, of the absolute difference between
    - the expected share: the file's share of the corpus ('relfreq'), and
    - the observed share: the part of the token's occurrences found in
      that file (0 if the token does not occur in it).

    Returns 1.0 when the token is not in the database or when the data
    needed for the calculation is missing.'''

    document = token_stats.find_one({'token': token})

    if document is None:
        # Token not found in DB
        return float(1)

    if 'dp' in document:
        # DP has already been calculated
        return document['dp']

    # DP was not calculated yet
    # Let's calculate it then
    try:
        token_length = document['len']
        token_freq = document['freq']

        # Observed share per file (Decimal division, rounded by the
        # precision set in the import cell)
        freq_in_files = {
            file_id: Decimal(freq_in_file) / Decimal(token_freq)
            for file_id, freq_in_file in zip(document['occurred_in'],
                                             document['freq_occurred_in'])
        }

        expected_by_file = text_stats['total'][str(token_length)+'-grams']

        differences = float(0)

        for file_id in expected_by_file:

            expected_percentage = abs(float(expected_by_file[file_id]['relfreq']))
            observed_percentage = abs(float(freq_in_files.get(file_id, float(0))))

            differences += abs(expected_percentage - observed_percentage)

        dp = float(Decimal(differences) / Decimal(2))

        # Before returning DP, let us insert it to the DB
        token_stats.update_one({'token': token},
                               {'$set': {'dp': dp}})

        return dp

    except (TypeError, KeyError):
        # Incomplete token document or no text statistics for this length
        return float(1)

In [52]:
# If this operation fails,
# try splitting the MongoDB request into smaller parts.
# For instance:
# first, run the next cell for tokens with more than 1,000 occurrences
# all_tokens = token_stats.distinct('token', {'freq': {'$gt': 1000}})
# then run it again for tokens with 1,000 occurrences or fewer
# all_tokens = token_stats.distinct('token', {'freq': {'$lte': 1000}})
# It will all depend on the size of your data.

In [53]:
# Calculate DP for every token that does not have a value yet
all_tokens = token_stats.distinct('token', {'dp': {'$exists': False}})
tokens_processed = 0

print(datetime.now(), 'adding DP values for', len(all_tokens), 'tokens')
# The context manager releases the pool even if a worker raises
with ThreadPool(threads) as pool:
    results = pool.map(calc_dp, tqdm_notebook(all_tokens))
print(datetime.now(), 'finished adding DP values for', len(all_tokens), 'tokens')

2018-09-13 08:58:53.014351 adding DP values for 4 tokens


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


2018-09-13 08:58:53.085787 finished adding DP values for 4 tokens
