# Processing text, extracting and visualising descriptive statistics

In [None]:
import numpy as np
import pandas as pd
import spacy, nltk
import matplotlib.pyplot as plt 
import seaborn


In [None]:
# TODO List 
# 1. tidy up preprocessing (get POS, ngram)
# 2. write a function to get concretisation score for each sentence/book. 

### 1. Preprocessing and Feature Engineering

In [None]:
def words_to_index(tokenslist):
    '''
    given a list of tokens, generate two indices (index2token, token2index) 
    input | tokenslist: list - 
    output | two dictionaries containing index2token, token2index respectively. 
    '''
    token_to_index = {}
    index_to_token = {}
    
    _tokens_unique = set([i.lower() for i in tokenslist])
    _tokens_unique = list(__tokens_unique) # return it to a set so that it is iterable
    # sort the dictionary by alphabet for easy searching
    # 
    for idx_number in range(len(_tokens_unique)):
        
        index_to_token[idx_number] =  _tokens_unique[idx_number]
        token_to_index[_tokens_unique[idx_number]] = idx_number
        
    return index_to_token, token_to_index

def generate_emptyarr(num_sentences, max_sentencelength, vocab_size):
    _array = np.zeros([num_sentences, max_sentencelength, vocab_size], dtype=int)
    return _array 

array = generate_emptyarr(num_sentences, max_sentencelength, vocab_size)


def generate_array(sentences, array, token_to_index): 
    '''
    
    '''
    for sent_num in range(len(sentences)):
        # send the sentence through spacy
        _ = nlp(sentences[sent_num])
        # filter out the punctuation, tokens retained are stored in str form (.text attribute on spacy tokens)
        _ = [token.text for token in __ if token.pos_ != "PUNCT"]
        # get the matrix for the sentence
        matrix = array[sent_num]

        for word_num in range(len(__)):
            # get the vector for the position of the word in the sentence
            vector = matrix[word_num] 
            # get the str form of the word. lowercase it to match the entries in the vocabulary dictionary                  
            word = _[word_num].lower()
            # find its index in the tok_to_index dictionary
            index = token_to_index[word]
            # go to that index on the vector and update it to 1
            vector[index] = 1
        
    return  array
        
def recover_text(results_array, index_to_token):
    # empty list to store recovered sentences 
    text = []
    
    # iterate through every sentence (1st dimension) of the array 
    for sentence in array: 
        # empty list to store the words recovered within this sentence 
        words = []
        for word in sentence: 
            # check using if-continue. only continue if the sum of the vector for the word is > 0 [i.e 
            # stop the iteration the moment an empty vector comes up. ]
            if sum(word) > 0:
                index_of_word = np.argmax(word)
                words.append(index_to_token[index_of_word])
                continue 
        text.append(words)
        
    return text        

In [None]:
wnl = nltk.WordNetLemmatizer()

def get_wordnet_pos(word_pos_tuple):
    """
    Helper function for text_preprocessor. Takes a tuple of (token, pos_tag) generated from running a tokenised 
    sentence through nltk.word_tokenize, and maps POS tag to the first character that nltk wordnetlemmatizer's 
    .lemmatize() method accepts
    source: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizerwithappropriatepostag 
    """
    tag = word_pos_tuple[1][0]
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag)

def text_preprocessor(text):
    '''
    Takes a text (a sentence, or a document) and preprocesses for the purposes of generating machine learning data from the 
    input. The preprocessing includes: (a) tokenisation, (b) removal of punctuation, (c) lemmatisation and lowercasing. 
    Returns a list of tokens from the input text. 
    '''
    _processed = []
    
    # tokenize the string
    _tokens = word_tokenize(text)
    # use nltk's pos_tag function to get the pos_tag for the string of tokens. 
    _tokens_postags = pos_tag(_tokens)

    
    for token_postag in _tokens_postags:  
        if token_postag[1] not in string.punctuation:
        # use get_wordnet_pos helper function to get the equivalent WordNetLemmatiser pos-tag
            wn_pos = get_wordnet_pos(token_postag)
            # WordNetLemmatiser only has tags for a, n, v, r. if-else to handle this. 
            if wn_pos != None: 
                _lemma = wnl.lemmatize(token_postag[0], wn_pos).lower()
            else:
                _lemma = token_postag[0].lower()
            _processed.append(_lemma)
    
    return __processed

def compute_vocabulary(text_lists):
    '''
    takes a list containing texts, (i) processes it and (ii) returns a pandas dataframe with the count of each word. 
    The processing involves: (a) tokenisation, (b) removal of punctuation, (c) lemmatisation and lowercasing. 
    '''
    _vocab_df = pd.DataFrame()
    counter = 0
    for text in text_lists: 
        _processed_text = text_preprocessor(text)
        _counter = collections.Counter(_processed_text)
        _text_df = pd.DataFrame([list(_counter.values())], columns = list(_counter.keys()))
        
        _vocab_df = pd.concat([_vocab_df, _text_df], sort=False, ignore_index=True)
        counter +=1
        if counter%250 ==0:
            print (counter)
    
    return _vocab_df 

In [None]:
# function to filter short sentences. 
def filter_df_shortsent(dataframe, min_tokens):
    _tokeep_index = [    ]
    [_tokeep_index.append(i) for i in dataframe.index if dataframe.loc[i].sum() >= min_tokens]
    return dataframe.loc[_tokeep_index]

# function to compute the frequency of all words in the dataset
def compute_vocab_freq(dataframe, top_n = 10, ascending=False):
    return dataframe.describe().loc['count'].sort_values(ascending=ascending)[0:top_n]
    


__Write a function that extracts the features, i.e. take all the books as input and return them as a list of lists (e.g. a list of lists of tokens)__

In [18]:
def tokenise_unigrams(texts_list):
    '''
    Takes a flat list (each containing strings - corresponding to a sentence or document), tokenises each sentence 
    using NLTK's word_tokenize and stores the results in a list. Does this for each sentence and then returns a 
    list of lists. 
    '''
    _tokenised_text_list = []
    for text in texts_list: 
        _tokens = word_tokenize(text)
        _tokenised_text_list.append(_tokens)

    return _tokenised_text_list

In [19]:
def lemmatise_unigrams(texts_list):
    '''
    Takes a text (a sentence, or a document) and preprocesses for the purposes of generating machine learning data from the 
    input. The preprocessing includes: (a) tokenisation, (b) removal of punctuation, (c) lemmatisation and lowercasing. 
    Returns a list of tokens from the input text. Calls on the text_preprocessor function. 
    '''
    _lemmatised_text_list = []
    
    for text in texts_list: 
        _tokens = text_preprocessor(text)
        _lemmatised_text_list.append(_tokens)
    
    return _lemmatised_text_list 

In [55]:
def make_ngrams (processed_texts_lists, n_gram=2, add_padding=False):
    '''
    Takes a list of lists (the lists contain pre-processed tokens) and produces n-gram tokens. Outputs a list of lists (in
    the same structure as the input). 
    '''
    # empty list to store the generated n-grams 
    ngrams_list = [] 
    
    
    # a for-loop just to iterate the number of times equal to the num of tokens in list 
    for processed_list in processed_texts_lists:
        _list = []
        counter = 0 
        for token in processed_list:
            # try-except to handle IndexErrors 
            try:
                ngram = " ".join([processed_list[0+i] for i in range(counter,counter+n_gram)])

            except IndexError: 
                if add_padding==False: 
                    break
                elif add_padding==True: 
                    # grab the remaining words that have not had n-grams generated from each of their positions
                    remaining_words = [processed_list[-1-i] for i in range(len(processed_list)-counter)]
                    # reverse the list since it was adding from the end of the previous 
                    remaining_words.reverse()
                    # end "<END>" tokens to pad the remaining spaces in the n-gram
                    ngram = " ".join(remaining_words + ["<END>"]*(n_gram-len(remaining_words)))

            _list.append(ngram)
            counter+=1
        ngrams_list.append(_list)
    return ngrams_list

In [60]:
make_ngrams(lemmatise_unigrams([test1,test2]),n_gram=2, add_padding=False)

[['the year',
  'year shall',
  'shall run',
  'run like',
  'like rabbit',
  'rabbit rabbit',
  'rabbit rabbit'],
 ['the year', 'year shall', 'shall run', 'run like', 'like chicken']]

__Write a function that takes the list and return a list of dictionnaries {attribute:value} where you might have different options to compute the value (here, only raw frequency or relative frequency. That could be also done in an additional function i.e. normalizer).__

In [22]:
def compute_frequency (data_lists , add_normalisation = None):
    '''
    takes a list of lists (each comprising processed inputs for a particular sentence or document). 
    
    input | data:list - a list of lists. Each list inside contains either tokens, lemmas, bigrams, or trigrams. 
    
    add_normalisation = "tf_max" implements max tf normalisation 
    (see https://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html)
    add_normalisation = "relative_frequency"
    '''
    dictcount_lists = []
    
    def dictcount_maker(row):
        '''
        helper function to take a list of tokens (that comprise/make up a sentence) and generates a dictionary with
        the count of each token. 
        '''
        dictcount = {}
        for token in row:
            try: 
                dictcount[token] += 1
            except: 
                dictcount[token] = 1
        return dictcount
    
    if add_normalisation == None: 
        for row in data_lists:
            dictcount = dictcount_maker(row)
            dictcount_lists.append(dictcount)

            
    elif add_normalisation == "tf_max":
        for row in data_lists:
            dictcount = dictcount_maker(row)
            
            # get the max count within the dictionary  
            maxcount=max(dictcount.values())
            
            # dictionary comprehension to divide each count in the dictionary by the max count 
            # and apply a weight and "bias" to get the normalised frequency. 
            dictcount = {key:0.4+(1-0.4)*(count/maxcount) for key,count in dictcount.items()}
            
            dictcount_lists.append(dictcount)

            
    elif add_normalisation ==  "relative_frequency":
        for row in data_lists:
            dictcount = dictcount_maker(row)
            
            # get the sum of all frequency counts for each token. 
            totalcount=sum(dictcount.values())
            
            # dictionary comprehension to divide each count in the dictionary by the total count 
            # to get the relative frequency. 
            dictcount = {key:count/totalcount for key,count in dictcount.items()}
            
            dictcount_lists.append(dictcount)
    
    else: 
        print("The add_normalisation parameter chosen is not recognised. Please check.")
    
    return dictcount_lists


In [None]:
# to do: compare tf_max and tfidf implementation in sklearn to understand difference in assumptions and subsequent impact. 
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [62]:
compute_frequency(make_ngrams(lemmatise_unigrams([test1,test2,test3,test4]),n_gram=1),add_normalisation = "relative_frequency")

[{'the': 0.125,
  'year': 0.125,
  'shall': 0.125,
  'run': 0.125,
  'like': 0.125,
  'rabbit': 0.375},
 {'the': 0.16666666666666666,
  'year': 0.16666666666666666,
  'shall': 0.16666666666666666,
  'run': 0.16666666666666666,
  'like': 0.16666666666666666,
  'chicken': 0.16666666666666666},
 {'i': 0.125,
  'saw': 0.125,
  'her': 0.125,
  'sell': 0.125,
  'seashell': 0.125,
  'on': 0.125,
  'a': 0.125,
  'seashore': 0.125},
 {'i': 0.08333333333333333,
  'shall': 0.08333333333333333,
  'see': 0.16666666666666666,
  'her': 0.08333333333333333,
  'sell': 0.08333333333333333,
  'seashell': 0.08333333333333333,
  'on': 0.08333333333333333,
  'a': 0.08333333333333333,
  'seashore': 0.08333333333333333,
  'let': 0.08333333333333333,
  "'s": 0.08333333333333333}]

__Write the vectorizer: a function that compute the entire vocabulary, and return a list of Numpy Arrays of the size the vocabulary size and the value corresponding to the feature values.__

In [24]:
def vectoriser(dictcount_lists):
    '''
    
    '''
    # pseudocode: 
    # 1. iterate through the list of dictcounts. get all keys and add to a set (call this vocab). set because it is sorted 
    # and no repeated values
    # 2. iterate through all the list of distcounts again. create an empty np array of the same size as the vocab. get index 
    # of each key in the distcount in the vocab. add values of distcount to the respective entry in the nparray. 

    # question: what is the impact of the choice tf_max and relative_frequency? i.e. values in the np are no longer zeros and
    # and counts, but zeros and frequencies (at sentence/document level)
    
    vocabulary_set = set()
    for distcount in dictcount_lists:
        sentence_tokens = set(distcount.keys())
        vocabulary_set.update(sentence_tokens)
    
    # convert the vocab into a list so that we can use its index 
    vocabulary_list = list(vocabulary_set)
    
    # 
    vectorised_arrays = []
    for distcount in dictcount_lists:
        _array = np.zeros(len(vocabulary_list))
        for token in distcount: 
            index_in_vocab = vocabulary_list.index(token)
            _array[index_in_vocab] = distcount[token]
            
        vectorised_arrays.append(_array)
        
        
    return vectorised_arrays, vocabulary_list

In [68]:
a = compute_frequency(make_ngrams(lemmatise_unigrams([test1,test2]),n_gram=2, add_padding=False),add_normalisation = "relative_frequency")
vectoriser(a)

([array([0.14285714, 0.14285714, 0.14285714, 0.14285714, 0.28571429,
         0.14285714, 0.        ]), array([0.2, 0. , 0.2, 0.2, 0. , 0.2, 0.2])],
 ['year shall',
  'like rabbit',
  'run like',
  'shall run',
  'rabbit rabbit',
  'the year',
  'like chicken'])

__Filtering:__

Write a function that filters out function words

Write a function that filters out words appearing less than X times in the dataset

In [94]:
def filterfunction(vectorised_arrays, vocabulary_list, datamin_freq = 10):
    '''
    takes a list of vectorised arrays as well as its associated vocabulary list. checks the global count/frequency 
    (normalised or not), removes from all arrays the columns where the global count/frequency (normalised or not)
    is below the provided value. removes the same columns from the vocabulary list returns (i) a new list of arrays;
    and (ii) a new vocabulary list.
    '''
    # pseudocode
    # easiest way: place all vectorised_arrays into a pandas and sum values of each column. then filter out 
    # manual way: concat all np.arrays. slice by column. sum it out and delete if < certain value. remember to remove 
    # the corresponding word in the vocab list. 
    
    # since each np array is the same size and columns are all aligned (i.e. indexed to the vocab list), we can just 
    # sum all the np arrays, this will generate a 1D array with the sum on each of the columns
    total_freqs = sum(vectorised_arrays)
    
    # identify the columns for words that have counts less than datamin_freq
    to_delete = []
    for col, val in enumerate(total_freqs):
        if val < datamin_freq:
            to_delete.append(col)
    
    # remove the columns from each of the vectorised arrays 
    new_vectorised_arrays = []
    for vectorised_array in vectorised_arrays:
        new_vectorised_arrays.append(np.delete(vectorised_array, to_delete))
    
    # remove the columns from vocabulary list 
    new_vocabulary_list = np.delete(vocabulary_list, to_delete)
    
    return new_vectorised_arrays, new_vocabulary_list

In [96]:
test_lem_freq = compute_frequency(make_ngrams(lemmatise_unigrams([test1,test2,test3,test4]),n_gram=1),add_normalisation = "relative_frequency")
test_vect = vectoriser(test_lem_freq)
filterfunction(test_vect[0], test_vect[1],datamin_freq=0.2)

# there are np arrays with 0 values still because the global freq for that word is more than the datamin_freq set. 

[0, 2, 7, 9, 12]


([array([0.   , 0.   , 0.125, 0.125, 0.   , 0.   , 0.125, 0.125, 0.125,
         0.   , 0.   , 0.   , 0.375]),
  array([0.        , 0.        , 0.16666667, 0.16666667, 0.        ,
         0.        , 0.16666667, 0.16666667, 0.16666667, 0.        ,
         0.        , 0.        , 0.        ]),
  array([0.125, 0.125, 0.   , 0.   , 0.125, 0.125, 0.   , 0.   , 0.   ,
         0.125, 0.125, 0.125, 0.   ]),
  array([0.08333333, 0.08333333, 0.        , 0.        , 0.08333333,
         0.08333333, 0.08333333, 0.        , 0.        , 0.08333333,
         0.08333333, 0.08333333, 0.        ])],
 array(['her', 'a', 'run', 'year', 'on', 'i', 'shall', 'like', 'the',
        'seashell', 'sell', 'seashore', 'rabbit'], dtype='<U8'))

__Cross-features:__

Write a function that returns the cartesian product of the features, the values being the multiplication of the values of each feature, i.e.:
<f0:v0, f1:v1, f2:2, f3:v3> 
> Becomes (we also keep the original features):
<f0_f1:v0*v1, f0_f2:v0*v2, f0_f3:v0*v3, f1_f2:v1*v2, f1_f3:v1*v3, f2_f3:v02*v3>

In [107]:
# pair-wise interaction only:
def generate_interaction(vectorised_array, vocabulary_list): 
    '''
    
    '''    
    
    interaction_vals = []
    interaction_vocab = []
    for index in range(len(vectorised_array)):
        val1 = vectorised_array[index]
        val1_name = vocabulary_list[index]
        vals2 = np.delete(vectorised_array, index)
        vals2_names = vocabulary_list.copy()
        del vals2_names[index]
        
        interaction_names = [val1_name+"_"+vals2_names[i] for i in range(len(vals2_names))]
        interactions = [val1*vals2[i] for i in range(len(vals2))]
        
        interaction_vocab.extend(interaction_names)
        interaction_vals.extend(interactions)
        
    return interaction_vals, interaction_vocab

### 2. Visualisation and descriptive statistics