In [174]:
# Function to sentence-tokenise answers 

from nltk.tokenize import sent_tokenize
import pandas as pd

def sent_tokenise_answer(answer_col) :
    """
    Split every answer into its sentences.

    Parameters:
    - answer_col = iterable of answers (e.g. a dataframe column); each answer is either
        a text string or a missing value

    Return a list with one entry per answer: the list of sentences (as strings) produced
    by `sent_tokenize`, or an empty list where `pd.isnull` reports the answer as missing.
    """

    # missing answers become [], every other answer is handed to the tokeniser unchanged
    return [[] if pd.isnull(text) else sent_tokenize(text) for text in answer_col]


In [2]:
cons1_df = pd.read_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_lemmas_df.csv", nrows=5)

In [3]:
# Get column index of questions
idx_Q1 = cons1_df.columns.get_loc(str([col for col in cons1_df if 'census methods' in str(col)][0]))

In [4]:
idx_Q1     #42

42

In [175]:
# Function to word-tokenise sentences 

from nltk.tokenize import word_tokenize

def word_tokenise_answer(answer_col) :
    """
    Word-tokenise every sentence of every answer and lower-case the resulting tokens.

    Parameters:
    - answer_col = iterable of answers (e.g. a dataframe column); each answer is a list
        of sentences given as strings

    Return a list with one entry per answer: a list (sentences) of lists (lower-case
    word tokens as strings). Empty answers are returned as an empty list.
    """

    def lowercase_tokens(sentence) :
        # tokenise first, then normalise the case of every token
        return [token.lower() for token in word_tokenize(sentence)]

    tokenised_answers = []

    for sentences in answer_col :

        if sentences :
            tokenised_answers.append([lowercase_tokens(s) for s in sentences])

        else :
            # nothing to tokenise -> keep an empty placeholder so positions still line up
            tokenised_answers.append([])

    return tokenised_answers


In [6]:
# sent_tokenise_answer takes the column itself, not (dataframe, column index)
[type(s) for s in sent_tokenise_answer(cons1_df.iloc[:, idx_Q1])[4]]

[str, str, str, str, str, str, str, str, str, str, str, str, str, str]

In [183]:
cons1_df.loc[:, 'test1'] = sent_tokenise_answer(cons1_df.iloc[:,idx_Q1])

In [184]:
test1_idx = cons1_df.columns.get_loc('test1')      #73

In [185]:
cons1_df.loc[:, 'test1']

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [186]:
cons1_df.iloc[:, test1_idx]

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [11]:
print(cons1_df.iloc[:, test1_idx]) 
print([type(a) for a in cons1_df.iloc[:, test1_idx]])
# word_tokenise_answer takes the column itself, not (dataframe, column index)
print(word_tokenise_answer(cons1_df.iloc[:, test1_idx]))

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object
[<class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>]
[[], [], [], [['Moving', 'to', 'a', 'primarily', 'online', 'census', ':', 'an', 'inevitable', 'and', 'necessary', 'evolution', 'of', 'the', 'existing', 'approach', '.'], ['Admin', 'data', 'and', 'surveys', ':', 'an', 'unknown', 'quantity', ',', 'dependent', 'on', 'the', 'quality', 'of', 'admin', 'data', ',', 'and', 'not', 'clear', 'how', 'well', 'it', 'would', 'fulfil', 'the', 'primary', 'aim', 'of', 'a', 'census', ':', 'to', 'produce', 'an', 'accurate', 'and', 'independent', 'estimate', 'of', 'the', 'size', 'and', 'composition', 'of', 'the', 'population', '.']], [['A', 'regular', 'full', 'population', 'census

In [187]:
cons1_df.loc[:, 'test2'] = word_tokenise_answer(cons1_df.iloc[:, test1_idx])

In [188]:
test2_idx = cons1_df.columns.get_loc('test2')      #74

In [128]:
#[print(w) for w in cons1_df.iloc[:, test2_idx]]

In [176]:
# Define function to calculate polarity score for the answers in our dataset

# import key modules
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

from numpy import nan
    

def get_sentiment_score(answer_col, score_type = 'compound') :
    """ 
    Calculate sentiment analysis scores (score_type: 'compound' default, 'pos', 'neg')
    for the values in the specified dataframe column, using the module-level `analyser`.

    Parameters:
    - answer_col = iterable of answers; each answer is a list of sentences given as strings
    - score_type = key of the `analyser.polarity_scores` result to collect, (default) 'compound'

    Return a list with one entry per answer:
    - nan if the answer is empty
    - a single score if the answer holds exactly one sentence
    - a list of scores, one per sentence, otherwise
    """

    # empty list collector of scores
    sentiment_bag = []

    for answer in answer_col :

        # no answer was provided, return NA
        if not answer :
            sentiment_bag.append(nan)
            continue

        # score each sentence on its own: polarity_scores is given the sentence string,
        # never the enclosing list (the one-sentence case used to pass the whole list)
        scores = [analyser.polarity_scores(sent)[score_type] for sent in answer]

        # a one-sentence answer is reported as a bare score, longer answers as a list
        sentiment_bag.append(scores[0] if len(scores) == 1 else scores)

    return sentiment_bag
    

In [189]:
# test
get_sentiment_score(cons1_df.iloc[:, test1_idx], 'compound')

[nan,
 nan,
 nan,
 [0.0, -0.4585],
 [0.0,
  0.3818,
  0.0,
  0.4404,
  0.0,
  0.0,
  0.4404,
  0.0,
  0.8481,
  0.7964,
  -0.1779,
  0.0,
  0.4404,
  0.4404]]

In [126]:
# answer after sent-tokenisation, before word-tokenisation
#[print(list(s)) for s in cons1_df.iloc[3:, test1_idx]]

In [127]:
#[print(str(s)) for s in sent_tokenise_answer(cons1_df, idx_Q1)]

In [190]:
[print(result) for result in get_sentiment_score(cons1_df.iloc[:, test1_idx])]

nan
nan
nan
[0.0, -0.4585]
[0.0, 0.3818, 0.0, 0.4404, 0.0, 0.0, 0.4404, 0.0, 0.8481, 0.7964, -0.1779, 0.0, 0.4404, 0.4404]


[None, None, None, None, None]

In [192]:
import numpy as np
print([np.mean(np.array(result)) for result in get_sentiment_score(cons1_df.iloc[:,test1_idx])])
print([np.median(np.array(result)) for result in get_sentiment_score(cons1_df.iloc[:,test1_idx])])

[nan, nan, nan, -0.22925000000000001, 0.25785714285714284]
[nan, nan, nan, -0.22925000000000001, 0.19089999999999999]


  r = func(a, **kwargs)


In [193]:
#[print(w) for w in cons1_df.iloc[:, test2_idx]];
[print(len(w)) for w in cons1_df.iloc[:, test2_idx]]    # count number of senences in each answer
cons1_df.iloc[:, test2_idx].apply(len)

0
0
0
2
14


0     0
1     0
2     0
3     2
4    14
Name: test2, dtype: int64

In [120]:
# Define function to split compound words (e.g. word1-word2) into their constituent words

import string

def break_words(answer_col, compound_symbol = '-') :
    """
    Split compound tokens of the form word1<symbol>word2 into their parts and drop
    any empty strings.

    Parameters:
    - answer_col = iterable of answers; each answer is a list (sentences) of lists
        (word tokens as strings)
    - compound_symbol = string that joins the parts of a compound word, (default) '-'

    Return a list with one entry per answer: a list of sentences, each a list of tokens.
    Empty answers are returned as an empty string. The number of sentences of every
    non-empty answer is printed as a side effect.
    """

    def split_sentence(tokens) :
        # tokens without the symbol pass through as they are
        pieces = []
        for token in tokens :
            pieces += token.split(compound_symbol) if compound_symbol in token else [token]

        # splitting e.g. '-care' leaves an empty string behind: drop those
        return [piece for piece in pieces if piece]

    broken_answers = []

    for answer in answer_col :

        if answer :
            print('No. of sentence in the answer = ' + str(len(answer)))

            # sentences stay separate lists within their answer
            broken_answers.append([split_sentence(sent) for sent in answer])

        else :
            broken_answers.append("")

    return broken_answers

In [194]:
cons1_df.iloc[:, test2_idx].apply(len)

0     0
1     0
2     0
3     2
4    14
Name: test2, dtype: int64

In [195]:
break_words(cons1_df.iloc[:, test2_idx]);

No. of sentence in the answer = 2
No. of sentence in the answer = 14


In [196]:
text = pd.Series(["", 'I love double-cream ice-cream', 'I hate fudge. But, I like caramel', 'Pseudo-science. I -care'])

In [197]:
dummy_df = pd.DataFrame(text)
dummy_df.columns = ['text']
dummy_df

Unnamed: 0,text
0,
1,I love double-cream ice-cream
2,"I hate fudge. But, I like caramel"
3,Pseudo-science. I -care


In [198]:
print(dummy_df['text'])
dummy_df['text1'] = sent_tokenise_answer(dummy_df.iloc[:, 0])
dummy_df['text2'] = word_tokenise_answer(dummy_df.iloc[:, 1])

0                                     
1        I love double-cream ice-cream
2    I hate fudge. But, I like caramel
3              Pseudo-science. I -care
Name: text, dtype: object


In [200]:
dummy_df['text2'] = break_words(dummy_df['text2'])

No. of sentence in the answer = 1
No. of sentence in the answer = 2
No. of sentence in the answer = 2


In [202]:
print(dummy_df['text1'])
dummy_df['text2']

0                                      []
1         [I love double-cream ice-cream]
2    [I hate fudge., But, I like caramel]
3              [Pseudo-science., I -care]
Name: text1, dtype: object


0                                                     
1               [[i, love, double, cream, ice, cream]]
2    [[i, hate, fudge, .], [but, ,, i, like, caramel]]
3                    [[pseudo, science, .], [i, care]]
Name: text2, dtype: object

In [222]:
# Define function to replace contracted negative forms of auxiliary verbs with a negation

import string
from nltk.corpus import stopwords


def fix_neg_aux(answer_col) :
    """
    Replace contracted negative forms of auxiliary verbs (e.g. "don't", "isn't") with 'not'.

    Parameters:
    - answer_col = dataframe column whose cells contain answer texts, as lists (answers)
        of lists (sentences) of word tokens

    Return a list with one entry per answer: a list of sentences, each a list of tokens in
    which every contracted negative auxiliary has been replaced by 'not'. Empty answers are
    returned as an empty string. The number of sentences of every non-empty answer is
    printed as a side effect.

    NOTE(review): NLTK's word_tokenize usually splits "don't" into "do" + "n't" - confirm
    that the forms listed below actually occur in the tokenised input.
    """

    # contracted negative auxiliaries (with and without the "'t" ending) to be replaced
    neg_aux_forms = {"don't", "didn", "didn't", "doesn", "doesn't", 'hadn',
                     "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
                     "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
                     'needn', "needn't", "shan't", 'shouldn', "shouldn't",
                     'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
                     'wouldn', "wouldn't", 'aren', "aren't", 'couldn', "couldn't"}

    # empty list collector for all answers
    tokens_bag = []

    for answer in answer_col :

        if not answer :             # no answer was provided, return empty string
            tokens_bag.append("")

        else :                      # an answer was provided

            print('No. of sentence in the answer = ' + str(len(answer)))

            # build a new token list per sentence: rebinding the loop variable alone
            # would leave the original tokens untouched
            sep_sents = [['not' if w in neg_aux_forms else w for w in sent]
                         for sent in answer]

            tokens_bag.append(sep_sents)

    return tokens_bag
           

In [223]:
# Define function to remove specified stop-words

import string
from nltk.corpus import stopwords


def remove_stopwords(answer_col, stopwords_list=stopwords.words('english'), keep_neg = True) :
    """
    Drop stop-words from every sentence of every answer.

    Parameters:
    - answer_col = dataframe column whose cells contain answer texts, as lists (answers)
        of lists (sentences) of word tokens
    - stopwords_list = words to remove, (default) English stopwords from nltk.corpus
    - keep_neg = if True (default), the words 'no', 'nor', 'not', 'only', 'up', 'down',
        'further', 'too' and 'against' are taken out of the stop-word list first, so they
        stay in the text

    Return a list with one entry per answer: a list of sentences, each a list of the
    remaining tokens. Empty answers are returned as an empty string. The number of
    sentences of every non-empty answer is printed as a side effect.
    """

    if keep_neg :
        # words that must survive the filtering
        protected = ['no', 'nor', 'not', 'only', 'up', 'down', 'further', 'too', 'against']
        stopwords_list = [word for word in stopwords_list if word not in protected]

    filtered_answers = []

    for answer in answer_col :

        if answer :
            print('No. of sentence in the answer = ' + str(len(answer)))

            # sentences stay separate lists within their answer
            kept_sentences = [[token for token in sent if token not in stopwords_list]
                              for sent in answer]
            filtered_answers.append(kept_sentences)

        else :
            filtered_answers.append("")

    return filtered_answers
           

In [204]:
print(dummy_df['text2'])
dummy_df['text3'] = remove_stopwords(dummy_df['text2'], stopwords_list=stopwords.words('english'))
print(dummy_df['text3'])

0                                                     
1               [[i, love, double, cream, ice, cream]]
2    [[i, hate, fudge, .], [but, ,, i, like, caramel]]
3                    [[pseudo, science, .], [i, care]]
Name: text2, dtype: object
No. of sentence in the answer = 1
No. of sentence in the answer = 2
No. of sentence in the answer = 2
0                                          
1       [[love, double, cream, ice, cream]]
2    [[hate, fudge, .], [,, like, caramel]]
3            [[pseudo, science, .], [care]]
Name: text3, dtype: object


In [205]:
dummy_df.columns

Index(['text', 'text1', 'text2', 'text3'], dtype='object')

In [207]:
# Function to part-of-speech tag sentences

from nltk import pos_tag


def POS_tagging(answer_col) :
    """
    Part-of-speech tag every sentence of every answer with `pos_tag`.

    Parameters:
    - answer_col = dataframe column containing answer texts, as lists (answers)
        of lists (sentences) of tokenised words

    Return a list with one entry per answer: a list of sentences, each being whatever
    `pos_tag` returns for that sentence's tokens. Empty answers are returned as an
    empty string.
    """

    def tag_sentences(sentences) :
        # each sentence is tagged independently of the others
        return [pos_tag(tokens) for tokens in sentences]

    return [tag_sentences(answer) if answer else "" for answer in answer_col]
                
            

In [225]:
print(dummy_df['text3'])
print(POS_tagging(dummy_df['text3']))
print(POS_tagging(dummy_df['text2']))
#print(dummy_df['text4'])

0                                          
1       [[love, double, cream, ice, cream]]
2    [[hate, fudge, .], [,, like, caramel]]
3            [[pseudo, science, .], [care]]
Name: text3, dtype: object
['', [[('love', 'NN'), ('double', 'JJ'), ('cream', 'NN'), ('ice', 'NN'), ('cream', 'NN')]], [[('hate', 'NN'), ('fudge', 'NN'), ('.', '.')], [(',', ','), ('like', 'IN'), ('caramel', 'NN')]], [[('pseudo', 'NN'), ('science', 'NN'), ('.', '.')], [('care', 'NN')]]]
['', [[('i', 'NN'), ('love', 'VBP'), ('double', 'JJ'), ('cream', 'NN'), ('ice', 'NN'), ('cream', 'NN')]], [[('i', 'JJ'), ('hate', 'NN'), ('fudge', 'NN'), ('.', '.')], [('but', 'CC'), (',', ','), ('i', 'VBP'), ('like', 'IN'), ('caramel', 'NN')]], [[('pseudo', 'NN'), ('science', 'NN'), ('.', '.')], [('i', 'NN'), ('care', 'NN')]]]


In [219]:
POS_tagging([[["I", 'not like', 'fudge'], ['I', "don't", 'like', 'fudge'], ['I', 'not', 'like', 'fudge']]])

[[[('I', 'PRP'), ('not like', 'VBP'), ('fudge', 'NN')],
  [('I', 'PRP'), ("don't", 'VBP'), ('like', 'IN'), ('fudge', 'NN')],
  [('I', 'PRP'), ('not', 'RB'), ('like', 'IN'), ('fudge', 'NN')]]]

In [220]:
POS_tagging([[["I", 'not buy', 'fudge'], ['I', "don't", 'buy', 'fudge'], ['I', 'not', 'buy', 'fudge']]])

[[[('I', 'PRP'), ('not buy', 'VBP'), ('fudge', 'NN')],
  [('I', 'PRP'), ("don't", 'VBP'), ('buy', 'VB'), ('fudge', 'NN')],
  [('I', 'PRP'), ('not', 'RB'), ('buy', 'VB'), ('fudge', 'NN')]]]

In [226]:
# TBC : should implement something like this...
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

# The following function maps the Penn Treebank tags to WordNet part-of-speech names:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    """
    Return the WordNet POS constant matching a Penn Treebank tag.

    Parameters:
    - treebank_tag = Penn Treebank POS tag, as a string (e.g. 'NN', 'VBP', 'JJ')

    Return wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV for tags starting with
    'J', 'V', 'N' or 'R' respectively, and an empty string for every other tag.
    """

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV

    # No 'S' branch: the only Penn Treebank tag starting with 'S' is 'SYM' (symbol),
    # which has no WordNet equivalent and must not be treated as a satellite adjective.
    return ''

In [None]:
# Function to lemmatise POS-tagged words, using the WordNet equivalent of their Treebank tags

# import get_wordnet_pos ?

def from_treebank_to_wordnet(POStag_col) :
    """
    Lemmatise POS-tagged words, using the WordNet equivalent of their Treebank POS tag.

    Parameters:
    - POStag_col = iterable of cells; each cell is a sequence of (word, Treebank POS tag) pairs

    Return a list with one list of lemmas per cell. A word whose tag has no WordNet
    equivalent (get_wordnet_pos returns '') is kept unchanged; an empty cell yields [""].

    NOTE(review): relies on a module-level `wordnet_lemmatiser` that is not defined next to
    this function - presumably a WordNetLemmatizer instance; confirm it is created before
    this function is called.
    NOTE(review): POS_tagging returns its pairs grouped by sentence (one level deeper than
    the flat cells handled here) - confirm which shape the callers pass in.
    """

    # collector for all cells
    lemma_big_bag = []

    for cell in POStag_col :

        lemma_bag = []

        # an answer was provided
        if len(cell) > 0 :

            for POStext_pair in cell :

                word, treebank_tag = POStext_pair[0], POStext_pair[1]

                # look the tag up once and reuse the result
                wordnet_pos = get_wordnet_pos(treebank_tag)

                # the treebank POS does not have a wordnet POS equivalent
                if wordnet_pos == '' :
                    lemma = word

                # the treebank POS does have a wordnet POS equivalent
                else :
                    lemma = wordnet_lemmatiser.lemmatize(word, pos=wordnet_pos)

                lemma_bag.append(lemma)

        else :

            lemma_bag.append(str(""))

        lemma_big_bag.append(lemma_bag)

    # hand the collected lemmas back to the caller (previously they were built and discarded)
    return lemma_big_bag

