In [1]:
# Function to sentence-tokenise answers 

from nltk.tokenize import sent_tokenize
import pandas as pd

def sent_tokenise_answer(data, col_ind) :
    """ Split every answer held in one dataframe column into its sentences.

        data = name of the dataframe
        col_ind = index (position) of the column that contains the texts to be sentence-tokenised

        Return a list with one entry per row: a list of sentence strings, or an
        empty list when the cell is null (no answer was provided).
    """

    answers = data.iloc[:, col_ind]

    # null cell -> empty list; any other cell is handed to NLTK's sent_tokenize
    return [[] if pd.isnull(text) else sent_tokenize(text) for text in answers]


In [2]:
# Load only the first 5 rows of the lemmatised consultation answers.
# NOTE(review): this is an absolute path on one machine; a configurable data directory would make the notebook portable.
cons1_lemmas_csv = "/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_lemmas_df.csv"
cons1_df = pd.read_csv(cons1_lemmas_csv, nrows=5)

In [3]:
# Get column index of questions: take the first column whose name mentions 'census methods'
census_methods_cols = [name for name in cons1_df if 'census methods' in str(name)]
idx_Q1 = cons1_df.columns.get_loc(str(census_methods_cols[0]))

In [4]:
idx_Q1     # position of the 'census methods' question column (42 in this run)

42

In [5]:
# Function to word-tokenise sentences 

from nltk.tokenize import word_tokenize

def word_tokenise_answer(data, col_ind) :
    """ Word-tokenise every sentence of every answer held in one dataframe column.
        Required input, a column whose cells are lists containing sentences as strings.

        data = name of the dataframe
        col_ind = index (position) of the column that contains the list of sentences to be word-tokenised

        Return a list with one entry per row: a list of sentences, each sentence
        being a list of words as strings. Rows without an answer give an empty list.
    """

    tokenised_answers = []

    for sentences in data.iloc[:, col_ind] :

        if sentences :
            # an answer was provided: one list of words per sentence
            tokenised_answers.append([word_tokenize(sentence) for sentence in sentences])

        else :
            # no answer was provided: keep an empty placeholder for the row
            tokenised_answers.append([])

    return tokenised_answers


In [6]:
# Sanity check: every sentence produced for the answer in row 4 should be a str
[type(s) for s in sent_tokenise_answer(cons1_df, idx_Q1)[4]]

[str, str, str, str, str, str, str, str, str, str, str, str, str, str]

In [7]:
# Store the sentence-tokenised answers (one list of sentences per row) in a new 'test1' column
cons1_df.loc[:, 'test1'] = sent_tokenise_answer(cons1_df, idx_Q1)

In [8]:
test1_idx = cons1_df.columns.get_loc('test1')      # position of the 'test1' column (author noted 73)

In [9]:
# Show the new column, selected by label
cons1_df.loc[:, 'test1']

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [10]:
# Same column selected by position; this is how the tokenising helpers access it
cons1_df.iloc[:, test1_idx]

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [11]:
print(cons1_df.iloc[:, test1_idx])  # the sentence-tokenised column itself
# every cell should be a list (empty when no answer was given)
print([type(a) for a in cons1_df.iloc[:, test1_idx]])
# word-tokenise the sentences of each answer
print(word_tokenise_answer(cons1_df, test1_idx))

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object
[<class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>]
[[], [], [], [['Moving', 'to', 'a', 'primarily', 'online', 'census', ':', 'an', 'inevitable', 'and', 'necessary', 'evolution', 'of', 'the', 'existing', 'approach', '.'], ['Admin', 'data', 'and', 'surveys', ':', 'an', 'unknown', 'quantity', ',', 'dependent', 'on', 'the', 'quality', 'of', 'admin', 'data', ',', 'and', 'not', 'clear', 'how', 'well', 'it', 'would', 'fulfil', 'the', 'primary', 'aim', 'of', 'a', 'census', ':', 'to', 'produce', 'an', 'accurate', 'and', 'independent', 'estimate', 'of', 'the', 'size', 'and', 'composition', 'of', 'the', 'population', '.']], [['A', 'regular', 'full', 'population', 'census

In [12]:
# Store the word-tokenised answers (a list of word lists per row) in a new 'test2' column
cons1_df.loc[:, 'test2'] = word_tokenise_answer(cons1_df, test1_idx)

In [13]:
test2_idx = cons1_df.columns.get_loc('test2')      # position of the 'test2' column (author noted 74)

In [14]:
# Print the word-tokenised answer of each row in full.
# NOTE(review): the comprehension is used only for its print side effect, so the cell also echoes a list of None values.
[print(w) for w in cons1_df.iloc[:, test2_idx]]

[]
[]
[]
[['Moving', 'to', 'a', 'primarily', 'online', 'census', ':', 'an', 'inevitable', 'and', 'necessary', 'evolution', 'of', 'the', 'existing', 'approach', '.'], ['Admin', 'data', 'and', 'surveys', ':', 'an', 'unknown', 'quantity', ',', 'dependent', 'on', 'the', 'quality', 'of', 'admin', 'data', ',', 'and', 'not', 'clear', 'how', 'well', 'it', 'would', 'fulfil', 'the', 'primary', 'aim', 'of', 'a', 'census', ':', 'to', 'produce', 'an', 'accurate', 'and', 'independent', 'estimate', 'of', 'the', 'size', 'and', 'composition', 'of', 'the', 'population', '.']]
[['A', 'regular', 'full', 'population', 'census', 'is', 'absolutely', 'necessary', '.'], ['It', 'is', 'the', 'only', 'way', 'to', 'ensure', 'that', 'all', 'population', 'and', 'social', 'statistical', 'estimates', 'and', 'projections', 'are', 'grounded', 'in', 'reality', '.'], ['Data', 'for', 'small', 'areas', 'is', 'absolutely', 'necessary', '.'], ['Without', 'it', ',', 'local', 'authorities', 'would', 'be', 'unable', 'to', 'plan'

[None, None, None, None, None]

In [15]:
# Define function to calculate polarity score for the answers in our dataset

# import key modules
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One shared VADER analyser instance; get_sentiment_score reads this module-level name
analyser = SentimentIntensityAnalyzer()

from numpy import nan
    

def get_sentiment_score(data, col_ind, score_type = 'compound') :
    """ 
    Calculate sentiment analysis score (score_type: 'compound' default, 'pos', 'neg')
    for the sentence-tokenised answers in the specified dataframe column.

    data = name of the dataframe
    col_ind = index (position) of the column; each cell is expected to hold a list
              of sentences as strings (e.g. the output of sent_tokenise_answer)
    score_type = key of the analyser's polarity_scores() result to keep

    Return a list with one entry per row:
      - nan when no answer was provided (empty cell)
      - a single score when the answer is made of exactly one sentence
      - a list of scores, one score per sentence, when the answer has several sentences
    """

    # empty list collector of scores
    sentiment_bag = []

    for answer in data.iloc[:, col_ind] :

        # no answer was provided, return NA
        if not answer :
            sentiment_bag.append(nan)

        # answer is made of only 1 sentence
        elif len(answer) == 1 :
            # score the sentence itself, not the one-element list wrapping it:
            # polarity_scores works on text, and the list form fails
            sentiment_bag.append(analyser.polarity_scores(answer[0])[score_type])

        # answer contains more than one sentence
        else :
            sentiment_bag.append([analyser.polarity_scores(s)[score_type] for s in answer])

    return sentiment_bag
    



In [16]:
# test: compound scores for the sentence-tokenised answers stored in 'test1'
get_sentiment_score(cons1_df, test1_idx, 'compound')

[nan,
 nan,
 nan,
 [0.0, -0.4585],
 [0.0,
  0.3818,
  0.0,
  0.4404,
  0.0,
  0.0,
  0.4404,
  0.0,
  0.8481,
  0.7964,
  -0.1779,
  0.0,
  0.4404,
  0.4404]]

In [36]:
# answer after sent-tokenisation, before word-tokenisation
# (rows 3 onwards only: the first three rows of this sample hold no answer)
# NOTE(review): comprehension used only for its print side effect, hence the echoed list of None values
[print(list(s)) for s in cons1_df.iloc[3:, test1_idx]]

['Moving to a primarily online census: an inevitable and necessary evolution of the existing approach.', 'Admin data and surveys: an unknown quantity, dependent on the quality of admin data, and not clear how well it would fulfil the primary aim of a census: to produce an accurate and independent estimate of the size and composition of the population.']
['A regular full population census is absolutely necessary.', 'It is the only way to ensure that all population and social statistical estimates and projections are grounded in reality.', 'Data for small areas is absolutely necessary.', 'Without it, local authorities would be unable to plan services, target resources and measure performance effectively.', 'The cost to the country would exceed the cost of a decennial census.', 'Would suggest that both options should be carried out, if possible, rather than one or the other.', 'That way, we get the accurate picture every ten years, and a good/ useful indication of trends throughout the in

[None, None]

In [121]:
# Print the sentence list produced for every row (empty list when no answer was given)
# NOTE(review): comprehension used only for its print side effect, hence the echoed list of None values
[print(str(s)) for s in sent_tokenise_answer(cons1_df, idx_Q1)]

[]
[]
[]
['Moving to a primarily online census: an inevitable and necessary evolution of the existing approach.', 'Admin data and surveys: an unknown quantity, dependent on the quality of admin data, and not clear how well it would fulfil the primary aim of a census: to produce an accurate and independent estimate of the size and composition of the population.']
['A regular full population census is absolutely necessary.', 'It is the only way to ensure that all population and social statistical estimates and projections are grounded in reality.', 'Data for small areas is absolutely necessary.', 'Without it, local authorities would be unable to plan services, target resources and measure performance effectively.', 'The cost to the country would exceed the cost of a decennial census.', 'Would suggest that both options should be carried out, if possible, rather than one or the other.', 'That way, we get the accurate picture every ten years, and a good/ useful indication of trends througho

[None, None, None, None, None]

In [30]:
# Print the default ('compound') score(s) of every row: nan for empty answers, otherwise the per-sentence scores
# NOTE(review): comprehension used only for its print side effect, hence the echoed list of None values
[print(result) for result in get_sentiment_score(cons1_df, test1_idx)]

nan
nan
nan
[0.0, -0.4585]
[0.0, 0.3818, 0.0, 0.4404, 0.0, 0.0, 0.4404, 0.0, 0.8481, 0.7964, -0.1779, 0.0, 0.4404, 0.4404]


[None, None, None, None, None]

In [37]:
import numpy as np

# Score the answers once and reuse the result for both summaries.
sentence_scores = get_sentiment_score(cons1_df, test1_idx)

# Mean and median sentence score per answer. Rows without an answer hold nan and stay nan.
# The functions are qualified with np. because the bare names mean/median are not defined by this notebook's imports.
print([np.mean(np.array(result)) for result in sentence_scores])
print([np.median(np.array(result)) for result in sentence_scores])

[nan, nan, nan, -0.22925000000000001, 0.25785714285714284]
[nan, nan, nan, -0.22925000000000001, 0.19089999999999999]


  r = func(a, **kwargs)


In [86]:
# Print each word-tokenised answer; the trailing ; keeps the comprehension's list of None values from being echoed
[print(w) for w in cons1_df.iloc[:, test2_idx]];
[print(len(w)) for w in cons1_df.iloc[:, test2_idx]]    # count number of sentences in each answer
# Same count as a Series: the length of each cell's list of sentences
cons1_df.iloc[:, test2_idx].apply(len)

[]
[]
[]
[['Moving', 'to', 'a', 'primarily', 'online', 'census', ':', 'an', 'inevitable', 'and', 'necessary', 'evolution', 'of', 'the', 'existing', 'approach', '.'], ['Admin', 'data', 'and', 'surveys', ':', 'an', 'unknown', 'quantity', ',', 'dependent', 'on', 'the', 'quality', 'of', 'admin', 'data', ',', 'and', 'not', 'clear', 'how', 'well', 'it', 'would', 'fulfil', 'the', 'primary', 'aim', 'of', 'a', 'census', ':', 'to', 'produce', 'an', 'accurate', 'and', 'independent', 'estimate', 'of', 'the', 'size', 'and', 'composition', 'of', 'the', 'population', '.']]
[['A', 'regular', 'full', 'population', 'census', 'is', 'absolutely', 'necessary', '.'], ['It', 'is', 'the', 'only', 'way', 'to', 'ensure', 'that', 'all', 'population', 'and', 'social', 'statistical', 'estimates', 'and', 'projections', 'are', 'grounded', 'in', 'reality', '.'], ['Data', 'for', 'small', 'areas', 'is', 'absolutely', 'necessary', '.'], ['Without', 'it', ',', 'local', 'authorities', 'would', 'be', 'unable', 'to', 'plan'

0     0
1     0
2     0
3     2
4    14
Name: test2, dtype: int64

In [120]:
# Define function to break compound words (e.g. word1-word2) into their constituent words

import string

def break_words(answer_col, compound_symbol = '-') :
    """
    Break words that are of the compound form word1<symbol>word2 into the constituting words, then remove empty strings. 

    answer_col = iterable of answers; each answer is a list of sentences and each
                 sentence is a list of words as strings (e.g. the output of word_tokenise_answer)
    compound_symbol = compounding symbol to split on. Default compounding symbol = '-'

    Return a list with one entry per answer: an empty string when no answer was provided,
    otherwise a list of sentences, each sentence being a list of words.
    Also prints the number of sentences of every non-empty answer.
    """

    # empty list collector
    tokens_bag = []

    for answer in answer_col :

        # no answer was provided, return empty string
        if not answer :
            tokens_bag.append("")
            continue

        print('No. of sentence in the answer = ' + str(len(answer)))

        # one list of words per sentence, so sentences within an answer stay separate
        words_in_s = []

        for sent in answer :
            # Split every word on the symbol (a word without the symbol comes back unchanged)
            # and drop the empty strings left by leading/trailing symbols, all in a single pass
            # instead of re-filtering the accumulated list after each word.
            words = [part for w in sent for part in w.split(compound_symbol) if part]
            words_in_s.append(words)

        tokens_bag.append(words_in_s)

    return tokens_bag

In [87]:
# Number of sentences in each word-tokenised answer (0 where no answer was given)
cons1_df.iloc[:, test2_idx].apply(len)

0     0
1     0
2     0
3     2
4    14
Name: test2, dtype: int64

In [123]:
# Split compound words in the word-tokenised answers, using the default '-' symbol
break_words(cons1_df.iloc[:, test2_idx])

No. of sentence in the answer = 2
No. of sentence in the answer = 14


['',
 '',
 '',
 [['Moving',
   'to',
   'a',
   'primarily',
   'online',
   'census',
   ':',
   'an',
   'inevitable',
   'and',
   'necessary',
   'evolution',
   'of',
   'the',
   'existing',
   'approach',
   '.'],
  ['Admin',
   'data',
   'and',
   'surveys',
   ':',
   'an',
   'unknown',
   'quantity',
   ',',
   'dependent',
   'on',
   'the',
   'quality',
   'of',
   'admin',
   'data',
   ',',
   'and',
   'not',
   'clear',
   'how',
   'well',
   'it',
   'would',
   'fulfil',
   'the',
   'primary',
   'aim',
   'of',
   'a',
   'census',
   ':',
   'to',
   'produce',
   'an',
   'accurate',
   'and',
   'independent',
   'estimate',
   'of',
   'the',
   'size',
   'and',
   'composition',
   'of',
   'the',
   'population',
   '.']],
 [['A',
   'regular',
   'full',
   'population',
   'census',
   'is',
   'absolutely',
   'necessary',
   '.'],
  ['It',
   'is',
   'the',
   'only',
   'way',
   'to',
   'ensure',
   'that',
   'all',
   'population',
   'and',
   

In [67]:
# Toy answers for testing: an empty text, hyphenated compounds, multi-sentence texts and a token with a leading hyphen
text = pd.Series(["", 'I love double-cream ice-cream', 'I hate fudge. But, I like caramel', 'Pseudo-science. I -care'])

In [68]:
# Wrap the toy answers in a one-column dataframe named 'text', then display it
dummy_df = text.to_frame(name='text')
dummy_df

Unnamed: 0,text
0,
1,I love double-cream ice-cream
2,"I hate fudge. But, I like caramel"
3,Pseudo-science. I -care


In [96]:
print(dummy_df['text'])
# column 0 is 'text': sentence-tokenise the raw answers
dummy_df['text1'] = sent_tokenise_answer(dummy_df, 0)
# column 1 is the 'text1' column just added: word-tokenise its sentences
dummy_df['text2'] = word_tokenise_answer(dummy_df, 1)

0                                     
1        I love double-cream ice-cream
2    I hate fudge. But, I like caramel
3              Pseudo-science. I -care
Name: text, dtype: object


In [106]:
# The trailing ; hides the returned list, so only the messages printed by break_words are shown
break_words(cons1_df['test2']);

No. of sentence in the answer = 2
No. of sentence in the answer = 14


In [122]:
# Word-tokenised toy answers before splitting ...
print(dummy_df['text2'])
# ... and after splitting on '-': compounds are broken up and the stray leading hyphen of '-care' is dropped
break_words(dummy_df['text2'])

0                                                   []
1                 [[I, love, double-cream, ice-cream]]
2    [[I, hate, fudge, .], [But, ,, I, like, caramel]]
3                    [[Pseudo-science, .], [I, -care]]
Name: text2, dtype: object
No. of sentence in the answer = 1
No. of sentence in the answer = 2
No. of sentence in the answer = 2


['',
 [['I', 'love', 'double', 'cream', 'ice', 'cream']],
 [['I', 'hate', 'fudge', '.'], ['But', ',', 'I', 'like', 'caramel']],
 [['Pseudo', 'science', '.'], ['I', 'care']]]