In [118]:
# Function to sentence-tokenise answers 

from nltk.tokenize import sent_tokenize
import pandas as pd

def sent_tokenise_answer(data, col_ind) :
    
    """ Function to sentence-tokenise answers. Return a list of lists that contain sentences as strings.
        
        data = name of the dataframe
        col_ind = index of the column that contains the texts to be sentence-tokenised
    """
    
    sents_collector = []
    
    for answer in data.iloc[:, col_ind] :
        
        #if no answer was provided, return empty string list
        if pd.isnull(answer) :
            sents_collector.append(list(""))
            
        # an answer was provided    
        else :
            sents_collector.append(sent_tokenize(answer))
            
    return(sents_collector)


In [9]:
cons1_df = pd.read_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_lemmas_df.csv", nrows=5)

In [3]:
# Get column index of questions
idx_Q1 = cons1_df.columns.get_loc(str([col for col in cons1_df if 'census methods' in str(col)][0]))

In [115]:
# Function to word-tokenise sentences 

from nltk.tokenize import word_tokenize

def word_tokenise_answer(data, col_ind) :
    
    """ Function to word-tokenise answers' sentences. Return a list of lists of words as strings. 
        Required input, a list of lists containing sentences as strings.
        
        data = name of the dataframe
        col_ind = index of the column that contains the list of sentences to be word-tokenised
    """
    
    sents_collector = []
    
    for answer in data.iloc[:, col_ind] :
        
        words_collector = []
        
        #if no answer was provided, return empty string list
        if not answer :
            sents_collector.append(list(""))
            
        # an answer was provided    
        else :
            for sent in answer :
                words_collector.append(word_tokenize(sent))
                
        sents_collector.append(words_collector)
            
    return(sents_collector)


In [111]:
[type(s) for s in sent_tokenise_answer(cons1_df, idx_Q1)[4]]

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


[str, str, str, str, str, str, str, str, str, str, str, str, str, str]

In [23]:
cons1_df.loc[:, 'test1'] = sent_tokenise_answer(cons1_df, idx_Q1)

In [28]:
cons1_df.loc[:, 'test1']

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [38]:
cons1_df.iloc[:, 73]

0                                                   []
1                                                   []
2                                                   []
3    [Moving to a primarily online census: an inevi...
4    [A regular full population census is absolutel...
Name: test1, dtype: object

In [54]:
word_tokenise_answer(cons1_df, 73)

[[],
 [],
 [],
 [],
 [],
 [],
 [['Moving',
   'to',
   'a',
   'primarily',
   'online',
   'census',
   ':',
   'an',
   'inevitable',
   'and',
   'necessary',
   'evolution',
   'of',
   'the',
   'existing',
   'approach',
   '.'],
  ['Admin',
   'data',
   'and',
   'surveys',
   ':',
   'an',
   'unknown',
   'quantity',
   ',',
   'dependent',
   'on',
   'the',
   'quality',
   'of',
   'admin',
   'data',
   ',',
   'and',
   'not',
   'clear',
   'how',
   'well',
   'it',
   'would',
   'fulfil',
   'the',
   'primary',
   'aim',
   'of',
   'a',
   'census',
   ':',
   'to',
   'produce',
   'an',
   'accurate',
   'and',
   'independent',
   'estimate',
   'of',
   'the',
   'size',
   'and',
   'composition',
   'of',
   'the',
   'population',
   '.']],
 [['A',
   'regular',
   'full',
   'population',
   'census',
   'is',
   'absolutely',
   'necessary',
   '.'],
  ['It',
   'is',
   'the',
   'only',
   'way',
   'to',
   'ensure',
   'that',
   'all',
   'population'

In [55]:
# Define function to calculate polarity score for the answers in our dataset

# import key modules
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
    

def get_sentiment_score(data, col_ind) :
    """ Return list of polarity scores for values in the specified dataframe column """
    
    # empty list collector of scores
    sentiment_bag = []
    
    for answer in data.iloc[:, col_ind] :
        
        # no answer was provided, return NA
        if pd.isnull(answer) : 
            sentiment_bag.append(np.nan)
            
        else :
            sentiment_bag.append(analyser.polarity_scores(answer)['compound'])
    
    return(sentiment_bag)
    



In [86]:
[print(list(s)) for s in cons1_df.iloc[3:, 73]]

['Moving to a primarily online census: an inevitable and necessary evolution of the existing approach.', 'Admin data and surveys: an unknown quantity, dependent on the quality of admin data, and not clear how well it would fulfil the primary aim of a census: to produce an accurate and independent estimate of the size and composition of the population.']
['A regular full population census is absolutely necessary.', 'It is the only way to ensure that all population and social statistical estimates and projections are grounded in reality.', 'Data for small areas is absolutely necessary.', 'Without it, local authorities would be unable to plan services, target resources and measure performance effectively.', 'The cost to the country would exceed the cost of a decennial census.', 'Would suggest that both options should be carried out, if possible, rather than one or the other.', 'That way, we get the accurate picture every ten years, and a good/ useful indication of trends throughout the in

[None, None]

In [91]:
analyser.polarity_scores("I love you very much.")

{'compound': 0.6369, 'neg': 0.0, 'neu': 0.417, 'pos': 0.583}

In [93]:
analyser.polarity_scores("But I hate you too a bit.")

{'compound': -0.5719, 'neg': 0.481, 'neu': 0.519, 'pos': 0.0}

In [94]:
analyser.polarity_scores("I love you very much. But I hate you too a bit.")

{'compound': 0.0609, 'neg': 0.261, 'neu': 0.462, 'pos': 0.277}

In [96]:
analyser.polarity_scores()

TypeError: polarity_scores() takes 2 positional arguments but 3 were given

In [121]:
[print(str(s)) for s in sent_tokenise_answer(cons1_df, idx_Q1)]

[]
[]
[]
['Moving to a primarily online census: an inevitable and necessary evolution of the existing approach.', 'Admin data and surveys: an unknown quantity, dependent on the quality of admin data, and not clear how well it would fulfil the primary aim of a census: to produce an accurate and independent estimate of the size and composition of the population.']
['A regular full population census is absolutely necessary.', 'It is the only way to ensure that all population and social statistical estimates and projections are grounded in reality.', 'Data for small areas is absolutely necessary.', 'Without it, local authorities would be unable to plan services, target resources and measure performance effectively.', 'The cost to the country would exceed the cost of a decennial census.', 'Would suggest that both options should be carried out, if possible, rather than one or the other.', 'That way, we get the accurate picture every ten years, and a good/ useful indication of trends througho

[None, None, None, None, None]

In [127]:
[print(analyser.polarity_scores(s)) for s in ["Admin data and surveys: an unknown quantity, dependent on the quality of admin data, and not clear how well it would fulfil the primary aim of a census: to produce an accurate and independent estimate of the size and composition of the population.", "The ability to do that would be severely hampered is detailed small area population data were not available."]]

{'neg': 0.091, 'neu': 0.909, 'pos': 0.0, 'compound': -0.4585}
{'neg': 0.141, 'neu': 0.751, 'pos': 0.108, 'compound': -0.1779}


[None, None]