#### Imports and Set Up

In [129]:
#### Imports and Set Up

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from nltk import pos_tag

from nltk.stem import PorterStemmer, WordNetLemmatizer
 
porter_stemmer = PorterStemmer()
wordnet_lemmatiser = WordNetLemmatizer()


In [2]:
cwd = os.chdir("/Users/alessia/Documents/DataScience/NLP_Project/Data")
cwd

#### Get Data

In [104]:
cons1_df = pd.read_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_cleantext_SA_df.csv")

#### POS tagging answers

In [105]:
# Get columns' index
idx_Q1 = cons1_df.columns.get_loc(str([col for col in cons1_df if 'census methods' in str(col)][0]))
idx_Q4 = cons1_df.columns.get_loc(str([col for col in cons1_df if '4. 1. ' in str(col)][0]))
idx_Q5 = cons1_df.columns.get_loc(str([col for col in cons1_df if '5. 1.' in str(col)][0]))
idx_Q8 = cons1_df.columns.get_loc(str([col for col in cons1_df if '8.' in str(col)][0]))

In [106]:
# Save them in a dictionary
col_idx_dict = {"Q1":idx_Q1, "Q4":idx_Q4, "Q5":idx_Q5, "Q8":idx_Q8}

print(col_idx_dict)
print(col_idx_dict.items())
print(col_idx_dict.values())

{'Q1': 41, 'Q4': 45, 'Q5': 47, 'Q8': 50}
dict_items([('Q1', 41), ('Q4', 45), ('Q5', 47), ('Q8', 50)])
dict_values([41, 45, 47, 50])


In [107]:
# Define function to tag Part-of-Speech of text answers

def tokenise_POS_text(data, col_ind, no_stopwords = False, no_punctuation = True) :
    """Return a list with POS tags of specified data columns containing text after
    removing punctuation (default) and non-alphabetic tokens"""
    
    import pandas as pd
    from nltk.tokenize import word_tokenize
    
    from nltk import pos_tag
    
    # empty list collector
    tokens_bag = []
    
    for answer in data.iloc[:, col_ind] :   
        
        # no answer was provided, return NA
        if pd.isnull(answer) : 
            tokens_bag.append(np.nan)
            
        # an answer was provided    
        else : 
                
            # word-tokenise the answer
            words = word_tokenize(answer)
        
            # convert to lower case
            words = [w.lower() for w in words]
            
            
            if no_punctuation : # no_punctuation = True
                
                # remove punctuation 
                import string
                table = str.maketrans('', '', string.punctuation)
                words = [w.translate(table) for w in words]
                
                # remove remaining tokens that are not alphabetic
                only_words = [w for w in words if w.isalpha()]
                
            else :
                
                continue
                
            
            ### THIS DOES NOT WORK... ###
            #if no_stopwords :    # no_stopwirds = True
                
                # filter out stop words from each answer
            #    only_words = [w for w in only_words if not w in stop_words]
                
            #else :
                
            #    continue
            
            
            # calculate Part-Of-Speech
            pos_answer = pos_tag(only_words)
            
                
            # untokenise the sentence: return one unique string for each answer in prep for sentiment analysis
            #from nltk.tokenize.moses import MosesDetokenizer
            #detokenizer = MosesDetokenizer()
        
            #filtered_answer = detokenizer.detokenize(pos_answer, return_str=True)

            tokens_bag.append(pos_answer)
    
    return(tokens_bag)

In [108]:
#batch_pos_tag(cons1_df['Q1'])   # ERROR name 'batch_pos_tag' is not defined 

In [109]:
#tokenise_POS_text(cons1_df.iloc[:, ], col_ind=idx_Q8);

In [110]:
# Create new dataset columns containing the POS-tagged texts

for q, idx in col_idx_dict.items() :

    result = tokenise_POS_text(cons1_df, idx)
    new_q = q + '_pos'
    
    #print(type(result))
    #print(type(cons1_df.iloc[:, idx]))
    #print(type(new_q))
    
    se_result = pd.Series(result)      # had to turn this into a Pandas series first, otherwise ERROR
    #print(se_result[1:5])
    #print(cons1_df.iloc[:, idx].head())
    
    cons1_df.loc[:, new_q] = se_result


In [111]:
# Checks
cons1_df.columns.values
cons1_df.head(6)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Respondent ID,Collector ID,Start Date,End Date,IP Address,Email Address,First Name,Last Name,...,Q5_clean,Q8_clean,Q1_cl_sentiment,Q4_cl_sentiment,Q5_cl_sentiment,Q8_cl_sentiment,Q1_pos,Q4_pos,Q5_pos,Q8_pos
0,0,0,3001215611,45151668,2014-01-05 02:42:21,2014-01-05 02:44:13,49.224.154.245,,,,...,,,,,,,,,,
1,1,1,3001062135,45151668,2014-01-04 21:34:56,2014-01-04 21:35:12,79.69.231.100,,,,...,,,,,,,,,,
2,2,2,2990699680,45151668,2013-12-23 16:54:29,2013-12-23 17:00:18,109.148.186.17,,,,...,,,,,,,,,,
3,3,3,2990403881,45151668,2013-12-23 12:17:33,2013-12-23 12:29:22,217.36.37.20,,,,...,date statistics postcode sector equivalent lev...,essential changes census methodology thoroughl...,0.5719,0.6486,0.4404,0.891,"[(moving, VBG), (to, TO), (a, DT), (primarily,...","[(it, PRP), (is, VBZ), (important, JJ), (for, ...","[(up, RB), (to, TO), (date, NN), (statistics, ...","[(it, PRP), (is, VBZ), (essential, JJ), (that,..."
4,4,4,2985513376,45151668,2013-12-19 11:35:42,2013-12-19 11:43:35,86.12.129.3,,,,...,would allow council respond effectively changi...,measures must put place ensure one excluded on...,0.9848,0.836,0.959,0.4939,"[(a, DT), (regular, JJ), (full, JJ), (populati...","[(would, MD), (lose, VB), (the, DT), (ability,...","[(it, PRP), (would, MD), (allow, VB), (the, DT...","[(measures, NNS), (must, MD), (be, VB), (put, ..."
5,5,5,2983385436,45151668,2013-12-18 11:07:44,2013-12-18 16:42:33,46.33.158.20,,,,...,,users census place premium current model howev...,0.9648,,,0.9657,"[(privacy, NN), (is, VBZ), (a, DT), (clear, JJ...",,,"[(there, EX), (are, VBP), (some, DT), (users, ..."


#### Lemmatisation of answers

In [112]:
# Get columns' index of POS-tagged answers
idx_Q1p = cons1_df.columns.get_loc('Q1_pos')
idx_Q4p = cons1_df.columns.get_loc('Q4_pos')
idx_Q5p = cons1_df.columns.get_loc('Q5_pos')
idx_Q8p = cons1_df.columns.get_loc('Q8_pos')

In [113]:
# Save them in a dictionary
colpos_idx_dict = {"Q1_pos":idx_Q1p, "Q4_pos":idx_Q4p, "Q5_pos":idx_Q5p, "Q8_pos":idx_Q8p}

In [114]:
# Replace float nan's with empty srings

# Create new dataset columns containing the POS-tagged texts

for q, idx in colpos_idx_dict.items() :
    
    cons1_df.iloc[:, idx] = cons1_df.iloc[:, idx].replace(np.nan, '', regex=True)


In [116]:
print(type(cons1_df.iloc[:, idx].head()))
print(cons1_df.iloc[:,].head())

<class 'pandas.core.series.Series'>
   Unnamed: 0  Unnamed: 0.1  Respondent ID  Collector ID           Start Date  \
0           0             0     3001215611      45151668  2014-01-05 02:42:21   
1           1             1     3001062135      45151668  2014-01-04 21:34:56   
2           2             2     2990699680      45151668  2013-12-23 16:54:29   
3           3             3     2990403881      45151668  2013-12-23 12:17:33   
4           4             4     2985513376      45151668  2013-12-19 11:35:42   

              End Date      IP Address  Email Address  First Name  Last Name  \
0  2014-01-05 02:44:13  49.224.154.245            NaN         NaN        NaN   
1  2014-01-04 21:35:12   79.69.231.100            NaN         NaN        NaN   
2  2013-12-23 17:00:18  109.148.186.17            NaN         NaN        NaN   
3  2013-12-23 12:29:22    217.36.37.20            NaN         NaN        NaN   
4  2013-12-19 11:43:35     86.12.129.3            NaN         NaN        NaN 

In [131]:
# Create new dataset columns containing the POS-tagged texts

for q, idx in colpos_idx_dict.items() :

    lemma_bag = []

    # set new variable name
    new_q = q + '_lemma'
    
    
    # extract columns with answers
    answer_col = cons1_df.iloc[:, idx]
    
    answer_col = answer_col.tolist()
    print(type(answer_col))
    #print(answer_col[1:4])
    
    
    
    for answer in answer_col :
        
        print(answer)
        print(len(answer))
        
        if answer == '' :
            continue
            
        else :
            
            for POStext_pair in answer :
                
                print(POStext_pair[0])
                print(POStext_pair[1])
                
                print(type(POStext_pair[0]))
                print(type(POStext_pair[1]))
                
                lemma = wordnet_lemmatiser.lemmatize(POStext_pair[0], pos=POStext_pair[1])
                
                print(lemma)
                #lemma_bag = lemma_bag.append(lemma)
                #print(lemma_bag)
                                         
        
        
    
    
    
    #se_result = pd.Series(result)      # had to turn this into a Pandas series first, otherwise ERROR
    
    #cons1_df.loc[:, new_q] = se_result



<class 'list'>

0

0

0
[('moving', 'VBG'), ('to', 'TO'), ('a', 'DT'), ('primarily', 'RB'), ('online', 'JJ'), ('census', 'NN'), ('an', 'DT'), ('inevitable', 'JJ'), ('and', 'CC'), ('necessary', 'JJ'), ('evolution', 'NN'), ('of', 'IN'), ('the', 'DT'), ('existing', 'VBG'), ('approach', 'NN'), ('admin', 'NN'), ('data', 'NNS'), ('and', 'CC'), ('surveys', 'NNS'), ('an', 'DT'), ('unknown', 'JJ'), ('quantity', 'NN'), ('dependent', 'NN'), ('on', 'IN'), ('the', 'DT'), ('quality', 'NN'), ('of', 'IN'), ('admin', 'NN'), ('data', 'NNS'), ('and', 'CC'), ('not', 'RB'), ('clear', 'JJ'), ('how', 'WRB'), ('well', 'RB'), ('it', 'PRP'), ('would', 'MD'), ('fulfil', 'VB'), ('the', 'DT'), ('primary', 'JJ'), ('aim', 'NN'), ('of', 'IN'), ('a', 'DT'), ('census', 'NN'), ('to', 'TO'), ('produce', 'VB'), ('an', 'DT'), ('accurate', 'NN'), ('and', 'CC'), ('independent', 'JJ'), ('estimate', 'NN'), ('of', 'IN'), ('the', 'DT'), ('size', 'NN'), ('and', 'CC'), ('composition', 'NN'), ('of', 'IN'), ('the', 'DT'), ('populati

KeyError: 'VBG'

In [None]:
# TBC : should impement something like this...
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''