#### Imports and Set Up

In [1]:
#### Imports and Set Up

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from nltk import pos_tag

from nltk.stem import PorterStemmer, WordNetLemmatizer
 
porter_stemmer = PorterStemmer()
wordnet_lemmatiser = WordNetLemmatizer()




In [2]:
cwd = os.chdir("/Users/alessia/Documents/DataScience/NLP_Project/Data")
cwd

#### Get Data

In [3]:
cons1_df = pd.read_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_cleantext_SA_df.csv")

#### POS tagging answers

In [4]:
# Get columns' index
idx_Q1 = cons1_df.columns.get_loc(str([col for col in cons1_df if 'census methods' in str(col)][0]))
idx_Q4 = cons1_df.columns.get_loc(str([col for col in cons1_df if '4. 1. ' in str(col)][0]))
idx_Q5 = cons1_df.columns.get_loc(str([col for col in cons1_df if '5. 1.' in str(col)][0]))
idx_Q8 = cons1_df.columns.get_loc(str([col for col in cons1_df if '8.' in str(col)][0]))

In [5]:
# Save them in a dictionary
col_idx_dict = {"Q1":idx_Q1, "Q4":idx_Q4, "Q5":idx_Q5, "Q8":idx_Q8}

print(col_idx_dict)
print(col_idx_dict.items())
print(col_idx_dict.values())

{'Q1': 41, 'Q4': 45, 'Q5': 47, 'Q8': 50}
dict_items([('Q1', 41), ('Q4', 45), ('Q5', 47), ('Q8', 50)])
dict_values([41, 45, 47, 50])


In [6]:
# Define function to tag Part-of-Speech of text answers

def tokenise_POS_text(data, col_ind, stop_words, no_stopwords = True, no_punctuation = True) :
    """Return a list with POS tags of specified data columns containing text after
    removing punctuation (default) and non-alphabetic tokens"""
    
    import pandas as pd
    from nltk.tokenize import word_tokenize
    
    from nltk import pos_tag
    
    # empty list collector
    tokens_bag = []
    
    for answer in data.iloc[:, col_ind] :   
        
        # no answer was provided, return NA
        if pd.isnull(answer) : 
            tokens_bag.append(np.nan)
            
        # an answer was provided    
        else : 
                
            # word-tokenise the answer
            words = word_tokenize(answer)
        
            # convert to lower case
            words = [w.lower() for w in words]
            
            
            if no_punctuation : # no_punctuation = True
                
                # remove punctuation 
                import string
                table = str.maketrans('', '', string.punctuation)
                words = [w.translate(table) for w in words]
                
                # remove remaining tokens that are not alphabetic
                only_words = [w for w in words if w.isalpha()]
                
            #else :
            #    continue
                
            
            ### THIS DOES NOT WORK... ###
            if no_stopwords :    # no_stopwirds = True
                
                # filter out stop words from each answer
                only_words = [w for w in only_words if not w in stop_words]
                
            #else :   
            #    continue
            
            
            # calculate Part-Of-Speech
            pos_answer = pos_tag(only_words)

            tokens_bag.append(pos_answer)
    
    return(tokens_bag)

In [7]:
#batch_pos_tag(cons1_df['Q1'])   # ERROR name 'batch_pos_tag' is not defined 

In [8]:
#tokenise_POS_text(cons1_df.iloc[:, ], col_ind=idx_Q8);

In [9]:
# Set stop words
stop_words = set(stopwords.words('english'))
stop_words.update('would', 'could', 'might', 'may')

# do we want to keep in "no"? "yes" is not a stopword...

In [10]:
print(type(stopwords.words('english')))
stop_words;

<class 'list'>


In [11]:
# Create new dataset columns containing the POS-tagged texts

for q, idx in col_idx_dict.items() :

    result = tokenise_POS_text(cons1_df, idx, stop_words=stop_words)
    new_q = q + '_pos'
    
    #print(type(result))
    #print(type(cons1_df.iloc[:, idx]))
    #print(type(new_q))
    
    se_result = pd.Series(result)      # had to turn this into a Pandas series first, otherwise ERROR
    #print(se_result[1:5])
    #print(cons1_df.iloc[:, idx].head())
    
    cons1_df.loc[:, new_q] = se_result


In [12]:
# Checks
cons1_df.columns.values
cons1_df.head(6)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Respondent ID,Collector ID,Start Date,End Date,IP Address,Email Address,First Name,Last Name,...,Q5_clean,Q8_clean,Q1_cl_sentiment,Q4_cl_sentiment,Q5_cl_sentiment,Q8_cl_sentiment,Q1_pos,Q4_pos,Q5_pos,Q8_pos
0,0,0,3001215611,45151668,2014-01-05 02:42:21,2014-01-05 02:44:13,49.224.154.245,,,,...,,,,,,,,,,
1,1,1,3001062135,45151668,2014-01-04 21:34:56,2014-01-04 21:35:12,79.69.231.100,,,,...,,,,,,,,,,
2,2,2,2990699680,45151668,2013-12-23 16:54:29,2013-12-23 17:00:18,109.148.186.17,,,,...,,,,,,,,,,
3,3,3,2990403881,45151668,2013-12-23 12:17:33,2013-12-23 12:29:22,217.36.37.20,,,,...,date statistics postcode sector equivalent lev...,essential changes census methodology thoroughl...,0.5719,0.6486,0.4404,0.891,"[(moving, VBG), (primarily, RB), (online, JJ),...","[(important, JJ), (census, NN), (provide, VBP)...","[(date, NN), (statistics, NNS), (postcode, VBP...","[(essential, JJ), (changes, NNS), (census, VBP..."
4,4,4,2985513376,45151668,2013-12-19 11:35:42,2013-12-19 11:43:35,86.12.129.3,,,,...,would allow council respond effectively changi...,measures must put place ensure one excluded on...,0.9848,0.836,0.959,0.4939,"[(regular, JJ), (full, JJ), (population, NN), ...","[(would, MD), (lose, VB), (ability, NN), (unde...","[(would, MD), (allow, VB), (council, NN), (res...","[(measures, NNS), (must, MD), (put, VB), (plac..."
5,5,5,2983385436,45151668,2013-12-18 11:07:44,2013-12-18 16:42:33,46.33.158.20,,,,...,,users census place premium current model howev...,0.9648,,,0.9657,"[(privacy, NN), (clear, JJ), (concern, NN), (w...",,,"[(users, NNS), (census, VBP), (place, NN), (pr..."


#### Lemmatisation of answers

In [13]:
# Get columns' index of POS-tagged answers
idx_Q1p = cons1_df.columns.get_loc('Q1_pos')
idx_Q4p = cons1_df.columns.get_loc('Q4_pos')
idx_Q5p = cons1_df.columns.get_loc('Q5_pos')
idx_Q8p = cons1_df.columns.get_loc('Q8_pos')

In [14]:
# Save them in a dictionary
colpos_idx_dict = {"Q1_pos":idx_Q1p, "Q4_pos":idx_Q4p, "Q5_pos":idx_Q5p, "Q8_pos":idx_Q8p}

In [15]:
# Replace float nan's with empty srings

# Create new dataset columns containing the POS-tagged texts

for q, idx in colpos_idx_dict.items() :
    
    cons1_df.iloc[:, idx] = cons1_df.iloc[:, idx].replace(np.nan, '', regex=True)


In [16]:
print(type(cons1_df.iloc[:, idx].head(4)))
print(cons1_df.iloc[:,].head(4))

<class 'pandas.core.series.Series'>
   Unnamed: 0  Unnamed: 0.1  Respondent ID  Collector ID           Start Date  \
0           0             0     3001215611      45151668  2014-01-05 02:42:21   
1           1             1     3001062135      45151668  2014-01-04 21:34:56   
2           2             2     2990699680      45151668  2013-12-23 16:54:29   
3           3             3     2990403881      45151668  2013-12-23 12:17:33   

              End Date      IP Address  Email Address  First Name  Last Name  \
0  2014-01-05 02:44:13  49.224.154.245            NaN         NaN        NaN   
1  2014-01-04 21:35:12   79.69.231.100            NaN         NaN        NaN   
2  2013-12-23 17:00:18  109.148.186.17            NaN         NaN        NaN   
3  2013-12-23 12:29:22    217.36.37.20            NaN         NaN        NaN   

                         ...                          \
0                        ...                           
1                        ...                 

In [17]:
# TBC : should impement something like this...
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

# The following function would map the treebank tags to WordNet part of speech names:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    elif treebank_tag.startswith('S'):
        return wordnet.ADJ_SAT
    else:
        return ''

In [18]:
get_wordnet_pos('MD')

''

In [27]:
# Create new dataset columns containing the POS-tagged texts

for q, idx in colpos_idx_dict.items() :

    # set new variable name
    new_q = q + '_lemma'
    
    
    # extract columns with answers
    answer_col = cons1_df.iloc[:, idx]
    
    answer_col = answer_col.tolist()
    #print(type(answer_col))
    #print(answer_col[1:4])
    
    
    # collector for all answers within that answer_col
    lemma_big_bag = []
    
    
    for answer in answer_col :
        
        lemma_bag = []
        
        #print(answer)
        #print(len(answer))
        
        # an answer was provided
        if len(answer) > 0 :
            
            for POStext_pair in answer :
                
                #print(POStext_pair[0])
                #print(POStext_pair[1])
                
                #print(type(POStext_pair[0]))
                #print(type(POStext_pair[1]))
                
                #print('wordnet pos = ' + get_wordnet_pos(POStext_pair[1]))
                #print('type wordnet pos = ' + str(type(get_wordnet_pos(POStext_pair[1]))))
                
                #print( get_wordnet_pos(POStext_pair[1]) == '')
                
                
                # the treebank POS does not have a wordnet POS equivalent
                if get_wordnet_pos(POStext_pair[1]) == '' :
                    
                    lemma = POStext_pair[0]
                    #print('lemma = ' + lemma)
                    #print(type(lemma))
                    
                
                
                # the treebank POS does have a wordnet POS equivalent
                else :
                    
                    lemma = wordnet_lemmatiser.lemmatize(POStext_pair[0], pos=get_wordnet_pos(POStext_pair[1]))
                
                    #print('lemma = ' + lemma)
                    #print(type(lemma))
                    
                
                lemma_bag.append(lemma)
                #print(lemma_bag)
                #print(type(lemma_bag))
                
        else :
            
            lemma_bag.append(str(""))
        
        
        lemma_big_bag.append(lemma_bag)
        
    
    
    se_lemma_result = pd.Series(lemma_big_bag)      # had to turn this into a Pandas series first, otherwise ERROR
    
    cons1_df.loc[:, new_q] = se_lemma_result



In [29]:
cons1_df.head(4)

   Unnamed: 0  Unnamed: 0.1  Respondent ID  Collector ID           Start Date  \
0           0             0     3001215611      45151668  2014-01-05 02:42:21   
1           1             1     3001062135      45151668  2014-01-04 21:34:56   
2           2             2     2990699680      45151668  2013-12-23 16:54:29   
3           3             3     2990403881      45151668  2013-12-23 12:17:33   

              End Date      IP Address  Email Address  First Name  Last Name  \
0  2014-01-05 02:44:13  49.224.154.245            NaN         NaN        NaN   
1  2014-01-04 21:35:12   79.69.231.100            NaN         NaN        NaN   
2  2013-12-23 17:00:18  109.148.186.17            NaN         NaN        NaN   
3  2013-12-23 12:29:22    217.36.37.20            NaN         NaN        NaN   

                         ...                          Q5_cl_sentiment  \
0                        ...                                      NaN   
1                        ...                   