#### INTRO

In this script, we POS tag and lemmatise the answers to the consultation. Steps:

1. Text cleaning: remove punctuation, non-alphabetic tokens, and specified stopwords (from the set of stopwords we exclude negations ('no', 'nor', 'not') and other words that may be used to express opinions ('only', 'up', 'down', 'further', 'too', 'against')).

2. We replace negation forms of auxiliary and modal verbs (by default contained in the stop words set for English) with the negation 'not'

3. We POS tag the answer using Treebank POS

4. We translate Treebank POS tags into WordNet POS tags

5. We lemmatise the answers using the WordNet lemmatizer


#### Imports and Set Up

In [1]:
#### Imports and Set Up

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from nltk import pos_tag

from nltk.stem import PorterStemmer, WordNetLemmatizer
 
porter_stemmer = PorterStemmer()
wordnet_lemmatiser = WordNetLemmatizer()




In [2]:
# Move into the project's data folder. os.chdir() returns None, so the new
# working directory is read back with os.getcwd() to have something to display.
# NOTE(review): absolute, user-specific path.
os.chdir("/Users/alessia/Documents/DataScience/NLP_Project/Data")
cwd = os.getcwd()
cwd

#### Get Data

In [3]:
# Load the consultation answers from a CSV produced outside this notebook
# (the file name suggests cleaned text plus sentiment-analysis columns — TODO confirm).
# NOTE(review): absolute, user-specific path; the cell only runs on a machine with this layout.
cons1_df = pd.read_csv("/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_cleantext_SA_df.csv")

#### POS tagging answers

In [4]:
# Locate the free-text answer columns from a fragment of their header.
# For every fragment the first matching column wins; its integer position is stored.

def _first_matching_col_idx(fragment):
    """Return the integer position of the first cons1_df column whose label contains `fragment`."""
    matching = [col for col in cons1_df if fragment in str(col)]
    return cons1_df.columns.get_loc(str(matching[0]))

idx_Q1 = _first_matching_col_idx('census methods')
idx_Q4 = _first_matching_col_idx('4. 1. ')
idx_Q5 = _first_matching_col_idx('5. 1.')
idx_Q8 = _first_matching_col_idx('8.')

In [5]:
# Map each question label to the position of its answer column
col_idx_dict = dict(zip(("Q1", "Q4", "Q5", "Q8"), (idx_Q1, idx_Q4, idx_Q5, idx_Q8)))

# Show the mapping, then its (label, position) pairs, then the positions alone
print(col_idx_dict, col_idx_dict.items(), col_idx_dict.values(), sep='\n')

{'Q1': 41, 'Q4': 45, 'Q5': 47, 'Q8': 50}
dict_items([('Q1', 41), ('Q4', 45), ('Q5', 47), ('Q8', 50)])
dict_values([41, 45, 47, 50])


In [132]:
# Define function to tag Part-of-Speech of text answers

import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk import pos_tag
    

def tokenise_POS_text(data, col_ind, stop_words, no_stopwords = True, no_punctuation = True, keep_neg=True) :
    """POS-tag every answer held in one column of `data`.

    Each non-missing answer is word-tokenised, lower-cased and split on hyphens
    ('word1-word2' -> 'word1', 'word2'). The optional cleaning steps below are then
    applied in this order, and the surviving tokens are passed to `pos_tag`.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the text answers.
    col_ind : int
        Integer position of the column to process (used with `data.iloc`).
    stop_words : iterable of str
        Lower-case tokens to discard when `no_stopwords` is True.
    no_stopwords : bool, default True
        Drop the tokens found in `stop_words`.
    no_punctuation : bool, default True
        Strip punctuation characters from every token, then drop the tokens that
        are not purely alphabetic. When False the tokens are left untouched.
    keep_neg : bool, default True
        Replace negated auxiliary/modal forms (e.g. "didn't", "isn", and the
        stand-alone "n't" token) with 'not', and never discard 'not' as a stop word.

    Returns
    -------
    list
        One entry per row, in row order: `np.nan` for a missing answer, otherwise
        whatever `pos_tag` returns for the cleaned tokens of that answer.
    """

    # Negated auxiliaries/modals that are collapsed into 'not'. "n't" is included
    # because NLTK's word_tokenize splits contractions ("didn't" -> "did", "n't").
    # NOTE(review): 'won' (the left-over of "won't") also matches the past tense of
    # "win"; it is kept because the original list had it — confirm this is wanted.
    negation_forms = {"n't",
                      "don't", "didn", "didn't", "doesn", "doesn't", 'hadn', "hadn't", 'hasn',
                      "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't",
                      'mustn', "mustn't", 'needn', "needn't", "shan't", 'shouldn',
                      "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
                      'wouldn', "wouldn't", 'aren', "aren't", 'couldn', "couldn't"}

    # built once, reused for every answer
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_set = set(stop_words) if no_stopwords else set()

    # one entry per row of the column
    tokens_bag = []

    for answer in data.iloc[:, col_ind] :

        # no answer was provided: keep the row as missing
        if pd.isnull(answer) :
            tokens_bag.append(np.nan)
            continue

        # word-tokenise the answer and convert to lower case
        tokens = [w.lower() for w in word_tokenize(str(answer))]

        # break words of the form word1-word2 into their constituting words;
        # empty fragments are dropped, a token made only of hyphens is kept whole
        split_tokens = []
        for w in tokens :
            parts = [p for p in w.split('-') if p]
            split_tokens.extend(parts if parts else [w])
        tokens = split_tokens

        if keep_neg :
            # must run before punctuation is stripped, otherwise "didn't" has
            # already turned into "didnt" and "n't" into "nt"
            tokens = ['not' if w in negation_forms else w for w in tokens]

        if no_punctuation :
            # strip punctuation characters, then keep alphabetic tokens only
            tokens = [w.translate(punctuation_table) for w in tokens]
            tokens = [w for w in tokens if w.isalpha()]

        if no_stopwords :
            # filter out stop words; 'not' survives whenever negations are to be kept
            tokens = [w for w in tokens
                      if w not in stop_set or (keep_neg and w == 'not')]

        # calculate Part-Of-Speech
        tokens_bag.append(pos_tag(tokens))

    return tokens_bag

In [95]:
#batch_pos_tag(cons1_df['Q1'])   # ERROR name 'batch_pos_tag' is not defined 

In [148]:
# Look up NLTK's default English stop-word list; the trailing ';' suppresses the cell output
stopwords.words('english');

In [97]:
# Build the project-specific stop-word list

stop_words = stopwords.words('english')

# Words that must stay in the text even though NLTK lists them as stop words:
# negations, plus a few words that can carry an opinion (only, up, down, ...)
words_to_keep = ('no', 'nor', 'not', 'only', 'up', 'down', 'further', 'too', 'against')

stop_words1 = list(filter(lambda w: w not in words_to_keep, stop_words))

In [133]:
# Create new dataset columns ('<question>_pos') containing the POS-tagged texts

for q, idx in col_idx_dict.items() :

    new_q = q + '_pos'
    result = tokenise_POS_text(cons1_df, idx, stop_words=stop_words1)

    # The list has to be wrapped in a Series before it can be assigned as a column
    # (each cell is either NaN or a list of (token, tag) pairs). The Series is given
    # cons1_df's own index because the assignment aligns on index labels: with a
    # default 0..n-1 index, rows would silently become NaN whenever cons1_df is not
    # indexed 0..n-1.
    se_result = pd.Series(result, index=cons1_df.index)

    cons1_df.loc[:, new_q] = se_result


In [136]:
# Checks: put one raw Q1 answer (row 41) next to its POS-tagged version
# None = never truncate long cell contents (the -1 sentinel is deprecated in favour of None)
pd.set_option('display.max_colwidth', None);
# The tagged column is picked by name: a fixed position such as -8 only points at
# 'Q1_pos' when four more columns have already been appended after the '_pos' ones.
cons1_df.iloc[41:42, [idx_Q1, cons1_df.columns.get_loc('Q1_pos')]];

#### Lemmatisation of answers

In [137]:
# Integer positions of the four POS-tagged answer columns
idx_Q1p, idx_Q4p, idx_Q5p, idx_Q8p = (
    cons1_df.columns.get_loc(col_name)
    for col_name in ('Q1_pos', 'Q4_pos', 'Q5_pos', 'Q8_pos')
)

In [138]:
# Collect those positions in a dictionary keyed by column name
colpos_idx_dict = dict(
    Q1_pos=idx_Q1p,
    Q4_pos=idx_Q4p,
    Q5_pos=idx_Q5p,
    Q8_pos=idx_Q8p,
)

In [139]:
# Replace float NaN's (missing answers) in the POS-tagged columns with empty strings,
# so that every cell has a length

for idx in colpos_idx_dict.values() :

    # fillna touches the missing cells only; cells holding lists are left as they are
    cons1_df.iloc[:, idx] = cons1_df.iloc[:, idx].fillna('')


In [149]:
# Quick inspection cell; the trailing ';' suppresses the output.
# NOTE: `idx` is not set in this cell — it is the value left behind by the last loop that ran.
type(cons1_df.iloc[:, idx].head(4));
cons1_df.iloc[:,].head(4);

In [141]:
# TBC : should implement something like this...
# https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

# The following function would map the treebank tags to WordNet part of speech names:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag is used: J* -> wordnet.ADJ, V* -> wordnet.VERB,
    N* -> wordnet.NOUN, R* -> wordnet.ADV.

    Parameters
    ----------
    treebank_tag : str
        Treebank tag such as 'NN', 'VBD' or 'JJR'.

    Returns
    -------
    str
        The WordNet POS constant, or '' when the tag has no WordNet equivalent
        (this includes an empty or missing tag).
    """
    if not treebank_tag:
        return ''

    # There is deliberately no 'S' entry: the only Treebank tag starting with 'S' is
    # 'SYM' (symbol), which must not be treated as a WordNet satellite adjective.
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    return first_letter_to_pos.get(treebank_tag[0], '')

In [142]:
get_wordnet_pos('MD')

''

In [143]:
# Create new dataset columns ('<pos column>_lemma') containing the lemmatised answers

def _lemmatise_tagged_answer(tagged_answer) :
    """Return the lemmas of one answer given as a list of (token, treebank_tag) pairs.

    A cell that holds no tagged tokens ('' , an empty list, or a missing value)
    gives [''], so every row ends up with a list.
    """

    # no answer was provided (or nothing survived the cleaning)
    if not isinstance(tagged_answer, (list, tuple)) or len(tagged_answer) == 0 :
        return [""]

    lemmas = []

    for token, treebank_tag in tagged_answer :

        wn_pos = get_wordnet_pos(treebank_tag)

        if wn_pos == '' :
            # the treebank POS does not have a wordnet POS equivalent: keep the token
            lemmas.append(token)
        else :
            # the treebank POS does have a wordnet POS equivalent
            lemmas.append(wordnet_lemmatiser.lemmatize(token, pos=wn_pos))

    return lemmas


for q, idx in colpos_idx_dict.items() :

    # set new variable name
    new_q = q + '_lemma'

    # one list of lemmas per answer of that column
    lemma_big_bag = [_lemmatise_tagged_answer(answer)
                     for answer in cons1_df.iloc[:, idx].tolist()]

    # The list has to be wrapped in a Series before assignment; it carries cons1_df's
    # own index because the assignment aligns on index labels, not on position.
    se_lemma_result = pd.Series(lemma_big_bag, index=cons1_df.index)

    cons1_df.loc[:, new_q] = se_lemma_result



In [150]:
cons1_df.head(4);

In [151]:
# Save dataset
# NOTE(review): the '_pos' and '_lemma' columns hold Python lists; a CSV stores their text
# representation, so they come back as plain strings when the file is read again.
# NOTE(review): absolute, user-specific output path.

cons1_df.to_csv('/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_lemmas_df.csv', encoding='utf-8')