#### READ ME

In this notebook, we perform a basic sentiment analysis of the answers to consultation questions 1, 4, 5, and 8, using the VADER sentiment analyser from Python's NLTK library.

#### 1. Imports and Set Up

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# Set up working directory

# os.chdir() changes the directory in place and returns None, so record the
# resulting location with os.getcwd() instead of binding `cwd` to None.
os.chdir('/Users/alessia/Documents/DataScience/NLP_Project/Data')
cwd = os.getcwd()

#### 2. Get Data

In [4]:
# Load the raw consultation export.
# The header is spread over two rows, so header=None keeps both of them as
# ordinary rows instead of promoting only the first one to column labels.
cons0_df = pd.read_excel(
    "The CensusCopy.xlsx",
    header=None,
)

#### 3. Transform Data

3.1. Combine the headers - currently spread over two rows - into a single row

In [5]:
# Peek at the first three rows (the trailing ';' suppresses the cell output)
cons0_df.head(n=3);

In [6]:
print(cons0_df.shape)  # expected: (1110, 50)

(1110, 50)


In [7]:
# Row 1:

# Propagate non-null values forward along each row, so that a cell holding
# NaN takes the value of the cell before it; then keep only the first row
# (as a 2-D array).
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
row1 = cons0_df.ffill(axis=1).values[:1, :]

In [8]:
# Sanity checks on row1: number of dimensions, shape (expected (1, 50)),
# then the first and last values
print(row1.ndim, row1.shape, row1[:, [0, -1]], sep='\n')

2
(1, 50)
[['Respondent ID'
  '9. Are there any other issues that you believe we should be taking into account?']]


In [9]:
# Row 2:

# Swap NaN for an empty string first (NaN is a float, and the header cells
# should all be strings), then keep only the second row as a 2-D array.
row2 = cons0_df.fillna(value='').values[1:2]

In [10]:
# Sanity checks on row2: type, number of dimensions, shape (expected (1, 50)),
# then the first and last values
print(type(row2), row2.ndim, row2.shape, row2[:, [0, -1]], sep='\n')

<class 'numpy.ndarray'>
2
(1, 50)
[['' 'Open-Ended Response']]


In [11]:
# Merge row1 and row2 into a single "header" row.
# The addition is element-wise, so each pair of cells is joined with `+`.
header_row = np.add(row1, row2)

3.2. Reconstruct the dataframe

In [12]:
# Wrap the combined header row in its own DataFrame
header_row_df = pd.DataFrame(data=header_row)

# All remaining rows (third row onwards) go into a second DataFrame
data_values_df = pd.DataFrame(data=cons0_df.values[2:])


In [13]:
# Stack the header row on top of the data rows, renumbering the index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same frame.
cons1_df = pd.concat([header_row_df, data_values_df], ignore_index=True)

In [14]:
# Promote the first row to column labels ...
cons1_df.columns = cons1_df.iloc[0]

# ... and remove that row, which now only duplicates the labels
cons1_df = cons1_df.drop(labels=0, axis=0)

In [15]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as an extra column
cons1_df = cons1_df.reset_index(drop=True)

In [16]:
# Sanity checks: the first eight column labels, then the last one
print(cons1_df.columns.values[:8], cons1_df.columns.values[-1:], sep='\n')

['Respondent ID' 'Collector ID' 'Start Date' 'End Date' 'IP Address'
 'Email Address' 'First Name' 'Last Name']
[ '9. Are there any other issues that you believe we should be taking into account?Open-Ended Response']


#### 4. Sentiment Analysis of questions 1, 4, 5 and 8

4.1. Define function to calculate polarity score for the answers in our dataset

In [17]:
# Define function to calculate polarity score for the answers in our dataset

# import key modules
# (top-level statements must start at column 0; indenting them is an
# IndentationError in Python)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared VADER analyser, reused for every answer that gets scored
analyser = SentimentIntensityAnalyzer()
    

def get_sentiment_score(data, col_ind, scorer=None):
    """Return the list of compound polarity scores for one column of answers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the answers.
    col_ind : int
        Positional index of the column to score.
    scorer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. Defaults to the module-level ``analyser``.

    Returns
    -------
    list
        One entry per row, in row order: ``np.nan`` where no answer was
        provided, otherwise the ``'compound'`` score of the answer.
    """
    if scorer is None:
        # Fall back to the shared analyser created at module level
        scorer = analyser

    # collector of scores, one per row
    sentiment_bag = []

    for answer in data.iloc[:, col_ind]:
        if pd.isnull(answer):
            # no answer was provided, record NA
            sentiment_bag.append(np.nan)
        else:
            # Coerce to str: cells holding a bare number or date are read
            # from the spreadsheet as non-string objects, which are not valid
            # text input for the analyser.
            sentiment_bag.append(scorer.polarity_scores(str(answer))['compound'])

    return sentiment_bag
    

4.2. Calculate Sentiment Score for answers to relevant questions: Q1, Q4, Q5, Q8

In [18]:
# Get column index of questions

def _find_column_position(frame, fragment):
    """Return the position of the first column whose label contains `fragment`.

    Labels are compared through str(), so non-string labels are tolerated.
    Raises IndexError when no label matches.
    """
    for position, label in enumerate(frame.columns):
        if fragment in str(label):
            return position
    # Same exception type as indexing an empty list of matches, but with a
    # message that names the missing fragment.
    raise IndexError("no column label contains {!r}".format(fragment))


idx_Q1 = _find_column_position(cons1_df, 'census methods')
idx_Q4 = _find_column_position(cons1_df, '4. 1. ')
idx_Q5 = _find_column_position(cons1_df, '5. 1.')
idx_Q8 = _find_column_position(cons1_df, '8.')


In [19]:
# Display the four question column positions
(idx_Q1, idx_Q4, idx_Q5, idx_Q8)

(39, 43, 45, 48)

In [20]:
# Score the answers to each question and store the result as a new column
# of the dataset (order matters: Q1, Q4, Q5, Q8)
for _new_col, _answer_idx in (
    ('Q1_Sentiment', idx_Q1),
    ('Q4_Sentiment', idx_Q4),
    ('Q5_Sentiment', idx_Q5),
    ('Q8_Sentiment', idx_Q8),
):
    cons1_df.loc[:, _new_col] = get_sentiment_score(cons1_df, _answer_idx)



In [22]:
# Inspect each question column next to one of the last four columns
# (trailing ';' suppresses the cell output)
cons1_df.iloc[
    :, [idx_Q1, -4, idx_Q4, -3, idx_Q5, -2, idx_Q8, -1]
];

In [23]:
# Summary statistics
cons1_df.iloc[
    :, [idx_Q1, -4, idx_Q4, -3, idx_Q5, -2, idx_Q8, -1]
].describe()

Unnamed: 0,Q1_Sentiment,Q4_Sentiment,Q5_Sentiment,Q8_Sentiment
count,736.0,523.0,396.0,490.0
mean,0.388333,0.07341,0.33806,0.092375
std,0.523643,0.515057,0.425649,0.575458
min,-0.9817,-0.983,-0.9042,-0.9691
25%,0.0,-0.3182,0.0,-0.3612
50%,0.4939,0.0,0.4404,0.0
75%,0.866425,0.4404,0.690275,0.6339
max,0.9998,0.9999,0.9954,0.9988


In [26]:
# Write the dataset to disk as a UTF-8 encoded CSV file
cons1_df.to_csv(
    '/Users/alessia/Documents/DataScience/NLP_Project/Outputs/cons1_SA_df.csv',
    encoding='utf-8',
)