In [6]:
import pandas as pd, os, json, numpy as np
from datetime import datetime as dt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

import nltk
nltk.download('vader_lexicon')

from constants import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mkrajcovic\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
# Read both input tables from ../data: the raw results first, then the notes.
data, notes = (
    pd.read_csv(os.path.join('..', 'data', file_name))
    for file_name in ('results-raw.csv', 'results-notes.csv')
)

# Data preparation

In [8]:
# Answer columns: names starting with 'T', 'pre' or 'post' that do not end in 'Q'.
answer_columns = [
    name for name in data.columns
    if name.startswith(('T', 'pre', 'post')) and not name.endswith('Q')
]
# Note columns: every column of the notes table whose name ends in 'Note'.
note_columns = [name for name in notes.columns if name.endswith('Note')]

In [9]:
# Column stems shared by both feature tables: every question id, first bare
# (e.g. 'pre1') and then with each of the 'F1'..'F3' suffixes (e.g. 'pre1F1').
# Generating them keeps the two tables' columns in lock-step and in the same
# order as the previous hand-written lists.
_answer_stems = [
    question + suffix
    for question in ('pre1', 'pre2', 'T1Q1', 'T1Q2', 'T1Q3', 'T2Q1', 'T2Q2', 'T2Q3', 'post1')
    for suffix in ('', 'F1', 'F2', 'F3')
]

# word counts: one row per row of `data`, every counter starts at integer 0
word_counts = pd.DataFrame(
    data={'id': data.id, **{stem + 'WordCount': 0 for stem in _answer_stems}},
    index=data.index
)

# sentiments: one row per row of `data`, every score starts at 0.0.
# The float dtype comes from the 0.0 seed value rather than from a frame-wide
# dtype='float', which would also cast the `id` join key to float (and can
# fail outright when ids are not numeric).
sentiments = pd.DataFrame(
    data={'id': data.id, **{stem + 'Sentiment': 0.0 for stem in _answer_stems}},
    index=data.index
)

In [10]:
sia = SentimentIntensityAnalyzer()

# NOTE(review): word_tokenize and stopwords need NLTK tokenizer / stop-word
# data; the visible cells only download 'vader_lexicon' - confirm the other
# resources are installed on a fresh environment.

# Build the stop-word lookup once instead of re-reading the corpus for every
# token. No language is passed (as before), so the combined list of all
# languages in the corpus is used; a frozenset gives constant-time membership.
stop_words = frozenset(stopwords.words())

# calculate word counts and sentiment
for index, row in data.iterrows():
    for column in answer_columns:
        answer = row[column]
        count = 0
        sentiment = None  # non-text cells (e.g. missing answers) get no score
        if isinstance(answer, str):
            # Keep purely alphabetic tokens, lower-cased.
            words = [token.lower() for token in word_tokenize(answer) if token.isalpha()]
            # isalpha() is checked again after lower-casing to filter exactly
            # the same tokens as the previous two-pass implementation.
            count = sum(1 for word in words if word.isalpha() and word not in stop_words)
            # VADER scores the original, unfiltered answer text.
            sentiment = sia.polarity_scores(answer)['compound']
        word_counts.loc[index, column + 'WordCount'] = count
        sentiments.loc[index, column + 'Sentiment'] = sentiment


In [11]:
# Combine everything on the shared `id` key (inner joins): the notes are
# extended with the word counts and the sentiments, and that result is then
# attached to the raw data.
merged = data.merge(
    notes.merge(word_counts, on='id').merge(sentiments, on='id'),
    on='id',
)

In [12]:
# Write the merged table to ../data/results.csv; the row index is left out.
results_path = os.path.join('..', 'data', 'results.csv')
merged.to_csv(results_path, index=False)