In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

import csv
from collections import Counter

In [2]:
def distribution(records, question):
    "Get distribution of answers, for a given question."
    c = Counter(record[question] for record in records)
    total = sum(c.values())
    empty = c['']
    counts = {key: {"number": value, 
                    "percentage": (value/total) * 100, 
                    "percentage_answered": (value/(total-empty)) * 100} 
                for key, value in c.items()}
    try:
        del counts['']['percentage_answered']
    except:
        pass
    return counts


def get_questions(question, number):
    "Get questions for a range of questions in a grid."
    texts = []
    for i in range(1,number+1):
        item = f'Q{question}_{i}'
        text = questions[item]
        text = text.split('-')[-1].strip()
        texts.append(text)
    return texts


def get_texts(records, question):
    "Get answer texts."
    texts = []
    for record in records:
        answer = record[question]
        identifier = record['ResponseId']
        if not answer == '':
            texts.append([identifier, answer])
    return texts


def basic_stats(records, question):
    "Print basic statistics about the results."
    counts = distribution(records, question)
    for key, results in counts.items():
        if not key == '':
            print(f"{key}: {results['number']} ({results['percentage_answered']:.2f}%)")

    
def underscored(base, number, records):
    "Get answer distribution for all subquestions."
    results = dict()
    for i in range(1, number+1):
        question = f"Q{base}_{i}"
        results[question] = distribution(records, question)
    return results


def agreement(counts):
    "Select percentage answered for all answers except the empty string."
    results = dict()
    for answer in ['Strongly disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Strongly agree']:
        try:
            results[answer] = counts[answer]['percentage_answered']
        except:
            results[answer] = 0
    return results


# No longer needed:
# def enumerate_ids(iterable):
#     "Enumerate iterable with zero-padded IDs."
#     for i, element in enumerate(iterable):
#         yield 'comment-' + str(i).zfill(3), element


def write_texts(texts, filename):
    "Write texts from a list to a file."
    with open('./texts/' + filename,'w') as f:
        writer = csv.writer(f)
        writer.writerow(['identifier', 'comment', 'code'])
        writer.writerows([row + ['Original comment'] for row in texts])

In [3]:
df = pd.read_excel("[Distributed] Perceptions of Error Analysis_February 21, 2022_06.54.xlsx")
df = df.fillna('')
records = df.to_dict("records")

  warn("Workbook contains no default style, apply openpyxl's default")


In [4]:
consented = [record for record in records if str(record['Q1 ']).startswith("Yes")]

print(len(consented))

# For subgroup analysis:
academia = [record for record in records if str(record['Q2'])=='Academia']
industry = [record for record in records if str(record['Q2'])=='Industry']

60


In [5]:
# If necessary, here are all questions:
questions = records[0]

In [6]:
questions

{'StartDate': 'Start Date',
 'EndDate': 'End Date',
 'Status': 'Response Type',
 'Progress': 'Progress',
 'Duration (in seconds)': 'Duration (in seconds)',
 'Finished': 'Finished',
 'RecordedDate': 'Recorded Date',
 'ResponseId': 'Response ID',
 'DistributionChannel': 'Distribution Channel',
 'UserLanguage': 'User Language',
 'Q1 ': 'Informed consent\n\n \n\nThis is the consent form for our study about the status of error analysis in NLG. Full details about this study were provided on the previous page. If you want to read this information again, you can go back to the previous page. If anything is still unclear about this study, please contact: C.W.J.vanMiltenburg@tilburguniversity.edu\n\n \n\nConsent\n\nBy consenting, you indicate that you have read the description on the previous page, that you are voluntarily taking part in this study, and that you allow for your data to be processed. This means that:\n\n\n\tYou agree to your responses being anonymously recorded.\n\tYour answers wi

In [7]:
"""
TODO:
- Subgroup analysis: academia vs industry
- Heatmap tables
"""

'\nTODO:\n- Subgroup analysis: academia vs industry\n- Heatmap tables\n'

# Demographics

In [8]:
# Where do people come from?
basic_stats(consented, "Q2")

Academia: 45 (83.33%)
Industry: 8 (14.81%)
Other: 1 (1.85%)


In [9]:
# Time spent working in NLG:
basic_stats(consented, "Q3")

6-10 years: 5 (9.43%)
Less than 2 years: 11 (20.75%)
2-5 years: 22 (41.51%)
11 or more years: 10 (18.87%)
I don't work in NLG: 5 (9.43%)


In [10]:
# Read an error analysis:
basic_stats(consented, "Q4")

Yes: 29 (64.44%)
No: 16 (35.56%)


In [11]:
# Is it surprising that you haven't read an error analysis?
basic_stats(consented, "Q8")

Yes, because:: 3 (42.86%)
No, because:: 4 (57.14%)


In [12]:
# Why is it surprising?:
texts = get_texts(consented, 'Q8_1_TEXT')
write_texts(texts, "surprising_because.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

TypeError: can only concatenate tuple (not "list") to tuple

In [None]:
# Why is it not surprising?:
texts = get_texts(consented, 'Q8_2_TEXT')
write_texts(texts, "not_surprising_because.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Carried out an error analysis:
basic_stats(consented, 'Q9')

In [None]:
# Willing to carry one out again (only people who answered 'yes'):
basic_stats(consented, 'Q28')

In [None]:
# Explanation for previous question
texts = get_texts(consented, 'Q29')
write_texts(texts, "carry_out_again_because.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Considered carrying one out (only people who answered 'no'):
basic_stats(consented, 'Q12')

In [None]:
# Willing to carry one out (only people who answered 'no'):
basic_stats(consented, 'Q14')

In [None]:
# Reasons for not doing it:
texts = get_texts(consented, 'Q13')
write_texts(texts, "reason_for_not_carrying_out.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

# Usefulness of error analyses

In [None]:
# Found useful:
basic_stats(consented, 'Q5')

In [None]:
# What was useful about the analyses?:
texts = get_texts(consented, 'Q6')
write_texts(texts, "uses_of_error_analysis.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# For what kinds of papers are error analyses useful?:
texts = get_texts(consented, 'Q15')
write_texts(texts, "kinds_of_papers.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Reasons for disappointment:
texts = get_texts(consented, 'Q7')
write_texts(texts, "reasons_for_disappointment.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

# Barriers and enabling factors

In [None]:
# Challenges:
texts = get_texts(consented, 'Q10')
write_texts(texts, "challenges.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Enough resources/reference materials at the time?
basic_stats(consented,'Q11')

In [None]:
answers = ['Strongly disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Strongly agree']
records = []
for question, counts in underscored(16,9,consented).items():
    for answer in answers:
        percentage = 0
        if answer in counts:
            percentage = counts[answer]['number'] # NOTE: Changed into number rather than percentage!
        record = dict(question=question, answer=answer, percentage=percentage)
        records.append(record)

df = pd.DataFrame(records)
# Pivot to make a square table:
df = df.pivot(index='question', columns='answer', values='percentage')
# Reorder columns:
df = df[['Strongly disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Strongly agree']]

plt.rcParams["figure.figsize"] = (15,3)
ax = sns.heatmap(df,cmap=sns.light_palette("seagreen", as_cmap=True),linewidth=1,cbar=False,annot=True)
ax.xaxis.tick_top()
plt.xticks(np.arange(5) + 0.5, labels=answers)
plt.yticks(np.arange(9) + 0.5, labels=get_questions(16,9))
plt.tick_params(top=False,left=False)
plt.xlabel('')
plt.ylabel('')
plt.title("I would be more likely to carry out an analysis in a conference/journal paper if…", y=1.2)
plt.tight_layout()
plt.savefig("Q16.pdf")

In [None]:
# Other barriers?
texts = get_texts(consented, 'Q17')
write_texts(texts, "other_barriers.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Enough resources/reference materials currently?
basic_stats(consented,'Q20')

In [None]:
# What is still missing?
texts = get_texts(consented, 'Q20_2_TEXT')
write_texts(texts, "missing.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

In [None]:
# Other factors that make it more likely for you to carry out an error analysis?
texts = get_texts(consented, 'Q21')
write_texts(texts, "enabling.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

# General opinions

In [None]:
answers = ['Strongly disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Strongly agree']
records = []
for question, counts in underscored(18,9,consented).items():
    for answer in answers:
        percentage = 0
        if answer in counts:
            percentage = counts[answer]['number']
        record = dict(question=question, answer=answer, percentage=percentage)
        records.append(record)

df = pd.DataFrame(records)
# Pivot to make a square table:
df = df.pivot(index='question', columns='answer', values='percentage')
# Reorder columns:
df = df[['Strongly disagree', 'Somewhat disagree', 'Neither agree nor disagree', 'Somewhat agree', 'Strongly agree']]

plt.rcParams["figure.figsize"] = (15,4)
ax = sns.heatmap(df,cmap=sns.light_palette("seagreen", as_cmap=True),linewidth=1,cbar=False,annot=True)
ax.xaxis.tick_top()
plt.xticks(np.arange(5) + 0.5, labels=answers)
plt.yticks(np.arange(9) + 0.5, labels=get_questions(18,9))
plt.tick_params(top=False,left=False)
plt.xlabel('')
plt.ylabel('')
plt.title("...", y=1.2)
plt.tight_layout()
plt.savefig("Q18.pdf")

In [None]:
# More/less/equally likely to include error analysis in journal
basic_stats(consented, 'Q19')

In [None]:
# Explanation for previous question:
texts = get_texts(consented, 'Q27')
write_texts(texts, "explanation_journal_preference.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

# Requirements for reports of error analyses

In [None]:
texts = get_texts(consented, 'Q23')
write_texts(texts, "reporting_requirements.csv")

for ident, text in texts:
    print(ident, text)
    print('----')

# General comments

In [None]:
texts = get_texts(consented, 'Q24')
write_texts(texts, "general_comments.csv")

for ident, text in texts:
    print(ident, text)
    print('----')