In [None]:
import os
import re
import textwrap

from collections import Counter, defaultdict

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm, font_manager
import pandas as pd
import seaborn as sns

from wordcloud import WordCloud

from limesurvey_parser import LimeSurveyParser

# Read the survey export named by FILE_TO_PROCESS (default: survey-results.csv).
# The encoding is given explicitly because the answer labels matched later in
# this notebook contain non-ASCII punctuation (’ “ ” –); relying on the locale
# default would decode them differently on different platforms.
with open(
    os.environ.get("FILE_TO_PROCESS", "survey-results.csv"), "r", encoding="utf-8"
) as f:
    content = f.read()

# "%%%" is handed to the parser as its header separator.
parser = LimeSurveyParser(sep_header="%%%")

questions = parser.parse_questions(content)

In [None]:
# Export settings, each overridable through an environment variable.
plot_dir = os.environ.get("PLOT_DIR", "plots")
filetype = os.environ.get("EXPORT_FILETYPE", "svg")

# Typography: one family and one base size drive every text element.
fontname = os.environ.get("FONT_FAMILY", "Calibri")
base_font_size = int(os.environ.get("FONT_SIZE", 13))

plt.rcParams.update({"font.family": fontname, "font.size": base_font_size})

# Tick labels, body text and legends use the base size; axes titles are one
# point larger.
sns.set_context(
    "notebook",
    rc={
        **dict.fromkeys(
            ("xtick.labelsize", "ytick.labelsize", "font.size", "legend.fontsize"),
            base_font_size,
        ),
        "axes.titlesize": base_font_size + 1,
    },
)

# Size applied to figure-level headings (suptitles) further down.
heading_font_size = base_font_size + 5

In [None]:
# Shared word-cloud generator: transparent RGBA canvas, rendered in the same
# font family as the other figures.
wc_generator = WordCloud(
    # Use matplotlib's module-level finder (shared, cached font manager)
    # instead of constructing a fresh FontManager just for this lookup.
    font_path=font_manager.findfont(fontname),
    mode="RGBA",
    background_color=None,
    # Pass the colormap by name: cm.get_cmap() is deprecated and removed in
    # recent matplotlib, and WordCloud resolves colormap names itself.
    colormap="brg",
)

In [None]:
# Named answer scales. histogram_singlechoice looks a scale up by its key and
# passes the list to seaborn's countplot as `order=`, so the list order is the
# left-to-right bar order. The None key means "no fixed order".
# NOTE(review): the labels presumably have to match the survey's answer text
# (after strip()) character for character, including the typographic quotes
# and en dashes below — confirm before editing any of them.
orders = {
    None: None,
    "familiarity": [
        "Very familiar",
        "Somewhat familiar",
        "Not very familiar",
        "Not at all familiar",
    ],
    "binary": ["Yes", "No"],
    "defined": ["None", "Some", "All"],
    "automation": [
        "Each analysis step is run by hand; files must be manually moved or adjusted between steps",
        "Each analysis is run by hand, reading in files from previous steps",
        "Some groups of steps run automatically, with some manual intervention in between",
        # The next two labels are each a single string, split over two lines
        # by implicit literal concatenation.
        "All analysis steps run automatically, but some parameters must be provided "
        "for the full pipeline to complete",
        "All analysis steps run automatically, and can determine optimal parameters "
        "without the need for input beyond the metadata describing the input files",
    ],
    "notpublish": [
        "Concerned that code has bugs",
        "Not sure where to publish code",
        "Code is not reusable in other contexts",
        "Not clear what code should be published",
        "Concerned about code quality not being good",
        "Don’t want others to “scoop” my future work using my own code",
        "Full workflow is not automated, so it would not help reproducibility",
    ],
    "freq": [
        "Never",
        "Once or twice, ever",
        "Once per year or less",
        "Once per month or less",
        "Multiple times per month",
    ],
    "close": [
        "Not at all close",
        "Not very close",
        "Heading in the right direction",
        "Very close",
        "Exactly there",
    ],
    "delay": [
        "Less than 1 day",
        "1 day–1 week",
        "1 week–1 month",
        "More than 1 month",
        "I did not send the data",
    ],
}

# Per-scale default for rotating (and wrapping) the x tick labels; consulted
# by histogram_singlechoice when the caller does not pass `rot`.
rots = dict.fromkeys(
    (
        None,
        "familiarity",
        "binary",
        "defined",
        "automation",
        "notpublish",
        "freq",
        "close",
        "delay",
    ),
    True,
)
# Scales whose labels stay horizontal.
rots.update({None: False, "binary": False, "defined": False})

# Scales that get a sequential colour palette in histogram_singlechoice; any
# key not listed here (including None) looks up as False.
monotonic_scale = defaultdict(
    bool,
    dict.fromkeys(
        ("familiarity", "defined", "automation", "freq", "close", "delay"), True
    ),
)


def strip(responses):
    """Return the string responses with trailing qualifiers removed.

    For each string entry, a parenthesised part (with an optional leading
    space) and anything from " –" (space + en dash) onwards is deleted.
    Entries that are not strings are skipped entirely.
    """
    qualifier = re.compile(r"( ?\(.*\)| –.*)")
    cleaned = []
    for response in responses:
        if not isinstance(response, str):
            continue
        cleaned.append(qualifier.sub("", response))
    return cleaned


def format_ticklabel(text):
    """Return ``text`` as a tick label wrapped to at most 50 characters per line.

    A ``{...other...}`` placeholder in the label is replaced by
    "Something else".
    """
    # Substitute before wrapping: "." in the pattern does not match a newline,
    # so a long placeholder that textwrap had already split over two lines
    # would otherwise be left in the label unreplaced.
    relabelled = re.sub("{.*other.*}", "Something else", text)
    return "\n".join(textwrap.wrap(relabelled, 50))


def histogram_singlechoice(
    questions, question_id, /, rot=None, ax=None, title=None, order=None
):
    """Draw a count plot of the responses to one question.

    Parameters
    ----------
    questions : DataFrame
        Parsed survey. Its transpose is queried on ``question_id`` and its
        columns must expose a ``title`` level.
    question_id : int
        Identifier of the question to plot.
    rot : bool, optional
        Rotate the x tick labels by 45° and wrap them. Defaults to
        ``rots[order]``.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created and a tight
        layout is applied to it.
    title : str, optional
        Axes title. Defaults to the question's own title, cut after its first
        question mark (or the full title when it contains none).
    order : hashable, optional
        Key into ``orders`` selecting the answer labels and their bar order.

    Returns
    -------
    (fig, ax)
        The figure and the axes that were drawn on.
    """
    # Select this question's column(s) once and reuse the result below.
    single_question = questions.T.query(f"question_id=={question_id}").T

    if title is None:
        question_text = single_question.columns.get_level_values("title").values[0]
        # Titles without a "?" used to raise AttributeError here; fall back to
        # the full text instead.
        title_match = re.match(r"[^?]+\?", question_text)
        title = title_match.group(0) if title_match else question_text

    # Only a figure created here gets its layout adjusted at the end.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots()
    else:
        fig = ax.figure

    # Ordered scales get a sequential palette, one colour per answer.
    if monotonic_scale[order]:
        palette = sns.color_palette("rocket", n_colors=len(orders[order]))
    else:
        palette = None

    sns.countplot(
        x=strip(single_question.values[:, 0]),
        ax=ax,
        order=orders[order],
        palette=palette,
    )
    ax.set_title(title)

    if rot is None:
        rot = rots[order]

    if rot:
        wrapped_ticklabels = []
        for ticklabel in ax.get_xticklabels():
            ticklabel.set_rotation(45)
            ticklabel.set_ha("right")
            wrapped_ticklabels.append(format_ticklabel(ticklabel.get_text()))
        ax.set_xticklabels(wrapped_ticklabels)

    if owns_figure:
        fig.tight_layout(pad=0.03)

    return fig, ax


def strip_multichoice(questions, question_id):
    """Recode a multiple-choice question so each "Yes" carries its answer label.

    Selects the rows of ``questions.T`` matching ``question_id``, groups them
    by the ``answer`` level, and within each group replaces "Yes" with that
    group's answer label and "No" with ``None``. The result is then unstacked
    on (``id``, ``answer_id``, ``answer``), transposed, stripped of those three
    levels, and rows containing missing values are dropped.

    NOTE(review): what ``replace("No", None)`` does with a ``None`` value has
    differed between pandas releases — confirm against the pinned version.
    NOTE(review): the result is passed to histogram_singlechoice, so it
    presumably still exposes a ``question_id`` level — confirm.
    """
    return (
        questions.T.query(f"question_id=={question_id}")
        .groupby("answer")
        .apply(
            # Within one answer group, "Yes" becomes the group's answer label.
            lambda df: df.replace(
                "Yes", df.index.get_level_values("answer").values.squeeze()
            ).replace("No", None)
        )
        .unstack(["id", "answer_id", "answer"])
        .T.droplevel(["id", "answer_id", "answer"])
        .dropna()
    )


def histogram_multichoice(questions, question_id, **kwargs):
    """Draw a count plot for a multiple-choice question.

    The question is first recoded with ``strip_multichoice``; the result is
    plotted by ``histogram_singlechoice``, to which ``kwargs`` are forwarded
    unchanged.

    Returns
    -------
    (fig, ax)
        Whatever ``histogram_singlechoice`` returns, so both helpers can be
        used the same way. (Previously this function returned ``None``.)
    """
    return histogram_singlechoice(
        strip_multichoice(questions, question_id), question_id, **kwargs
    )


def figlegend(fig, ax, top, bottom, wrap_threshold=100, loc="lower center"):
    """Move the x tick labels of ``ax`` into a figure-level legend.

    The bars (patches) of ``ax`` become the legend handles and its x tick
    labels, re-wrapped at ``wrap_threshold`` characters, the legend texts.
    The x tick labels of every axes in ``fig`` are then blanked, and the
    subplot area is set to span ``bottom``..``top``.
    """
    handles = list(ax.patches)

    legend_texts = []
    for tick in ax.get_xticklabels():
        # Undo any earlier wrapping before re-wrapping at the legend width.
        single_line = tick.get_text().replace("\n", " ")
        legend_texts.append("\n".join(textwrap.wrap(single_line, wrap_threshold)))

    fig.legend(handles, legend_texts, loc=loc, frameon=False)

    for panel in fig.axes:
        panel.set_xticklabels([])

    fig.subplots_adjust(top=top, bottom=bottom)

In [None]:
def savefig(fig, filename_base):
    """Save ``fig`` as ``<plot_dir>/<filename_base>.<filetype>``.

    ``plot_dir`` and ``filetype`` are the notebook-level export settings. The
    output directory is created if it does not exist yet, so a fresh checkout
    does not fail on the first export.
    """
    if plot_dir:
        os.makedirs(plot_dir, exist_ok=True)
    fig.savefig(f"{plot_dir}/{filename_base}.{filetype}")


def save_cloud(wordcloud, filename_base):
    """Export a word cloud as ``<plot_dir>/<filename_base>.<filetype>``.

    SVG output is produced with ``wordcloud.to_svg(embed_font=True)`` and
    written as text; every other file type goes through
    ``wordcloud.to_file``. The output directory is created if missing.
    """
    if plot_dir:
        os.makedirs(plot_dir, exist_ok=True)

    if filetype == "svg":
        # Write UTF-8 explicitly: the cloud's words may be non-ASCII and the
        # locale default encoding is not UTF-8 on every platform.
        with open(f"{plot_dir}/{filename_base}.svg", "w", encoding="utf-8") as f:
            f.write(wordcloud.to_svg(embed_font=True))
    else:
        wordcloud.to_file(f"{plot_dir}/{filename_base}.{filetype}")

In [None]:
# Question 97 on the familiarity scale; exported as "open_science_familiarity".
fig, ax = histogram_singlechoice(questions, 97, order="familiarity")

savefig(fig, "open_science_familiarity")
plt.show()

In [None]:
# Question 98 on the familiarity scale; exported as "fair_data_familiarity".
fig, ax = histogram_singlechoice(questions, 98, order="familiarity")

savefig(fig, "fair_data_familiarity")
plt.show()

In [None]:
# Question 99 as a Yes/No count; exported as "open_science_participation".
fig, ax = histogram_singlechoice(questions, 99, order="binary")

savefig(fig, "open_science_participation")
plt.show()

In [None]:
# Tally the distinct values in the first column of question "G19Q100"; the
# Counter is the cell's displayed output (no plot is produced).
Counter(questions["G19Q100"].values[:, 0])

In [None]:
# Two Yes/No panels under one shared heading.
fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(8, 4), constrained_layout=True)

fig.suptitle("Does your organisation have an institutional repository…").set_fontsize(
    heading_font_size
)

# One panel per question: (question id, panel title).
for axis, (qid, panel_title) in zip(
    ax, [(101, "For publications?"), (102, "For data?")]
):
    histogram_singlechoice(
        questions, qid, order="binary", ax=axis, title=panel_title
    )

# The y axis is shared, so the right-hand panel drops its label.
ax[1].set_ylabel("")

savefig(fig, "institutional_repositories")
plt.show()

In [None]:
# Question 103 on the familiarity scale; exported as "eosc_familiarity".
fig, ax = histogram_singlechoice(questions, 103, order="familiarity")

savefig(fig, "eosc_familiarity")
plt.show()

In [None]:
# Three familiarity panels under one shared heading.
fig, ax = plt.subplots(ncols=3, sharey=True, figsize=(10, 4))

fig.suptitle("How familiar are you with…").set_fontsize(heading_font_size)

# One panel per question: (question id, panel title).
for axis, (qid, panel_title) in zip(
    ax,
    [
        (105, "The concept of metadata?"),
        (106, "Metadata schemas for lattice?"),
        (107, "The concept of ontologies?"),
    ],
):
    histogram_singlechoice(
        questions, qid, ax=axis, title=panel_title, order="familiarity"
    )

# The y axis is shared, so only the left-most panel keeps its label.
for axis in ax[1:]:
    axis.set_ylabel("")

fig.tight_layout(pad=0.3, w_pad=-1)

savefig(fig, "metadata_familiarity")
plt.show()

In [None]:
# Question 108 as a Yes/No count; exported as "data_management_plan".
fig, ax = histogram_singlechoice(questions, 108, order="binary")

savefig(fig, "data_management_plan")
plt.show()

In [None]:
# Question 109 on the familiarity scale; exported as
# "persistent_identifier_familiarity".
fig, ax = histogram_singlechoice(questions, 109, order="familiarity")

savefig(fig, "persistent_identifier_familiarity")
plt.show()

In [None]:
# Three None/Some/All panels under one shared heading.
fig, ax = plt.subplots(ncols=3, sharey=True, figsize=(10, 4))

fig.suptitle("Are your data formats defined or documented?").set_fontsize(
    heading_font_size
)

# One panel per question: (question id, panel title).
for axis, (qid, panel_title) in zip(
    ax,
    [
        (40, "For input to analysis"),
        (46, "For output of analysis"),
        (23, "For plotting/tabulating"),
    ],
):
    histogram_singlechoice(questions, qid, ax=axis, title=panel_title, order="defined")

# The y axis is shared, so only the left-most panel keeps its label.
for axis in ax[1:]:
    axis.set_ylabel("")

fig.tight_layout(pad=0.3)

savefig(fig, "data_format_documentation")
plt.show()

In [None]:
# Words to leave out of the word clouds: every line of 10000.txt, plus a few
# tokens specific to this survey. "python" is taken back out so that it still
# shows up in the clouds.
with open("10000.txt") as word_file:
    common_words = word_file.read().split("\n")

common_words += ["e.g.", "etc.", "https", "lattice", "qcd", "github.com"]
common_words.remove("python")


def condense_filter_words(question_id):
    """Return the words from a question's free-text responses, minus common words.

    Each string response in the question's first column is split on runs of
    newline, comma, space, "/", "(", ")" and ":". A word is kept, in its
    original casing, when its lower-cased form is not in ``common_words``.
    Non-string responses are skipped.
    """
    # Build the lookup once per call: testing every word against the list
    # itself is a linear scan over the whole word list each time.
    excluded = set(common_words)

    responses = questions.T.query(f"question_id=={question_id}").T.values[:, 0]

    filtered_words = []
    for response in responses:
        if not isinstance(response, str):
            continue
        filtered_words.extend(
            word
            for word in re.split("[\n, /():]+", response)
            if word.lower() not in excluded
        )

    return filtered_words

In [None]:
# Word cloud of the filtered free-text words from question 34.
lattice_tools = wc_generator.generate(" ".join(condense_filter_words(34)))

save_cloud(lattice_tools, "lattice_tool_cloud")

# Inline preview of the cloud image, with the axes hidden.
plt.imshow(lattice_tools, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the filtered free-text words from question 35.
nonlattice_tools = wc_generator.generate(" ".join(condense_filter_words(35)))

save_cloud(nonlattice_tools, "nonlattice_tool_cloud")

# Inline preview of the cloud image, with the axes hidden.
plt.imshow(nonlattice_tools, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Question 53 on the automation scale, drawn on a pre-made axes so the layout
# can be tuned here rather than inside histogram_singlechoice.
fig, ax = plt.subplots(figsize=(7, 5))
histogram_singlechoice(questions, 53, order="automation", ax=ax)

fig.tight_layout(pad=0.03)
fig.subplots_adjust(left=0.08)
# Move the tick labels into a figure legend (wrapped at 80 characters) placed
# at the lower left; the plot area then spans 0.4..0.92 of the figure height.
figlegend(fig, ax, 0.92, 0.4, wrap_threshold=80, loc="lower left")
savefig(fig, "how_automated")
plt.show()

In [None]:
# Question 94 on the frequency scale, with an explicit title replacing the
# one derived from the question text.
fig, ax = histogram_singlechoice(
    questions,
    94,
    order="freq",
    title="How often would your research benefit from access to others’ data?",
)

savefig(fig, "benefit_from_others_data_frequency")
plt.show()

In [None]:
# Question 83 on the closeness scale, with an explicit title replacing the
# one derived from the question text.
fig, ax = histogram_singlechoice(
    questions,
    83,
    order="close",
    title="How close is your data analysis/presentation workflow to where you want it to be?",
)

savefig(fig, "distance_from_ideal")
plt.show()

In [None]:
# Two stacked rows of Data/Code panels, each with its own heading, built from
# five vertically stacked subfigures:
#   0: first heading   1: Yes/No panels   2: not used below
#   3: second heading  4: response-delay panels
fig = plt.figure(constrained_layout=False, figsize=(8, 6))
subfigs = fig.subfigures(nrows=5, ncols=1, height_ratios=(1, 6, 0.5, 0.5, 12))

# ax[0] is the Yes/No row, ax[1] the delay row; each row shares its y axis.
ax = [
    subfigs[1].subplots(nrows=1, ncols=2, sharey=True),
    subfigs[4].subplots(nrows=1, ncols=2, sharey=True),
]

subfigs[0].suptitle(
    'You have made data/code available "on request". Have you had requests?'
).set_fontsize(heading_font_size)
subfigs[3].suptitle(
    "How long did it take you to respond to these requests?"
).set_fontsize(heading_font_size)

histogram_singlechoice(questions, 68, ax=ax[0][0], title="Data", order="binary")
histogram_singlechoice(questions, 77, ax=ax[0][1], title="Code", order="binary")

# Questions 69 and 78 are multiple-choice, hence the multichoice helper.
histogram_multichoice(questions, 69, ax=ax[1][0], title="Data", order="delay")
histogram_multichoice(questions, 78, ax=ax[1][1], title="Code", order="delay")

# The y axis is shared within a row, so the right-hand panels drop their label.
for row in ax:
    row[1].set_ylabel("")

# Raise the bottom of the delay row's plot area to 0.45 of its subfigure
# height (the "delay" scale is drawn with rotated tick labels).
subfigs[4].subplots_adjust(bottom=0.45)
savefig(fig, "on_request")
plt.show()

In [None]:
# Two Yes/No panels under one shared heading.
fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(8, 4))

fig.suptitle(
    "Have you made available the code used for data analysis/presentation…"
).set_fontsize(heading_font_size)

# One panel per question: (question id, panel title).
for axis, (qid, panel_title) in zip(
    ax, [(71, "For most recent publication?"), (72, "For other work?")]
):
    histogram_singlechoice(questions, qid, ax=axis, title=panel_title, order="binary")

# The y axis is shared, so the right-hand panel drops its label.
ax[1].set_ylabel("")

fig.tight_layout(pad=0.3)

savefig(fig, "made_code_available")
plt.show()

In [None]:
# TODO: Can we incorporate "other" here?

# Multiple-choice questions 73 and 74, side by side on the "notpublish" scale.
fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(8, 5))

fig.suptitle("Why have you not made code available…").set_fontsize(heading_font_size)

histogram_multichoice(
    questions, 73, ax=ax[0], title="For most recent publication?", order="notpublish"
)
histogram_multichoice(questions, 74, ax=ax[1], title="In general?", order="notpublish")

# The y axis is shared, so the right-hand panel drops its label.
ax[1].set_ylabel("")
# Build one figure legend from the left panel's bars and tick labels; figlegend
# blanks the tick labels of both panels.
figlegend(fig, ax[0], 0.85, 0.4)

savefig(fig, "why_not_made_code_available")
plt.show()

In [None]:
# Question 82 with no fixed answer order (order is left at its default).
# The title is rendered on the exported figure; its spelling is corrected
# from "text-baased" to "text-based".
fig, ax = histogram_singlechoice(
    questions,
    82,
    title="All else being equal, do you prefer text-based or graphical tools?",
)

savefig(fig, "text_or_graphical")
plt.show()

In [None]:
# Question 85 as a Yes/No count; the explicit title is broken over two lines
# with an embedded newline.
fig, ax = histogram_singlechoice(
    questions,
    85,
    title="If you had an automated workflow, would you consider\npublishing it concurrently with the corresponding paper?",
    order="binary",
)

savefig(fig, "consider_publishing")
plt.show()

In [None]:
# TODO: Find a way to plot the non-binary ratings?
# TODO: Find a way to include the free text entered for "other"

# Question 90 with no fixed answer order. rot=True makes histogram_singlechoice
# pass the tick labels through format_ticklabel before figlegend moves them
# into a figure legend.
fig, ax = plt.subplots(figsize=(9.6, 6))
histogram_singlechoice(
    questions,
    90,
    title="Which is the most important to enable reproducibility of a publication?",
    rot=True,
    ax=ax,
)

# The plot area spans 0.36..0.92 of the figure height; the legend uses the
# default "lower center" location.
figlegend(fig, ax, 0.92, 0.36)

savefig(fig, "most_important_for_reproducibility")
plt.show()

In [None]:
# Question 111 as a Yes/No count, with an explicit title.
fig, ax = histogram_singlechoice(
    questions,
    111,
    title="Would Open Science concepts be useful to your research?",
    order="binary",
)

savefig(fig, "open_science_useful")
plt.show()

In [None]:
# Word cloud of the filtered free-text words from question 67.
data_publishing_services = wc_generator.generate(" ".join(condense_filter_words(67)))

save_cloud(data_publishing_services, "data_publishing_services_cloud")

# Inline preview of the cloud image, with the axes hidden.
plt.imshow(data_publishing_services, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the filtered free-text words from question 76.
code_publishing_services = wc_generator.generate(" ".join(condense_filter_words(76)))

save_cloud(code_publishing_services, "code_publishing_services_cloud")

# Inline preview of the cloud image, with the axes hidden.
plt.imshow(code_publishing_services, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the filtered free-text words from question 110.
incentives = wc_generator.generate(" ".join(condense_filter_words(110)))

save_cloud(incentives, "open_science_incentives_cloud")

# Inline preview of the cloud image, with the axes hidden.
plt.imshow(incentives, interpolation="bilinear")
plt.axis("off")
plt.show()