# Data Import

In [1]:
import sys

# Check and import necessary modules
if 'pandas' not in sys.modules: import pandas as pd
if 'numpy' not in sys.modules: import numpy as np
if 'sklearn.metrics' not in sys.modules: from sklearn.metrics import cohen_kappa_score
if 'scipy.stats' not in sys.modules: from scipy.stats import spearmanr
if 'pingouin' not in sys.modules: import pingouin as pg
if 'matplotlib.pyplot' not in sys.modules: import matplotlib.pyplot as plt


In [2]:
# read in raw qualtrics data
data_raw = pd.read_csv('Inputs/Qualtrics_Export.csv').drop([0, 1]).reset_index(drop=True)

drop_columns = ['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress', 'Duration (in seconds)', 'Finished', 'RecipientLastName', 'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude', 'DistributionChannel', 'UserLanguage', 'Q_AmbiguousTextPresent', 'Q_AmbiguousTextQuestions', 'Q_StraightliningCount', 'Q_StraightliningPercentage', 'Q_StraightliningQuestions', 'Q_UnansweredPercentage', 'Q_UnansweredQuestions']
data_raw = data_raw.drop(columns=drop_columns)
data_raw

Unnamed: 0,RecordedDate,ResponseId,Q_RecaptchaScore,Q2.1,Q3.2_1,Q3.2_2,Q3.2_3,Q3.2_4,Q3.2_5,Q3.2_6,...,Q23.2_3,Q23.2_4,Q23.2_5,Q23.2_6,Q23.3,Q23.4,Q23.5_First Click,Q23.5_Last Click,Q23.5_Page Submit,Q23.5_Click Count
0,2024-05-20 08:49:35,R_8iIOf9RamJjjsQ2,0.8999999761581421,1,4.0,4.0,5.0,2.0,1.0,5.0,...,,,,,,,,,,
1,2024-05-20 15:18:01,R_2gJU8yVqE6b0gSZ,1.0,1,4.0,4.0,4.0,1.0,4.0,4.0,...,,,,,,,,,,
2,2024-05-21 15:15:24,R_8VlGmI5zgDHNcVv,1.0,1,4.0,5.0,4.0,4.0,1.0,1.0,...,,,,,,,,,,
3,2024-05-21 18:27:48,R_2SshTS8mO4ohduh,0.8999999761581421,1,,,,,,,...,,,,,,,,,,
4,2024-05-21 22:19:28,R_26fQGM8FSWoTkjf,1.0,2,4.0,5.0,4.0,5.0,3.0,2.0,...,,,,,,,,,,
5,2024-05-22 08:22:31,R_2Kr3pC2Sebds7bf,0.8999999761581421,1,,,,,,,...,,,,,,,,,,
6,2024-05-22 10:16:26,R_82t2e2GNjak8tgS,0.8999999761581421,1,,,,,,,...,,,,,,,,,,
7,2024-05-22 15:40:13,R_8e4ByNZ27bZIG4P,0.8999999761581421,1,5.0,5.0,5.0,1.0,2.0,1.0,...,1.0,1.0,1.0,3.0,1.0,No fluff,0.751,76.081,77.06,24.0
8,2024-05-23 13:36:24,R_82QBjMMg8zLNxx0,0.8999999761581421,2,,,,,,,...,4.0,5.0,4.0,1.0,4.0,,44.141,71.31,72.903,7.0
9,2024-05-23 13:47:48,R_87lJaEG9vxtRCZ5,1.0,2,,,,,,,...,,,,,,,,,,


# Fluff Score Evaluation

In [3]:
# Extract all Fluff Scores, use text numbers as column names and differentiate between student and expert scores
fluff_scores_human = data_raw.filter(regex='\.3$|Q2.1') # extract all third Subquestions which are the human fluff scores
question_nr = [int(x[1:-2]) for x in list(fluff_scores_human.columns[1:])] # extract the question block numbers
text_nr = ['T' + str(x-2) for x in question_nr] # create the corresponding text numbers
fluff_scores_human.columns = ['Expert'] + text_nr # rename columns
fluff_scores_human = fluff_scores_human.apply(pd.to_numeric, errors='coerce') # convert to numeric
fluff_scores_human['Expert'] = fluff_scores_human['Expert'].replace({1: False, 2: True}) # convert to boolean
fluff_scores_human

  fluff_scores_human['Expert'] = fluff_scores_human['Expert'].replace({1: False, 2: True}) # convert to boolean


Unnamed: 0,Expert,T1,T2,T3,T4,T5,T6,T7,T8,T9,...,T12,T13,T14,T15,T16,T17,T18,T19,T20,T21
0,False,4.0,4.0,3.0,4.0,2.0,,,,,...,,,,,,,,,,
1,False,4.0,3.0,3.0,4.0,4.0,3.0,2.0,1.0,1.0,...,,,,,,,,,,
2,False,4.0,2.0,3.0,3.0,4.0,1.0,5.0,1.0,1.0,...,,,,,,,,,,
3,False,,,,,,,,,,...,4.0,2.0,2.0,2.0,,,,,,
4,True,4.0,,,,4.0,,,,1.0,...,,,,,5.0,1.0,,,,
5,False,,,,,,,,,2.0,...,,,,,5.0,3.0,2.0,5.0,3.0,
6,False,,,,,3.0,2.0,,,2.0,...,1.0,,,,,,,4.0,3.0,
7,False,4.0,,,,,,,,,...,,1.0,,2.0,4.0,,,,,1.0
8,True,,4.0,,,,,,,,...,1.0,,,2.0,,,2.0,5.0,,4.0
9,True,,,,3.0,,1.0,2.0,,,...,,,1.0,,,,,,3.0,


In [4]:
# Fluff Scores Summary Statistics
fluff_scores_summary_students = fluff_scores_human.where(fluff_scores_human['Expert'] == False).describe().transpose()[['mean', 'std', 'count']].round(2)
fluff_scores_summary_experts = fluff_scores_human.where(fluff_scores_human['Expert'] == True).describe().transpose()[['mean', 'std', 'count']].round(2)
fluff_scores_summary_humans = fluff_scores_human.drop(columns=['Expert']).describe().transpose()[['mean', 'std', 'count']].round(2)

# kappa_s_e = cohen_kappa_score(fluff_scores_summary_students['mean'], fluff_scores_summary_experts['mean'])
# print(f"Cohen's Kappa: {kappa_s_e}")