In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the survey export into a DataFrame
file_path = "./survey_data.csv"  # Point this at your own copy of the survey CSV
data = pd.read_csv(file_path)
# Preview the first five rows to confirm the load looks sane
data.head(n=5)

Unnamed: 0,Participant Number,Modality,Field of Study,Experience with LLMs,Date Interviewed,Q1 Starting Choice,Q1 Starting Confidence,Q1 Final Choice,Q1 Final Confidence,Q1 Flipped Choice?,...,Q2 Final Confidence,Q2 Flipped Choice?,Q2 Delta,Q3 Starting Choice,Q3 Starting Confidence,Q3 Final Choice,Q3 Final Confidence,Q3 Flipped Choice?,Q3 Delta,Clean Data?
0,P1,Text,CS,4.0,11/21/2024,Never wait in line,5.0,Never wait in line,5.0,False,...,5.0,True,0.0,Pause,3.0,Pause,3.0,False,0.0,False
1,P2,Text,CS,4.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,4.0,True,-1.0,Rewind,5.0,Rewind,5.0,False,0.0,True
2,P3,Audio,CS,5.0,11/21/2024,Never wait in line,4.0,Never do chores,3.0,True,...,2.0,True,0.0,Pause,3.0,Pause,2.0,False,1.0,True
3,P4,Audio,CS,5.0,11/21/2024,Never do chores,4.0,Never do chores,3.0,False,...,4.0,True,0.0,Pause,4.0,Pause,4.0,False,0.0,True
4,P5,Text,CS,5.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,5.0,False,0.0,Rewind,5.0,Pause,2.0,True,3.0,True


In [18]:
# Keep only the participants whose row is marked as clean; copy so the
# columns added below never write back into `data`
is_clean = data['Clean Data?'] == True
clean_data = data.loc[is_clean].copy()

# Per-participant tally of the questions on which the choice was flipped:
# cast each boolean flag column to int and add them up
flip_columns = [f'{question} Flipped Choice?' for question in ('Q1', 'Q2', 'Q3')]
clean_data['Total Flipped Choices'] = sum(
    clean_data[column].astype(int) for column in flip_columns
)
clean_data.head()

Unnamed: 0,Participant Number,Modality,Field of Study,Experience with LLMs,Date Interviewed,Q1 Starting Choice,Q1 Starting Confidence,Q1 Final Choice,Q1 Final Confidence,Q1 Flipped Choice?,...,Q2 Flipped Choice?,Q2 Delta,Q3 Starting Choice,Q3 Starting Confidence,Q3 Final Choice,Q3 Final Confidence,Q3 Flipped Choice?,Q3 Delta,Clean Data?,Total Flipped Choices
1,P2,Text,CS,4.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,True,-1.0,Rewind,5.0,Rewind,5.0,False,0.0,True,1
2,P3,Audio,CS,5.0,11/21/2024,Never wait in line,4.0,Never do chores,3.0,True,...,True,0.0,Pause,3.0,Pause,2.0,False,1.0,True,2
3,P4,Audio,CS,5.0,11/21/2024,Never do chores,4.0,Never do chores,3.0,False,...,True,0.0,Pause,4.0,Pause,4.0,False,0.0,True,1
4,P5,Text,CS,5.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,False,0.0,Rewind,5.0,Pause,2.0,True,3.0,True,1
5,P6,Audio,"Econ, CS",3.0,11/21/2024,Never do chores,3.5,Never do chores,3.0,False,...,False,0.0,Rewind,5.0,Rewind,4.0,False,1.0,True,0


In [19]:
# Group the flip tallies by modality once and reuse the grouping
flips_by_modality = clean_data.groupby('Modality')['Total Flipped Choices']
# Total flipped choices in each modality
flipped_counts = flips_by_modality.sum()
# Mean flipped choices per participant in each modality
average_flipped = flips_by_modality.mean()
# Number of clean participants in each modality
modality_counts = clean_data['Modality'].value_counts()

# Print every summary under its own heading
for heading, summary in (
    ("Total Participants Per Modality:", modality_counts),
    ("\nTotal Flipped Choices by Modality:", flipped_counts),
    ("\nAverage Flipped Choices by Modality:", average_flipped),
):
    print(heading)
    print(summary)

Total Participants Per Modality:
Modality
Audio    19
Text     18
Name: count, dtype: int64

Total Flipped Choices by Modality:
Modality
Audio    13
Text     13
Name: Total Flipped Choices, dtype: int32

Average Flipped Choices by Modality:
Modality
Audio    0.684211
Text     0.722222
Name: Total Flipped Choices, dtype: float64


In [20]:
# Define a function to check if starting and final choices are the same but confidence decreased
def confidence_decrease(row, question_prefix):
    """Flag answers whose choice stayed the same while confidence dropped.

    Args:
        row: Anything indexable by the survey column labels. Either a single
            row (such as the Series that ``DataFrame.apply(axis=1)`` passes
            in) or a whole DataFrame, in which case all rows are evaluated
            in one vectorized pass.
        question_prefix: Question label used to build the column names,
            e.g. ``'Q1'``.

    Returns:
        A boolean for a single row, or a boolean Series sharing the frame's
        index when a DataFrame is passed.
    """
    same_choice = (
        row[f'{question_prefix} Starting Choice'] == row[f'{question_prefix} Final Choice']
    )
    lower_confidence = (
        row[f'{question_prefix} Starting Confidence'] > row[f'{question_prefix} Final Confidence']
    )
    # Element-wise `&` (not `and`) so the same expression works for scalars
    # and for whole columns; `and` would raise on a Series.
    return same_choice & lower_confidence

# Flag the instances per question. The whole frame is passed in one call per
# question instead of a row-wise `apply`, which is slower and returns a
# DataFrame (breaking the column assignment) when `clean_data` has no rows.
decrease_columns = []
for question_prefix in ['Q1', 'Q2', 'Q3']:
    decrease_column = f'{question_prefix} Confidence Decrease'
    clean_data[decrease_column] = confidence_decrease(clean_data, question_prefix)
    decrease_columns.append(decrease_column)

# Total confidence decreases per participant (boolean flags cast to int and added up)
clean_data['Total Confidence Decreases'] = sum(
    clean_data[decrease_column].astype(int) for decrease_column in decrease_columns
)

# Group by Modality to count the total instances where confidence decreased
confidence_decrease_counts = clean_data.groupby('Modality')['Total Confidence Decreases'].sum()

# Display the counts
print("Confidence Decreases by Modality:")
print(confidence_decrease_counts)

Confidence Decreases by Modality:
Modality
Audio    11
Text      0
Name: Total Confidence Decreases, dtype: int32


In [21]:
# Define a function to check if starting and final choices are the same but confidence increased
def confidence_increase(row, question_prefix):
    """Flag answers whose choice stayed the same while confidence rose.

    Args:
        row: Anything indexable by the survey column labels. Either a single
            row (such as the Series that ``DataFrame.apply(axis=1)`` passes
            in) or a whole DataFrame, in which case all rows are evaluated
            in one vectorized pass.
        question_prefix: Question label used to build the column names,
            e.g. ``'Q1'``.

    Returns:
        A boolean for a single row, or a boolean Series sharing the frame's
        index when a DataFrame is passed.
    """
    same_choice = (
        row[f'{question_prefix} Starting Choice'] == row[f'{question_prefix} Final Choice']
    )
    higher_confidence = (
        row[f'{question_prefix} Starting Confidence'] < row[f'{question_prefix} Final Confidence']
    )
    # Element-wise `&` (not `and`) so the same expression works for scalars
    # and for whole columns; `and` would raise on a Series.
    return same_choice & higher_confidence

# Flag the instances per question. The whole frame is passed in one call per
# question instead of a row-wise `apply`, which is slower and returns a
# DataFrame (breaking the column assignment) when `clean_data` has no rows.
increase_columns = []
for question_prefix in ['Q1', 'Q2', 'Q3']:
    increase_column = f'{question_prefix} Confidence Increase'
    clean_data[increase_column] = confidence_increase(clean_data, question_prefix)
    increase_columns.append(increase_column)

# Total confidence increases per participant (boolean flags cast to int and added up)
clean_data['Total Confidence Increases'] = sum(
    clean_data[increase_column].astype(int) for increase_column in increase_columns
)

# Group by Modality to count the total instances where confidence increased
confidence_increase_counts = clean_data.groupby('Modality')['Total Confidence Increases'].sum()

# Display the counts
print("Confidence Increases by Modality:")
print(confidence_increase_counts)

Confidence Increases by Modality:
Modality
Audio    9
Text     8
Name: Total Confidence Increases, dtype: int32
