### Import and Load Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = "./survey_data.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Participant Number,Modality,Field of Study,Experience with LLMs,Date Interviewed,Q1 Starting Choice,Q1 Starting Confidence,Q1 Final Choice,Q1 Final Confidence,Q1 Flipped Choice?,...,Q2 Starting Confidence,Q2 Final Choice,Q2 Final Confidence,Q2 Flipped Choice?,Q3 Starting Choice,Q3 Starting Confidence,Q3 Final Choice,Q3 Final Confidence,Q3 Flipped Choice?,Clean Data?
0,P1,Text,CS,4.0,11/21/2024,Never wait in line,5.0,Never wait in line,5.0,False,...,5.0,Million today,5.0,True,Pause,3.0,Pause,3.0,False,False
1,P2,Text,CS,4.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,3.0,Million today,4.0,True,Rewind,5.0,Rewind,5.0,False,True
2,P3,Audio,CS,5.0,11/21/2024,Never wait in line,4.0,Never do chores,3.0,True,...,2.0,Million today,2.0,True,Pause,3.0,Pause,2.0,False,True
3,P4,Audio,CS,5.0,11/21/2024,Never do chores,4.0,Never do chores,3.0,False,...,4.0,Million today,4.0,True,Pause,4.0,Pause,4.0,False,True
4,P5,Text,CS,5.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,5.0,Million today,5.0,False,Rewind,5.0,Pause,2.0,True,True


### Clean Data

In [2]:
# Filter rows where 'Clean Data?' is TRUE
clean_data = data[data['Clean Data?'] == True].copy()

# Calculate the number of flipped choices per participant
clean_data['Total Flipped Choices'] = (
    clean_data['Q1 Flipped Choice?'].astype(int) +
    clean_data['Q2 Flipped Choice?'].astype(int) +
    clean_data['Q3 Flipped Choice?'].astype(int)
)
clean_data.head()

Unnamed: 0,Participant Number,Modality,Field of Study,Experience with LLMs,Date Interviewed,Q1 Starting Choice,Q1 Starting Confidence,Q1 Final Choice,Q1 Final Confidence,Q1 Flipped Choice?,...,Q2 Final Choice,Q2 Final Confidence,Q2 Flipped Choice?,Q3 Starting Choice,Q3 Starting Confidence,Q3 Final Choice,Q3 Final Confidence,Q3 Flipped Choice?,Clean Data?,Total Flipped Choices
1,P2,Text,CS,4.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,Million today,4.0,True,Rewind,5.0,Rewind,5.0,False,True,1
2,P3,Audio,CS,5.0,11/21/2024,Never wait in line,4.0,Never do chores,3.0,True,...,Million today,2.0,True,Pause,3.0,Pause,2.0,False,True,2
3,P4,Audio,CS,5.0,11/21/2024,Never do chores,4.0,Never do chores,3.0,False,...,Million today,4.0,True,Pause,4.0,Pause,4.0,False,True,1
4,P5,Text,CS,5.0,11/21/2024,Never wait in line,4.0,Never wait in line,4.0,False,...,Million today,5.0,False,Rewind,5.0,Pause,2.0,True,True,1
5,P6,Audio,"Econ, CS",3.0,11/21/2024,Never do chores,3.5,Never do chores,3.0,False,...,"100,000 ten years",4.0,False,Rewind,5.0,Rewind,4.0,False,True,0


### Total Flipped Choices

In [3]:
# Split by Modality
flipped_counts = clean_data.groupby('Modality')['Total Flipped Choices'].sum()

# Average flipped choices per participant by modality
average_flipped = clean_data.groupby('Modality')['Total Flipped Choices'].mean()

# Total number of participants by modality
modality_counts = clean_data['Modality'].value_counts()

# Total participants who flipped at least one choice by modality
participants_with_flips = clean_data[clean_data['Total Flipped Choices'] > 0].groupby('Modality').size()

# Print the analysis
print("Total Participants Per Modality:")
print(modality_counts)
print("\nTotal Participants Who Flipped at Least One Choice by Modality:")
print(participants_with_flips)
print("\nTotal Flipped Choices by Modality:")
print(flipped_counts)
print("\nAverage Flipped Choices by Modality:")
print(average_flipped)

Total Participants Per Modality:
Modality
Text     19
Audio    19
Name: count, dtype: int64

Total Participants Who Flipped at Least One Choice by Modality:
Modality
Audio    11
Text     11
dtype: int64

Total Flipped Choices by Modality:
Modality
Audio    13
Text     13
Name: Total Flipped Choices, dtype: int32

Average Flipped Choices by Modality:
Modality
Audio    0.684211
Text     0.684211
Name: Total Flipped Choices, dtype: float64


### Flips Who Started With Low Confidence

In [4]:
# Calculate total participants who started with low confidence for any question
def started_with_low_confidence(row):
    return (
        row['Q1 Starting Confidence'] <= 3 or
        row['Q2 Starting Confidence'] <= 3 or
        row['Q3 Starting Confidence'] <= 3
    )

clean_data['Started with Low Confidence'] = clean_data.apply(started_with_low_confidence, axis=1)
participants_with_low_confidence = (
    clean_data[clean_data['Started with Low Confidence']]
    .groupby('Modality')
    .size()
)

# Define a function to check if a participant started with low confidence and flipped any choice
def started_low_confidence_and_flipped(row):
    return (
        (row['Q1 Starting Confidence'] <= 3 and row['Q1 Flipped Choice?']) or
        (row['Q2 Starting Confidence'] <= 3 and row['Q2 Flipped Choice?']) or
        (row['Q3 Starting Confidence'] <= 3 and row['Q3 Flipped Choice?'])
    )

# Create a column to flag such participants
clean_data['Started Low Confidence and Flipped'] = clean_data.apply(
    started_low_confidence_and_flipped, axis=1
)

# Count participants who started with low confidence and flipped any choice per modality
participants_low_confidence_and_flipped = (
    clean_data[clean_data['Started Low Confidence and Flipped']]
    .groupby('Modality')
    .size()
)

# Define a function to check if a choice was flipped and starting confidence was <= 3
def flipped_with_low_confidence(row, question_prefix):
    return (
        row[f'{question_prefix} Starting Confidence'] <= 3 and
        row[f'{question_prefix} Flipped Choice?']
    )

# Create a column to count such instances for each participant
for question_prefix in ['Q1', 'Q2', 'Q3']:
    clean_data[f'{question_prefix} Low Confidence Flip'] = clean_data.apply(
        lambda row: flipped_with_low_confidence(row, question_prefix), axis=1
    )

# Total low confidence flips per participant
clean_data['Total Low Confidence Flips'] = (
    clean_data['Q1 Low Confidence Flip'].astype(int) +
    clean_data['Q2 Low Confidence Flip'].astype(int) +
    clean_data['Q3 Low Confidence Flip'].astype(int)
)

# Group by Modality to count the total instances of low confidence flips
low_confidence_flip_counts = clean_data.groupby('Modality')['Total Low Confidence Flips'].sum()

# Calculate the percentage of low confidence participants who flipped any choice
percentage_flipped = (
    participants_low_confidence_and_flipped / participants_with_low_confidence
)

# Display the results
print("\nTotal Participants Who Started with Low Confidence by Modality:")
print(participants_with_low_confidence)
print("\nTotal Low Confidence Participants Who Flipped any Choice:")
print(participants_low_confidence_and_flipped)
print("\nTotal Low Confidence Flipped Choices by Modality:")
print(low_confidence_flip_counts)
print("\nPercentage of Low Confidence Participants Who Flipped Any Choice by Modality:")
print(percentage_flipped)


Total Participants Who Started with Low Confidence by Modality:
Modality
Audio    11
Text      8
dtype: int64

Total Low Confidence Participants Who Flipped any Choice:
Modality
Audio    7
Text     4
dtype: int64

Total Low Confidence Flipped Choices by Modality:
Modality
Audio    7
Text     4
Name: Total Low Confidence Flips, dtype: int32

Percentage of Low Confidence Participants Who Flipped Any Choice by Modality:
Modality
Audio    0.636364
Text     0.500000
dtype: float64


### Flips for CS vs non-CS Students

In [5]:
# Define a function to check if 'CS' is in the 'Field of Study' (allowing multiple fields)
def is_cs(field_of_study):
    # Split the field of study by ", " and check if "CS" is present
    return 'CS' in field_of_study.split(', ')

# Apply this function to filter the CS and non-CS participants
clean_data['Is_CS'] = clean_data['Field of Study'].apply(is_cs)

# Calculate the number of flipped choices for each participant
def calculate_flipped_choices(row, question_prefix):
    return (
        row[f'{question_prefix} Flipped Choice?'].astype(bool).astype(int)
    )

# Apply flipped choices calculation for each question (Q1, Q2, Q3)
clean_data['Total Flipped Choices'] = (
    clean_data['Q1 Flipped Choice?'].astype(bool).astype(int) +
    clean_data['Q2 Flipped Choice?'].astype(bool).astype(int) +
    clean_data['Q3 Flipped Choice?'].astype(bool).astype(int)
)

# Group by both 'Field of Study' and 'Modality' to calculate the total flipped choices
flipped_choices_by_group = clean_data.groupby(['Is_CS', 'Modality'])['Total Flipped Choices'].sum()

# Print the results
print("Total Flipped Choices by Field of Study and Modality:")
print(flipped_choices_by_group)

Total Flipped Choices by Field of Study and Modality:
Is_CS  Modality
False  Audio       7
       Text        7
True   Audio       6
       Text        6
Name: Total Flipped Choices, dtype: int32


### Flips Based on LLM Experience Level

In [6]:
# Create a new column that groups participants into 'Less Experienced' (1-3) and 'More Experienced' (4-5)
clean_data['Experience Group'] = clean_data['Experience with LLMs'].apply(
    lambda x: 'Less Experienced' if x >= 1 and x <= 3 else 'More Experienced'
)

# Calculate the number of flipped choices for each participant (same as before)
clean_data['Total Flipped Choices'] = (
    clean_data['Q1 Flipped Choice?'].astype(bool).astype(int) +
    clean_data['Q2 Flipped Choice?'].astype(bool).astype(int) +
    clean_data['Q3 Flipped Choice?'].astype(bool).astype(int)
)

# Group by the new 'Experience Group' and calculate the sum of flipped choices
flipped_choices_by_experience_group = clean_data.groupby('Experience Group')['Total Flipped Choices'].sum()

# Group by the new 'Experience Group' and calculate the count of participants
participant_count_by_experience_group = clean_data.groupby('Experience Group')['Total Flipped Choices'].count()

# Normalize by dividing the sum of flipped choices by the count of participants
normalized_flipped_choices_group = flipped_choices_by_experience_group / participant_count_by_experience_group

# Combine all the information into a single DataFrame: total flipped choices, number of participants, and normalized flips
flipped_choices_summary_group = pd.DataFrame({
    'Number of Participants': participant_count_by_experience_group,
    'Total Flipped Choices': flipped_choices_by_experience_group,
    'Normalized Flipped Choices': normalized_flipped_choices_group
})

# Print the summary for the two experience groups
print("Summary of Total Flipped Choices, Normalized Flipped Choices, and Number of Participants by Experience Group:")
print(flipped_choices_summary_group)

Summary of Total Flipped Choices, Normalized Flipped Choices, and Number of Participants by Experience Group:
                  Number of Participants  Total Flipped Choices  \
Experience Group                                                  
Less Experienced                      14                      7   
More Experienced                      24                     19   

                  Normalized Flipped Choices  
Experience Group                              
Less Experienced                    0.500000  
More Experienced                    0.791667  


### Flips by Question

In [7]:
# Calculate flips for each question (1 if flipped, 0 if not)
clean_data['Q1 Flipped'] = clean_data['Q1 Flipped Choice?'].astype(bool).astype(int)
clean_data['Q2 Flipped'] = clean_data['Q2 Flipped Choice?'].astype(bool).astype(int)
clean_data['Q3 Flipped'] = clean_data['Q3 Flipped Choice?'].astype(bool).astype(int)

# 1. Calculate total flips per question (summing flipped choices across all participants)
total_flips_per_question = clean_data[['Q1 Flipped', 'Q2 Flipped', 'Q3 Flipped']].sum()

# 2. Calculate total flips by modality per question
flips_by_modality_per_question = clean_data.groupby('Modality')[['Q1 Flipped', 'Q2 Flipped', 'Q3 Flipped']].sum()

# Print the total flips for each question
print("Total Flips Per Question:")
print(total_flips_per_question)

# Print the flips by modality for each question
print("\nFlips by Modality Per Question:")
print(flips_by_modality_per_question)

Total Flips Per Question:
Q1 Flipped     7
Q2 Flipped    12
Q3 Flipped     7
dtype: int64

Flips by Modality Per Question:
          Q1 Flipped  Q2 Flipped  Q3 Flipped
Modality                                    
Audio              4           7           2
Text               3           5           5


### Decreases in Confidence

In [8]:
# Define a function to check if starting and final choices are the same but confidence decreased
def confidence_decrease(row, question_prefix):
    return (
        row[f'{question_prefix} Starting Choice'] == row[f'{question_prefix} Final Choice'] and
        row[f'{question_prefix} Starting Confidence'] > row[f'{question_prefix} Final Confidence']
    )

# Create a column to count such instances for each participant
for question_prefix in ['Q1', 'Q2', 'Q3']:
    clean_data[f'{question_prefix} Confidence Decrease'] = clean_data.apply(
        lambda row: confidence_decrease(row, question_prefix), axis=1
    )

# Total confidence decreases per participant
clean_data['Total Confidence Decreases'] = (
    clean_data['Q1 Confidence Decrease'].astype(int) +
    clean_data['Q2 Confidence Decrease'].astype(int) +
    clean_data['Q3 Confidence Decrease'].astype(int)
)

# Group by Modality to count the total instances where confidence decreased
confidence_decrease_counts = clean_data.groupby('Modality')['Total Confidence Decreases'].sum()

# Count the total number of participants per modality who had any confidence decrease
participants_with_confidence_decrease = clean_data[clean_data['Total Confidence Decreases'] > 0].groupby('Modality').size()

# Display the counts
print("Confidence Decreases by Modality (Total Instances):")
print(confidence_decrease_counts)
print("\nTotal Participants Who Decreased Confidence by Modality:")
print(participants_with_confidence_decrease)

Confidence Decreases by Modality (Total Instances):
Modality
Audio    11
Text      0
Name: Total Confidence Decreases, dtype: int32

Total Participants Who Decreased Confidence by Modality:
Modality
Audio    10
dtype: int64


### Increases in Confidence

In [9]:
# Define a function to check if starting and final choices are the same but confidence increased
def confidence_increase(row, question_prefix):
    return (
        row[f'{question_prefix} Starting Choice'] == row[f'{question_prefix} Final Choice'] and
        row[f'{question_prefix} Starting Confidence'] < row[f'{question_prefix} Final Confidence']
    )

# Create a column to count such instances for each participant
for question_prefix in ['Q1', 'Q2', 'Q3']:
    clean_data[f'{question_prefix} Confidence Increase'] = clean_data.apply(
        lambda row: confidence_increase(row, question_prefix), axis=1
    )

# Total confidence increases per participant
clean_data['Total Confidence Increases'] = (
    clean_data['Q1 Confidence Increase'].astype(int) +
    clean_data['Q2 Confidence Increase'].astype(int) +
    clean_data['Q3 Confidence Increase'].astype(int)
)

# Group by Modality to count the total instances where confidence increased
confidence_increase_counts = clean_data.groupby('Modality')['Total Confidence Increases'].sum()

# Display the counts
print("Confidence Increases by Modality:")
print(confidence_increase_counts)

Confidence Increases by Modality:
Modality
Audio    9
Text     9
Name: Total Confidence Increases, dtype: int32
