In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Read experiment data

In [3]:
# Qualtrics data, extra rows contain headers
df_headers = pd.read_csv('/'.join(['data','AI+Learning+Research.csv']), header=[0,1,2])
# Choice fields stored as text strings
df_original = pd.read_csv('/'.join(['data','AI+Learning+Research.csv']), header=[0], skiprows=[1, 2], parse_dates=['StartDate', 'EndDate', 'RecordedDate'])
df_values = pd.read_csv('/'.join(['data','AI+Learning+Research_values.csv']), header=[0], skiprows=[1, 2])
# Add suffix to value columns
suffix = '_val'
df_values = df_values.add_suffix(suffix)
df_values['ResponseId'] = df_values['ResponseId'+suffix]
# Specify the column names for merging
merge_columns = list(map(lambda col: col + suffix, [
    'Q2_1', 'Q2_2', 'Q2_3', 'Q2_4', 
    'Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 
    'Q6', 'Q7', 
    'Q18_1', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6', 
    'Q19'
]))
# Merge the values
df_merged = df_original.merge(df_values[merge_columns + ['ResponseId']], on='ResponseId', how='left')

df = df_merged.copy()

# Sort columns alphabetically
df = df.reindex(sorted(df.columns), axis=1)

# Define experiment start and end dates
experiment_start = pd.to_datetime('February 19, 2024')
experiment_end = pd.to_datetime('February 24, 2024') # One day after the experiment
df = df[(df['StartDate'] >= experiment_start) & (df['EndDate'] <= experiment_end)]
df_datebound = df.copy()

In [4]:
# Manually extracted data
df_manual = pd.read_csv('/'.join(['output','filtered_df.csv']))

df_errors = pd.read_csv('/'.join(['data','BRC FC W2 Lab Log Spring 24.csv']))

In [5]:
# Filter out errors from df
df = df[~df['Q1'].isin(df_errors['Participant ID'])]

# Check for non-participants
df = df[df['Q1'].str.isdigit()]

# Check for users that did not finish the test properly
df = df[df['Progress'].astype(int)==100]

# Drop duplicates in the Participant ID, only keeping the latest recorded submission
df = df.sort_values(by='RecordedDate', ascending=False).drop_duplicates(subset=['Q1'])

# Set experiment condition to 1 if the special condition applies, or 0 if the control is in place
def set_condition_experiment(row):
    if pd.notna(row['Q10']) or pd.notna(row['Q12']):
        return 1
    elif pd.notna(row['Q14']) or pd.notna(row['Q17']):
        return 0
    else:
        return pd.NA
df['condition_experiment'] = df.apply(set_condition_experiment, axis=1)
df = df[df['condition_experiment'] != pd.NA]

# Filtered data
df_filtered = df.copy()

# Filtered totals
print(f"filtered/timeframe/total: {len(df_filtered)}/{len(df_datebound)}/{len(df_original)}")

df.to_csv('/'.join(['output','clean.csv']), index=False)
df.head().transpose()

filtered/timeframe/total: 311/354/378


Unnamed: 0,334,333,332,331,330
DistributionChannel,anonymous,anonymous,anonymous,anonymous,anonymous
Duration (in seconds),808,767,918,894,1028
EndDate,2024-02-23 17:27:22,2024-02-23 17:27:14,2024-02-23 16:32:15,2024-02-23 16:32:14,2024-02-23 16:31:34
ExternalReference,,,,,
Finished,True,True,True,True,True
...,...,...,...,...,...
ResponseId,R_1eOZk23XMHZMM4f,R_5V1SmFk9wLGCcaV,R_1dXY5K6kRdWhztv,R_3v2GsCa5UlKXPES,R_5S7eRhBVhkocrkM
StartDate,2024-02-23 17:13:54,2024-02-23 17:14:26,2024-02-23 16:16:56,2024-02-23 16:17:20,2024-02-23 16:14:25
Status,IP Address,IP Address,IP Address,IP Address,IP Address
UserLanguage,EN,EN,EN,EN,EN


In [6]:
new_df1 = df.dropna(subset=['Q10'])
new_df2 = df.dropna(subset=['Q12'])
new_df3 = df.dropna(subset=['Q14'])
new_df4 = df.dropna(subset=['Q17'])

# Jaccard distance calculations

In [7]:
import sklearn.metrics as metrics
import nltk # First time only: nltk.download('all')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# create preprocess_text function
def tokenize_text(text):
    if not isinstance(text, str):
        return []

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    return lemmatized_tokens
 
def jaccard_similarity(set1, set2):
    # intersection of two sets
    intersection = len(set1.intersection(set2))
    # Unions of two sets
    union = len(set1.union(set2))
    return intersection / union

# Tokenizing each question list
tokens_chatgpt = tokenize_text(
"""
Pros of Social Media:
Connectivity: Enables instant communication and connection with friends, family, and a global network.
Information Sharing: Facilitates the rapid dissemination of news, trends, and information.

Cons of Social Media:
Privacy Concerns: Raises issues regarding the protection of personal information and privacy.
Misinformation: Provides a platform for the spread of false information and rumors.
"""
)
tokens_treatment1 = df['Q10'].apply(tokenize_text)
tokens_treatment2 = df['Q12'].apply(tokenize_text)
tokens_control1 = df['Q14'].apply(tokenize_text)
tokens_control2 = df['Q17'].apply(tokenize_text)

# Calculate the Jaccard index between product first and final writing in control and treatment condition

In [8]:
# Calculating jaccard distance between first and final writing for control and treatment
for i in df.index:
    tokens1 = tokens_treatment1[i]
    tokens2 = tokens_treatment2[i]
    if tokens1 and tokens2:
        df.loc[i, 'jaccard_similarity'] = jaccard_similarity(set(tokens1), set(tokens2))
    tokens1 = tokens_control1[i]
    tokens2 = tokens_control2[i]
    if tokens1 and tokens2:
        df.loc[i, 'jaccard_similarity'] = jaccard_similarity(set(tokens1), set(tokens2))

# There is not significant difference between Jaccard index of (first and final writing) in control and treatment condition

In [9]:
from scipy import stats

result_df_0 = df[df['condition_experiment'] == 0]['jaccard_similarity']
result_df_1 = df[df['condition_experiment'] == 1]['jaccard_similarity']

column_0 = result_df_0.sort_values()
column_1 = result_df_1.sort_values()

# Perform the t-test
t_statistic, p_value = stats.ttest_ind(column_0, column_1, nan_policy='omit')

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)


T-Statistic: -1.3195290320357356
P-Value: 0.1879723153689186


In [10]:
df[df['condition_experiment'] == 1][['Q10','Q12','jaccard_similarity']].sort_values(by='jaccard_similarity')

Unnamed: 0,Q10,Q12,jaccard_similarity
86,Pros: \n1. easy to reach out friends and famil...,i think the chatgpt's answer is better than mi...,0.000000
174,Pros of social media\n1. I get a chance to rec...,Pros of Social Media:\nConnectivity: Enables i...,0.067797
46,Pros of Social Media:\n1. searchability: findi...,Pros of Social Media:\nConnectivity: Enables i...,0.097561
78,increased anxiety is a con\nwaste of time is a...,I would stick to mine above. Chat gpt gives br...,0.139535
284,Pros:\n1. Connectivity: Allows friendships and...,Pros of Social Media:\nConnectivity: Enables i...,0.140845
...,...,...,...
195,Pros of Social Media:\n1) Connectivity: Enable...,Pros of Social Media:\n1) Connectivity: Enable...,1.000000
212,Pros:\n\nGlobal sharing: Ability to connect wi...,Pros:\n\nGlobal sharing: Ability to connect wi...,1.000000
52,Pros of social media:\n 1. Connection: I ca...,Pros of social media:\n1. Connection: I can co...,1.000000
179,Pros of Social Media:\n1. Interaction: being a...,Pros of Social Media:\n1. Interaction: being a...,1.000000


In [11]:
sorted_result_df_0 = result_df_0.sort_values().reset_index(drop=True)
sorted_result_df_1 = result_df_1.sort_values().reset_index(drop=True)

# Print the sorted DataFrames
print(pd.concat([sorted_result_df_0, sorted_result_df_1], axis=1).tail(25))

     jaccard_similarity  jaccard_similarity
133            0.688525            0.674419
134            0.690909            0.690141
135            0.692308            0.703704
136            0.709091            0.727273
137            0.718750            0.772727
138            0.732394            0.844444
139            0.732394            0.918033
140            0.754717            0.931818
141            0.818182            0.941176
142            0.835616            0.951220
143            0.859375            0.958333
144            0.901961            0.958333
145            0.914894            0.986301
146            0.930233            1.000000
147            0.953488            1.000000
148            0.979167            1.000000
149            1.000000            1.000000
150            1.000000            1.000000
151            1.000000            1.000000
152            1.000000            1.000000
153                 NaN            1.000000
154                 NaN         

# Testing final writing against ChatGPT response

In [12]:
# Calculating jaccard distance between ChatGPT and final writing for control and treatment
for i in df.index:
    tokens = tokens_treatment2[i]
    if tokens:
        df.loc[i, 'jaccard_gpt'] = jaccard_similarity(set(tokens_chatgpt), set(tokens))
    tokens = tokens_control2[i]
    if tokens:
        df.loc[i, 'jaccard_gpt'] = jaccard_similarity(set(tokens_chatgpt), set(tokens))

In [13]:
result_df_0 = df[df['condition_experiment'] == 0]['jaccard_gpt']
result_df_1 = df[df['condition_experiment'] == 1]['jaccard_gpt']

column_0 = result_df_0.sort_values()
column_1 = result_df_1.sort_values()

# Perform the t-test
t_statistic, p_value = stats.ttest_ind(column_0, column_1, nan_policy='omit')

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)


T-Statistic: 1.5592236481706745
P-Value: 0.11997035129475946


In [14]:
sorted_result_df_0 = result_df_0.sort_values().reset_index(drop=True)
sorted_result_df_1 = result_df_1.sort_values().reset_index(drop=True)

# Print the sorted DataFrames
print(pd.concat([sorted_result_df_0, sorted_result_df_1], axis=1).tail(25))

     jaccard_gpt  jaccard_gpt
133     0.521739     0.452830
134     0.521739     0.454545
135     0.522727     0.500000
136     0.549020     0.519231
137     0.549020     0.529412
138     0.555556     0.571429
139     0.557692     0.581818
140     0.571429     0.590909
141     0.577778     0.596154
142     0.584906     0.625000
143     0.641509     0.625000
144     0.652174     0.644444
145     0.680851     0.666667
146     0.681818     0.674419
147     0.744681     0.674419
148     0.790698     0.674419
149     0.818182     0.714286
150     0.921053     0.750000
151     0.923077     0.923077
152     0.947368     0.947368
153          NaN     1.000000
154          NaN     1.000000
155          NaN     1.000000
156          NaN     1.000000
157          NaN          NaN


# Calculating jaccard distance between GPT and initial writing for control and treatment

In [17]:
#tokens_treatment1 = df['Q10'].apply(tokenize_text)
#tokens_treatment2 = df['Q12'].apply(tokenize_text)
#tokens_control1 = df['Q14'].apply(tokenize_text)
#tokens_control2 = df['Q17'].apply(tokenize_text)

for i in df.index:
    tokens1 = tokens_treatment1[i]
    tokens2 = tokens_chatgpt
    if tokens1 and tokens2:
        df.loc[i, 'jaccard_gpt_initial'] = jaccard_similarity(set(tokens1), set(tokens2))
    tokens1 = tokens_control1[i]
    tokens2 = tokens_chatgpt
    if tokens1 and tokens2:
        df.loc[i, 'jaccard_gpt_initial'] = jaccard_similarity(set(tokens1), set(tokens2))

gpt_initial_0 = df[df['condition_experiment'] == 0]['jaccard_gpt_initial']
gpt_final_0 = df[df['condition_experiment'] == 0]['jaccard_gpt']
gpt_initial_1 = df[df['condition_experiment'] == 1]['jaccard_gpt_initial']
gpt_final_1 = df[df['condition_experiment'] == 1]['jaccard_gpt']

column_0 = gpt_initial_1.sort_values()
column_1 = gpt_final_1.sort_values()

# Perform the t-test
t_statistic, p_value = stats.ttest_ind(column_0, column_1, nan_policy='omit')

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)


T-Statistic: -7.9014127991289955
P-Value: 4.733774934507111e-14


# Save output for Jaccard indices

In [19]:
df.to_csv('/'.join(['output','jaccard_analysis_extended.csv']), index=False)