# Pre-Processing of Platform Data -  Group with Randomization

Completely done in Python. There are no steps required in Excel.

v1_25.03.2024

Cleaning

In [41]:
import pandas as pd
import numpy as np

In [29]:
# Read the raw platform export (CSV) into the working DataFrame
df = pd.read_csv(filepath_or_buffer='2024_03_22_Main.experimental_design.csv')

In [30]:
# Preview the first five rows to check the columns that were read
df.head(n=5)

Unnamed: 0,_id,user_id,group_nr,task_nr,tracking type,action,timestamp,who,text
0,65ddae33def0ea9b7c8f2dcb,1000300,1.0,-1,,,,,
1,65ddae33def0ea9b7c8f2dcc,1000300,1.0,-1,MOUSE,Intro submit button clicked!,27/02/2024 09:41:07,,
2,65ddae35def0ea9b7c8f2dcd,1000300,1.0,1,MOUSE,Answer box is selected!,27/02/2024 09:41:09,,
3,65ddae36def0ea9b7c8f2dce,1000300,1.0,1,MOUSE,Answer box is changed!,27/02/2024 09:41:10,,
4,65ddae3cdef0ea9b7c8f2dcf,1000300,1.0,1,MOUSE,Answer box is de-selected!,27/02/2024 09:41:16,,


In [31]:
# Drop the tracking columns that are not used in the steps below.
# The result is reassigned instead of using inplace=True, so the cell
# produces a new frame rather than mutating the loaded one.
columns_to_drop = ["_id", "task_nr", "tracking type", "action", "timestamp"]
df = df.drop(columns=columns_to_drop)

In [32]:
# Rename the column "user_id" to "VPNCode".
# Reassigned rather than renamed in place, so no existing frame is mutated.
df = df.rename(columns={"user_id": "VPNCode"})

In [33]:
# Keep only the rows whose VPNCode is one of the participant codes listed here
vpn_codes_to_keep = [
    "AHO1006", "AWA2505", "EZA0703", "AGU2207", "NPR1810",
    "RBO0106", "DPR0408", "EDE0211", "OBA0612", "LCH0603",
    "IST0303", "EBO1608", "EUN2402", "ABE0509", "SME3112",
    "OLI1612", "NBA1601",
]

# Boolean mask: True where the row's VPNCode is in the list above
is_selected_participant = df['VPNCode'].isin(vpn_codes_to_keep)
df = df[is_selected_participant]

In [34]:
# Drop rows where either 'who' or 'text' is missing (NaN).
# df is the result of a boolean filter at this point, so the cleaned frame is
# reassigned instead of being modified with inplace=True.
df = df.dropna(subset=['who', 'text'])

In [35]:
# Inspect the frame after the filtering and NaN-removal steps.
# A bare expression on the last line of a cell is rendered by the notebook.
df

Unnamed: 0,VPNCode,group_nr,who,text
1059,AHO1006,2.0,user (answering),Wir möchten das Thema Nachhaltigkeit in unsere...
1073,AHO1006,2.0,user (prompting),Lösen Sie die Aufgabe aus einer Ich Perspektiv...
1074,AHO1006,2.0,GPT,"Titel der Veranstaltung: ""Generative KI: Pioni..."
1079,AHO1006,2.0,user (answering),"Titel der Veranstaltung: ""Generative KI: Pioni..."
1086,AWA2505,2.0,user (answering),"Das Unternehmen wird, aufgrund der zunehmenden..."
...,...,...,...,...
1721,OLI1612,2.0,user (answering),\nHier ist ein Vorschlag für eine Veranstaltun...
1732,NBA1601,2.0,user (answering),Nachhaltigkeitsstrategie der Mobilität AG St. ...
1749,NBA1601,2.0,user (prompting),Du hast folgende Aufgabe: Du bist Consultant i...
1750,NBA1601,2.0,GPT,"Konzept: ""Shaping the Future: Digitale Transfo..."


In [36]:
# Keep only the rows whose 'who' value is exactly 'user (prompting)'
is_user_prompt = df['who'].eq('user (prompting)')
prompting_df = df[is_user_prompt]

In [42]:
# Build one summary row per VPNCode from the 'user (prompting)' rows.
# prompting_df (created in the previous cell) already holds exactly those rows,
# so the filter is not repeated here.

# Whitespace-separated word count of every prompt, computed once and reused by
# the three word-count aggregates below.
prompt_words = prompting_df.assign(
    Word_Count=prompting_df['text'].str.split().str.len()
)

prompting_summary = prompt_words.groupby('VPNCode').agg(
    All_Prompts=('text', ' || '.join),               # All prompts joined with ' || '
    Prompt_Count=('text', 'count'),                  # Number of prompts
    Average_Word_Count=('Word_Count', 'mean'),       # Mean words per prompt
    First_Prompt_Word_Count=('Word_Count', 'first'), # Words in the first prompt (row order)
    Total_Word_Count=('Word_Count', 'sum'),          # Words summed over all prompts
).reset_index()

In [43]:
# Attach each participant's prompt summary to every one of their rows in df.
# how='left' keeps every row of df; VPNCodes that have no entry in
# prompting_summary get NaN in the summary columns.
df_merged = df.merge(prompting_summary, how='left', on='VPNCode')

In [44]:
# Drop the row-level 'who' and 'text' columns, which are no longer needed.
# Reassigned rather than dropped in place, so df_merged is rebound to a new
# frame instead of being mutated.
df_merged = df_merged.drop(columns=['who', 'text'])

In [45]:
# The merge repeated the per-participant summary on every row of that participant;
# keep only the first row for each VPNCode (the original index labels are retained)
df_final = df_merged.drop_duplicates(subset='VPNCode', keep='first')

In [46]:
# df_final is the cleaned and reorganized DataFrame: after drop_duplicates on
# 'VPNCode' it holds one row per participant code. Display it for inspection.
df_final

Unnamed: 0,VPNCode,group_nr,All_Prompts,Prompt_Count,Average_Word_Count,First_Prompt_Word_Count,Total_Word_Count
0,AHO1006,2.0,Lösen Sie die Aufgabe aus einer Ich Perspektiv...,1,105.0,105,105
4,AWA2505,2.0,Ich muss ein Konzept für eine eintägige Verans...,2,18.5,25,37
10,EZA0703,5.0,Du arbeitest in einem Büro. Weisst wie man gut...,2,44.0,65,88
16,AGU2207,7.0,Bitte hilf mir bei der Erstellung einer Kommun...,2,26.0,38,52
22,NPR1810,7.0,Bitte nenne Ideen für mehr Nachhaltigkeit in U...,1,8.0,8,8
26,RBO0106,8.0,"Entwickeln Sie eine umfassende, einwöchige Kom...",1,24.0,24,24
30,DPR0408,5.0,Kannst du mir den Ablauf für eine eintägige V...,4,15.0,25,60
40,EDE0211,7.0,"GPT, ich stehe vor dieser Arbeit und lass uns ...",10,40.9,141,409
62,OBA0612,4.0,Ich soll eine einwöchige Kommunikationsstrateg...,1,116.0,116,116
66,LCH0603,3.0,kreiere eine einwöchige Kommunikationsstrateg...,2,35.5,13,71


In [49]:
# Write the final table to a new comma-separated file without the index column.
# 'utf-8-sig' is UTF-8 with a leading byte-order mark.
df_final.to_csv(
    'data_prep_cleaned.csv',
    sep=',',
    encoding='utf-8-sig',
    index=False,
)