# Pre-Processing of Platform Data -  Group without Randomization

Completely done in Python. There are no steps required in Excel.

v1_28.03.2024

Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw platform export.
# 'user_id' contains both numeric-looking IDs (e.g. 1000300) and alphanumeric
# codes (e.g. ABO2606). Read it explicitly as text so every ID is a string
# regardless of file size; with inferred types pandas may parse large files
# chunk-wise and leave a mix of ints and strings in the same column.
df = pd.read_csv(
    '2024_03_22_Main.experimental_design.csv',
    dtype={'user_id': str},
)

In [3]:
# Preview the first rows (head() shows five by default) to check the raw
# column layout before any cleaning is applied
df.head()

Unnamed: 0,_id,user_id,group_nr,task_nr,tracking type,action,timestamp,who,text
0,65ddae33def0ea9b7c8f2dcb,1000300,1.0,-1,,,,,
1,65ddae33def0ea9b7c8f2dcc,1000300,1.0,-1,MOUSE,Intro submit button clicked!,27/02/2024 09:41:07,,
2,65ddae35def0ea9b7c8f2dcd,1000300,1.0,1,MOUSE,Answer box is selected!,27/02/2024 09:41:09,,
3,65ddae36def0ea9b7c8f2dce,1000300,1.0,1,MOUSE,Answer box is changed!,27/02/2024 09:41:10,,
4,65ddae3cdef0ea9b7c8f2dcf,1000300,1.0,1,MOUSE,Answer box is de-selected!,27/02/2024 09:41:16,,


In [4]:
# Remove the columns that the prompt analysis below does not use
columns_to_drop = [
    "_id",
    "task_nr",
    "tracking type",
    "action",
    "timestamp",
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Relabel the ID column: "user_id" becomes "VPNCode"
column_renames = {"user_id": "VPNCode"}
df = df.rename(columns=column_renames)

In [6]:
# Restrict the data to the rows of the listed participants (matched on VPNCode)
vpn_codes_to_keep = [
    "ABO2606", "APE2704", "SSG0102", "ECH2807",
    "AST1210", "IHE1103", "OME0709", "NBA0408",
    "AOD2103", "IME0910", "VBA1706", "AFU0708",
    "USG0407",
]

keep_mask = df['VPNCode'].isin(vpn_codes_to_keep)
df = df.loc[keep_mask]

In [7]:
# Keep only rows in which both 'who' and 'text' are filled in.
# At this point df is a boolean-filtered selection of the loaded data; calling
# dropna(..., inplace=True) on such a selection can trigger pandas'
# SettingWithCopyWarning, so the result is assigned back to the name instead.
df = df.dropna(subset=['who', 'text'])

In [8]:
# Inspect what is left after filtering: only rows of the selected VPNCodes
# that have both 'who' and 'text' populated
df

Unnamed: 0,VPNCode,group_nr,who,text
249,ABO2606,1.0,user (answering),One-Pager - Kommunikationsstrategie zur Einfue...
271,ABO2606,1.0,user (prompting),Ich benötige Unterstützung bei der Entwicklung...
272,ABO2606,1.0,GPT,"Veranstaltungskonzept: ""Digitale Horizonte: Ge..."
281,ABO2606,1.0,user (prompting),Welche neuen Erkenntnisse können die Führungsk...
282,ABO2606,1.0,GPT,"Konzept für die Veranstaltung ""Führungsdialog ..."
...,...,...,...,...
1030,USG0407,5.0,GPT,Konzept: Eintägiges Führungskräfte-Forum zur D...
1036,USG0407,5.0,user (prompting),Konzept finalisieren und endgültig erstellen
1037,USG0407,5.0,GPT,"1. Titel der Veranstaltung: ""Digitale Transfor..."
1044,USG0407,5.0,user (answering),Konzept: Eintägiges Führungskräfte-Forum zur D...


In [9]:
# Keep just the rows whose 'who' value is exactly 'user (prompting)'
is_prompt = df['who'].eq('user (prompting)')
prompting_df = df.loc[is_prompt]

In [10]:
# Build one summary row per participant (VPNCode) from their prompts.
# Reuses prompting_df from the previous cell instead of filtering df a second
# time, and splits each prompt on whitespace once rather than once per statistic.
# No empty-group guard is needed: groupby only yields groups that contain rows.
prompt_words = prompting_df.assign(
    word_count=prompting_df['text'].str.split().str.len()
)

prompting_summary = prompt_words.groupby('VPNCode').agg(
    All_Prompts=('text', ' || '.join),                # all prompts joined with ' || ' in row order
    Prompt_Count=('text', 'count'),                   # number of prompts
    Average_Word_Count=('word_count', 'mean'),        # mean words per prompt
    First_Prompt_Word_Count=('word_count', 'first'),  # words in the first prompt (row order)
    Total_Word_Count=('word_count', 'sum'),           # words across all prompts
).reset_index()

In [11]:
# Attach each participant's prompt summary to every one of their rows in df.
# Left join: participants that have rows in df but no prompts are kept, with
# NaN in the summary columns (fill or handle these downstream as needed).
df_merged = df.merge(prompting_summary, how='left', on='VPNCode')

In [12]:
# The per-message columns are no longer needed once the summary is attached
message_columns = ['who', 'text']
df_merged = df_merged.drop(columns=message_columns)

In [13]:
# df_merged still holds one row per remaining message, each carrying the same
# participant-level summary; keep the first row per VPNCode so that every
# participant appears exactly once
df_final = df_merged.drop_duplicates(subset='VPNCode', keep='first')

In [14]:
# Display the result: one row per VPNCode with group_nr and the prompt
# summary columns (the index keeps the row positions from df_merged)
df_final

Unnamed: 0,VPNCode,group_nr,All_Prompts,Prompt_Count,Average_Word_Count,First_Prompt_Word_Count,Total_Word_Count
0,ABO2606,1.0,Ich benötige Unterstützung bei der Entwicklung...,3,39.333333,100,118
8,APE2704,3.0,\nHilf mir diese Aufgabe zu lösen:\n\n\nEs ist...,4,108.75,128,435
18,SSG0102,3.0,Wie wichtig ist Nachhaltigkeit heute || wie ka...,7,11.571429,5,81
34,ECH2807,4.0,"Hey ChatGPT, du bist ein Marketing-Genie, das ...",4,55.75,57,223
43,AST1210,3.0,"Hallo, kannst du mir ein Kommunikationskonzept...",2,20.5,27,41
49,IHE1103,2.0,Hilf mir bei folgender Aufgabe - du wirst von ...,3,22.333333,52,67
57,OME0709,4.0,"Hilfe mir beim organisieren einer umfassende, ...",4,9.5,17,38
67,NBA0408,2.0,Du bist ein Mitarbeiter in einem Unternehmen u...,1,117.0,117,117
71,AOD2103,1.0,"Hilfe mir, eine kurze aber informative eintägi...",4,42.5,40,170
81,IME0910,4.0,Du bist ein expert in a consulting firma und u...,2,29.0,33,58


In [15]:
# Export the per-participant table as a comma-separated file without the index.
# 'utf-8-sig' prepends a byte-order mark - presumably so that Excel detects
# UTF-8 when opening the file (TODO confirm).
output_path = 'data_prep_cleaned.csv'
df_final.to_csv(output_path, sep=',', encoding='utf-8-sig', index=False)