# Pre-Processing of Platform Data -  Group with Randomization

Data for the Validation of Constructs

Some cleaning steps were directly done in Excel:
- Replaced the German column names with the English ones
- Removed columns that were different from the Lab Data

-> Look at readme-data file for more info

v2_18.04.2024

Cleaning

In [3]:
import pandas as pd
import numpy as np

In [43]:
# Load the dataset
df = pd.read_csv('2024_04_14_results-platform.csv')

In [44]:
# Take a look at the data structure
df.head()

Unnamed: 0,_id,user_id,group_nr,task_nr,tracking type,action,timestamp,who,text
0,65ddae33def0ea9b7c8f2dcb,1000300,1.0,-1,,,,,
1,65ddae33def0ea9b7c8f2dcc,1000300,1.0,-1,MOUSE,Intro submit button clicked!,27/02/2024 09:41:07,,
2,65ddae35def0ea9b7c8f2dcd,1000300,1.0,1,MOUSE,Answer box is selected!,27/02/2024 09:41:09,,
3,65ddae36def0ea9b7c8f2dce,1000300,1.0,1,MOUSE,Answer box is changed!,27/02/2024 09:41:10,,
4,65ddae3cdef0ea9b7c8f2dcf,1000300,1.0,1,MOUSE,Answer box is de-selected!,27/02/2024 09:41:16,,


In [45]:
# Drop the unnecessary columns
columns_to_drop = ["_id", "task_nr", "tracking type", "action", "timestamp"]
df.drop(columns=columns_to_drop, inplace=True)

In [46]:
# Rename the column "user_id" to "VPNCode"
df.rename(columns={"user_id": "VPNCode"}, inplace=True)

In [47]:
# Filter the dataset to include only rows with the specified VPNCode values
vpn_codes_to_keep = [
    "ZZEBA0308", "ZZNRO406", "ZZSSA0301", "ZZAWA0212", "ZZKO1608",
    "ZZNWA1604", "ZZALA3008", "ZZUVI2004", "ZZAUL2008", "ZZRHA2012",
    "ZZUWA3006", "ZZAOT0412", "ZZNBI2609", "ZZUST0208", "ZZASE1301",
    "ZZELU0612", "ZZUFL0505", "ZZEDR2902", "ZZEAN0306", "ZZIBI2311",
    "ZZEKI2929", "ZZAFE1203", "ZZIWA2610", "ZZAGE607", "ZZAH0702",
    "ZZEFO1204", "ZZIBO2610", "ZZAKO1412", "ZZADE1405", "ZZDSC0806",
    "ZZEDH1501", "ZZINe2902", "ZZHMU0112", "ZZAKY2105", "ZZeKa1405",
    "ZZIKO0401", "ZZAZW0718", "ZZYPE1411", "ZZANI25", "ZZOHA2303",
    "ZZAWH2506", "ZZize1717", "ZZRDO1608", "ZZEOT3110", "ZZICO1912",
    "ZZTHA1907", "ZZAMA0101", "ZZESA3011", "ZZEFR1909", "ZZEES0811",
    "ZZAKI601", "klief99", "ZZAMU1201"
]

df = df[df['VPNCode'].isin(vpn_codes_to_keep)]

In [48]:
# Drop rows where either 'who' or 'text' column is empty
df.dropna(subset=['who', 'text'], inplace=True)

In [49]:
# Look at the data
df.head()

Unnamed: 0,VPNCode,group_nr,who,text
2036,ZZUVI2004,5.0,user (answering),"Erstellen Sie eine Veranstaltung, die sowohl i..."
2057,ZZKO1608,2.0,user (answering),\nDie Kommunikationsstrategie zur Einführung v...
2084,ZZEBA0308,6.0,user (prompting),Bitte hilf mir bei folgender aufgabe:\nEs ist ...
2085,ZZEBA0308,6.0,GPT,# Eintägige Veranstaltung zur digitalen Transf...
2093,ZZSSA0301,5.0,user (answering),Zielsetzung:\nErstellen Sie eine Veranstaltung...


In [50]:
# Filter out rows where 'who' is not 'user (prompting)'
prompting_df = df[df['who'] == 'user (prompting)']

In [51]:
# Group by 'VPNCode' and aggregate prompts into a single string, count them, and calculate some potentially interesting values
prompting_summary = df[df['who'] == 'user (prompting)'].groupby('VPNCode').agg(
    All_Prompts=('text', ' || '.join),  # Concatenate all prompts
    Prompt_Count=('text', 'count'),  # Count prompts
    Average_Word_Count=('text', lambda x: np.mean([len(prompt.split()) for prompt in x])),  # Average words per prompt
    First_Prompt_Word_Count=('text', lambda x: len(x.iloc[0].split()) if not x.empty else 0),  # Words in the first prompt (as it could show how clearly people prompted and if they used the framework)
    Total_Word_Count=('text', lambda x: sum(len(prompt.split()) for prompt in x))  # Total words in all prompts
).reset_index()

In [52]:
# Merge this summary back with the original dataframe to associate each user with their prompting summary
# Left join to ensure all users are included
df_merged = pd.merge(df, prompting_summary, on='VPNCode', how='left')

In [53]:
# Drop the 'who' and 'text' columns as they are no longer needed
df_merged.drop(columns=['who', 'text'], inplace=True)

In [54]:
# Remove duplicate rows that may have resulted from the merge (since we've aggregated prompting info at the user level)
df_final = df_merged.drop_duplicates(subset=['VPNCode'])

In [55]:
# Cleaned and reorganized DataFrame
df_final.head()

Unnamed: 0,VPNCode,group_nr,All_Prompts,Prompt_Count,Average_Word_Count,First_Prompt_Word_Count,Total_Word_Count
0,ZZUVI2004,5.0,,,,,
1,ZZKO1608,2.0,Entwickeln Sie ein Konzept für eine denkwürdig...,1.0,54.0,54.0,54.0
2,ZZEBA0308,6.0,Bitte hilf mir bei folgender aufgabe:\nEs ist ...,1.0,116.0,116.0,116.0
4,ZZSSA0301,5.0,,,,,
5,ZZALA3008,5.0,"Hilf mir, ein Konzept für eine denkwürdige ein...",3.0,41.0,19.0,123.0


In [58]:
# As a lot of people were doing the experiment simoultanously so there are some missing values for the prompts. Not too big of a problem as this is not the main focus
# Check for missing values across the entire DataFrame
missing_values = df_final.isnull().sum()
print(missing_values[missing_values > 0])

All_Prompts                10
Prompt_Count               10
Average_Word_Count         10
First_Prompt_Word_Count    10
Total_Word_Count           10
dtype: int64


In [59]:
# Fill missing values with 0
df_final.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.fillna(0, inplace=True)


In [60]:
# Look at data
df_final.head()

Unnamed: 0,VPNCode,group_nr,All_Prompts,Prompt_Count,Average_Word_Count,First_Prompt_Word_Count,Total_Word_Count
0,ZZUVI2004,5.0,0,0.0,0.0,0.0,0.0
1,ZZKO1608,2.0,Entwickeln Sie ein Konzept für eine denkwürdig...,1.0,54.0,54.0,54.0
2,ZZEBA0308,6.0,Bitte hilf mir bei folgender aufgabe:\nEs ist ...,1.0,116.0,116.0,116.0
4,ZZSSA0301,5.0,0,0.0,0.0,0.0,0.0
5,ZZALA3008,5.0,"Hilf mir, ein Konzept für eine denkwürdige ein...",3.0,41.0,19.0,123.0


In [61]:
# Save DF in a new CSV-File 
df_final.to_csv('data_prep_cleaned.csv', index=False, encoding='utf-8-sig', sep=',')