# Pre-Processing of Data - Group with Randomization

Data for the Validation of Constructs

Some cleaning steps were done directly in Excel:
- Replaced the German column names with the English ones
- Removed columns that differed from the Lab Data

-> See the readme-data file for more information

v2_18.04.2024

Cleaning and Coding

In [1]:
import pandas as pd

In [16]:
# Read the survey results workbook into a DataFrame
survey_file = '2024_04_14_results-survey.xlsx'
df = pd.read_excel(survey_file)

In [17]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,id,submitdate,lastpage,Zufallsgeneratorstartwert,startdate,datestamp,VPNCode,Gender,AGE,Edu,...,MANI1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,TASKDIF2Time,PROD2Time,MANI2Time
0,1,2024-04-12 13:45:17,6.0,79774926,2024-04-12 13:04:09,2024-04-12 13:45:17,ZZAHE0508,Männlich,26.0,Bachelor,...,,52.08,,201.09,,,,,,
1,2,2024-04-12 13:40:12,6.0,1555391928,2024-04-12 13:04:27,2024-04-12 13:40:12,ZZEBA0308,Weiblich,25.0,Bachelor,...,,3.22,,238.06,,,,,,
2,3,2024-04-12 13:33:43,6.0,583800965,2024-04-12 13:04:38,2024-04-12 13:33:43,ZZNRO406,Weiblich,30.0,Bachelor,...,,417.03,,202.43,,,,,,
3,4,2024-04-12 13:30:12,6.0,716396962,2024-04-12 13:04:42,2024-04-12 13:30:12,ZZSSA0301,Weiblich,22.0,Abitur oder Fachabitur,...,,167.23,,386.9,,,,,,
4,5,,2.0,1985388160,2024-04-12 13:05:02,2024-04-12 13:09:52,ZZIPU0306,Weiblich,32.0,Bachelor,...,,,,,,,,,,


In [18]:
# Move "VPNCode" to the front; all other columns keep their relative order
other_columns = [name for name in df.columns if name != 'VPNCode']
df = df[['VPNCode', *other_columns]]
df.head()

Unnamed: 0,VPNCode,id,submitdate,lastpage,Zufallsgeneratorstartwert,startdate,datestamp,Gender,AGE,Edu,...,MANI1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,TASKDIF2Time,PROD2Time,MANI2Time
0,ZZAHE0508,1,2024-04-12 13:45:17,6.0,79774926,2024-04-12 13:04:09,2024-04-12 13:45:17,Männlich,26.0,Bachelor,...,,52.08,,201.09,,,,,,
1,ZZEBA0308,2,2024-04-12 13:40:12,6.0,1555391928,2024-04-12 13:04:27,2024-04-12 13:40:12,Weiblich,25.0,Bachelor,...,,3.22,,238.06,,,,,,
2,ZZNRO406,3,2024-04-12 13:33:43,6.0,583800965,2024-04-12 13:04:38,2024-04-12 13:33:43,Weiblich,30.0,Bachelor,...,,417.03,,202.43,,,,,,
3,ZZSSA0301,4,2024-04-12 13:30:12,6.0,716396962,2024-04-12 13:04:42,2024-04-12 13:30:12,Weiblich,22.0,Abitur oder Fachabitur,...,,167.23,,386.9,,,,,,
4,ZZIPU0306,5,,2.0,1985388160,2024-04-12 13:05:02,2024-04-12 13:09:52,Weiblich,32.0,Bachelor,...,,,,,,,,,,


In [19]:
# 'id' values of responses with problems; these rows are excluded from the data.
# NOTE(review): the reason for each exclusion is not recorded in this notebook -- consider
# documenting it (or pointing to where it is documented).
ids_to_drop = [
    5, 14, 17, 19, 24, 25, 26, 36, 37, 42, 49, 50, 52, 54, 56, 57,
    62, 65, 66, 76, 77, 81, 82, 1, 6, 7, 46, 51, 53, 74
]

# Keep only the rows whose 'id' is NOT in ids_to_drop.
# The explicit .copy() makes the filtered result an independent DataFrame, so the in-place
# drops and new-column assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df['id'].isin(ids_to_drop)].copy()

In [20]:
# Remove columns that are not needed further on.
# 'JC1[ICJD1]' is removed because it was not measured during the first design without randomization.
# The names are grouped by naming pattern purely for readability.
columns_to_drop = [
    # record-level fields
    'id', 'submitdate', 'lastpage', 'startdate', 'datestamp', 'interviewtime', 'LinktoTool',
    # columns named '<question>Time'
    'VPNCodeTime', 'GenderTime', 'AGETime', 'EduTime', 'WORKTime', 'AILiteracyTime',
    'PGATTime', 'NGATTime', 'CMVTime', 'NEOTime', 'ERKTime', 'LinktoToolTime',
    'TESTTime', 'JC1Time', 'SE1Time', 'SDT1Time', 'PROD1Time', 'TASKDIF1Time',
    'MANI1Time', 'E2Time', 'JC2Time', 'SE2Time', 'SDT2Time', 'PROD2Time',
    'TASKDIF2Time', 'MANI2Time',
    # columns named 'groupTime<number>'
    'groupTime531', 'groupTime532', 'groupTime533', 'groupTime534', 'groupTime535', 'groupTime536',
    # participant attributes
    'AGE', 'WORK', 'Gender', 'Edu',
    # remaining questions/items that are not used
    'E2', 'ERK', 'TEST', 'JC2[AC]', 'SE1', 'SE2', 'MANI1', 'MANI2', 'JC1[ICJD1]',
    'SDT1[SDT1]', 'SDT1[SDT2]', 'SDT2[2SDT1]', 'SDT2[2SDT2]',
    'PROD1[SQ001]', 'TASKDIF1[SQ001]', 'PROD2[SQ001]', 'TASKDIF2[SQ001]',
]
df = df.drop(columns=columns_to_drop)

In [21]:
# Select every row that has at least one missing value
rows_with_gaps = df.loc[df.isna().any(axis=1)]

# For each such row, print its VPNCode together with the names of the empty columns
for _, record in rows_with_gaps.iterrows():
    empty_columns = record.index[record.isna()].tolist()
    print(f"VPNCode: {record['VPNCode']}, Missing in columns: {empty_columns}")

In [22]:
# Numeric codes for the German Likert answer labels (author's note: changes were made to this mapping).
# The four-label satisfaction and difficulty scales use the codes 1, 2, 4, 5 (no 3).
agreement_labels = [
    "Trifft gar nicht zu",
    "Trifft eher nicht zu",
    "Teils, teils",
    "Trifft teilweise zu",
    "Trifft voll und ganz zu",
]
satisfaction_labels = [
    "Gar nicht zufriedenstellend",
    "Eher nicht zufriedenstellend",
    "Eher zufriedenstellend",
    "Voll zufriedenstellend",
]
difficulty_labels = [
    "Extrem schwierig",
    "Eher schwierig",
    "Eher leicht",
    "Extrem leicht",
]

likert_mapping = {
    **dict(zip(agreement_labels, (1, 2, 3, 4, 5))),
    **dict(zip(satisfaction_labels, (1, 2, 4, 5))),
    **dict(zip(difficulty_labels, (1, 2, 4, 5))),
}

In [23]:
# Recode Likert labels to numeric codes in every object-dtype column that contains at least one
# known label. Series.map() turns every value that is not a key of likert_mapping into NaN, so
# labels missing from the mapping (typos, new answer options) are reported here instead of
# silently disappearing from the data.
for column in df.select_dtypes(include='object').columns:  # only object-type columns need conversion
    answers = df[column]
    if not answers.isin(likert_mapping.keys()).any():
        continue  # column holds no Likert labels -> leave it untouched
    recoded = answers.map(likert_mapping)
    # Values that were present before the recoding but have no code afterwards
    unmapped = answers[answers.notna() & recoded.isna()]
    if not unmapped.empty:
        print(f"Warning: column {column!r} contains values without a Likert code "
              f"(set to NaN): {sorted(unmapped.astype(str).unique())}")
    df[column] = recoded

In [25]:
# AI Literacy constructs: each score is the row-wise mean of the listed item columns.
# DataFrame.mean skips NaN by default, so a row with missing items is averaged over the
# items that are present.
ai_literacy_items = {
    'AILiteracy[Use]': ['AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]',
                        'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]'],
    'AILiteracy[Kno]': ['AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]',
                        'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]'],
    'AILiteracy[Det]': ['AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]'],
    'AILiteracy[Eth]': ['AILiteracy[Eth1]', 'AILiteracy[Eth3]'],
}
for construct, items in ai_literacy_items.items():
    df[construct] = df[items].mean(axis=1)

In [26]:
# General Attitudes towards AI constructs: row-wise mean of the three items of each scale
for scale in ('PGAT', 'NGAT'):
    scale_items = [f'{scale}[{scale}{number}]' for number in (1, 2, 3)]
    df[scale] = df[scale_items].mean(axis=1)

In [27]:
# Common Method Bias construct: row-wise mean of the three CMV items
cmv_items = ['CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]']
df['CMV'] = df.loc[:, cmv_items].mean(axis=1)

In [28]:
# Big Five Personality Traits (NEO) constructs: each score is the row-wise mean of its items.
# NOTE(review): item names ending in 'R' look like reverse-keyed items; they are averaged
# here as-is -- confirm that they were already recoded before this step.
neo_items = {
    'NEO[E]': ['NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]'],
    'NEO[A]': ['NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]'],
    'NEO[C]': ['NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]'],
    'NEO[N]': ['NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]'],
    'NEO[O]': ['NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]'],
}
for trait, items in neo_items.items():
    df[trait] = df[items].mean(axis=1)

In [29]:
# Job Crafting constructs (JC1 and JC2 blocks): each score is the row-wise mean of its items.
# Note that JC2[HRJD] is built from items 1, 2, 5 and 6, while JC1[HRJD] uses items 1-3.
job_crafting_items = {
    'JC1[IStR]': ['JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]'],
    'JC1[HRJD]': ['JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]'],
    'JC2[IStR]': ['JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]'],
    'JC2[HRJD]': ['JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]'],
}
for construct, items in job_crafting_items.items():
    df[construct] = df[items].mean(axis=1)

In [30]:
# Preview the first five rows, now including the computed construct columns
df.head(n=5)

Unnamed: 0,VPNCode,Zufallsgeneratorstartwert,AILiteracy[Use1],AILiteracy[Use2],AILiteracy[Use3],AILiteracy[Use4],AILiteracy[Use5],AILiteracy[Use6],AILiteracy[Kno1],AILiteracy[Kno2],...,CMV,NEO[E],NEO[A],NEO[C],NEO[N],NEO[O],JC1[IStR],JC1[HRJD],JC2[IStR],JC2[HRJD]
1,ZZEBA0308,1555391928,5,5,5,5,5,5,4,5,...,3.666667,4.25,2.75,5.0,4.25,4.0,3.6,5.0,2.2,1.75
2,ZZNRO406,583800965,2,4,4,4,4,2,5,3,...,3.666667,3.0,3.25,3.75,3.25,4.0,2.4,3.333333,3.8,4.0
3,ZZSSA0301,716396962,5,5,5,5,5,5,5,5,...,5.0,3.5,3.0,4.0,2.25,4.0,3.2,2.333333,4.2,4.25
7,ZZAWA0212,875831755,5,5,5,5,5,5,5,5,...,2.666667,3.25,2.5,4.0,2.75,4.4,2.4,5.0,2.2,5.0
8,ZZKO1608,405906470,5,4,4,4,4,4,3,3,...,3.666667,2.75,2.5,3.25,2.5,2.8,3.8,3.333333,3.6,3.5


In [31]:
# Remove the 'Zufallsgeneratorstartwert' column, which is not needed in the exported file
df = df.drop(columns='Zufallsgeneratorstartwert')

In [33]:
# Show the names of all columns that remain in the DataFrame
column_names = list(df.columns)
print(column_names)

['VPNCode', 'AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]', 'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]', 'AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]', 'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]', 'AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]', 'AILiteracy[Eth1]', 'AILiteracy[Eth3]', 'PGAT[PGAT1]', 'PGAT[PGAT2]', 'PGAT[PGAT3]', 'NGAT[NGAT1]', 'NGAT[NGAT2]', 'NGAT[NGAT3]', 'CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]', 'NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]', 'NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]', 'NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]', 'NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]', 'NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]', 'JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]', 'JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]', 'JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]', 'JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]', 'AILiteracy[Use]', 'AILiteracy

In [34]:
# Write the prepared data to a new comma-separated file (UTF-8 with BOM, row index omitted)
output_file = 'data_prep_validation_pre-merge_survey.csv'
df.to_csv(output_file, sep=',', encoding='utf-8-sig', index=False)