# Pre-Processing of Data -  Group with Randomization

Data for the Validation of Constructs

v1_12.04.2024

Cleaning and Coding

In [45]:
import pandas as pd

In [46]:
# Read the formatted survey export (Excel workbook) into a DataFrame
survey_export_path = 'results-survey199433_formatted.xlsx'
df = pd.read_excel(survey_export_path)

In [47]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,VPNCode,Gender,AGE,...,MANI1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,18,2024-03-06 11:17:14,6,de,468020420,2024-03-06 10:58:20,2024-03-06 11:17:14,AHO1006,Männlich,26,...,,3.75,,133.28,,,,,,
1,19,2024-03-06 12:53:52,6,de,63043224,2024-03-06 12:13:14,2024-03-06 12:53:52,AWA2505,Weiblich,21,...,,913.02,,149.12,,,,,,
2,20,2024-03-07 10:39:51,6,de,1018593392,2024-03-07 10:07:31,2024-03-07 10:39:51,EZA0703,Männlich,22,...,,669.74,,151.61,,,,,,
3,21,2024-03-07 12:02:52,6,de,45327585,2024-03-07 11:19:01,2024-03-07 12:02:52,AGU2207,Männlich,23,...,,5.01,,831.72,,,,,,
4,22,2024-03-07 14:17:40,6,de,1480525241,2024-03-07 13:34:55,2024-03-07 14:17:40,NPR1810,Weiblich,22,...,,709.19,,150.17,,,,,,


In [48]:
# Put "VPNCode" in front; every other column keeps its relative order
cols = ['VPNCode']
cols.extend(name for name in df.columns if name != 'VPNCode')
df = df[cols]
df.head()

Unnamed: 0,VPNCode,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,Gender,AGE,...,MANI1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,AHO1006,18,2024-03-06 11:17:14,6,de,468020420,2024-03-06 10:58:20,2024-03-06 11:17:14,Männlich,26,...,,3.75,,133.28,,,,,,
1,AWA2505,19,2024-03-06 12:53:52,6,de,63043224,2024-03-06 12:13:14,2024-03-06 12:53:52,Weiblich,21,...,,913.02,,149.12,,,,,,
2,EZA0703,20,2024-03-07 10:39:51,6,de,1018593392,2024-03-07 10:07:31,2024-03-07 10:39:51,Männlich,22,...,,669.74,,151.61,,,,,,
3,AGU2207,21,2024-03-07 12:02:52,6,de,45327585,2024-03-07 11:19:01,2024-03-07 12:02:52,Männlich,23,...,,5.01,,831.72,,,,,,
4,NPR1810,22,2024-03-07 14:17:40,6,de,1480525241,2024-03-07 13:34:55,2024-03-07 14:17:40,Weiblich,22,...,,709.19,,150.17,,,,,,


In [50]:
# Drop columns that are not needed for the construct validation.
# JC1[ICJD1] is dropped because it was not measured during the first design
# (without randomization).
columns_to_drop = [
    'id', 'submitdate', 'lastpage', 'startlanguage', 'seed', 'LinktoTool', 'E2',
    'VPNCodeTime', 'GenderTime', 'AGETime', 'EduTime', 'WORKTime', 'AILiteracyTime',
    'PGATTime', 'NGATTime', 'CMVTime', 'NEOTime', 'ERKTime', 'LinktoToolTime',
    'TESTTime', 'JC1Time', 'SE1Time', 'SDT1Time', 'PROD1Time', 'TASKDIF1Time',
    'MANI1Time', 'E2Time', 'JC2Time', 'SE2Time', 'SDT2Time', 'PROD2Time',
    'TASKDIF2Time', 'MANI2Time', 'datestamp', 'startdate', 'interviewtime', 'ERK', 'TEST',
    'groupTime531', 'groupTime532', 'groupTime534', 'groupTime533', 'groupTime535', 'groupTime536',
    'AGE', 'WORK', 'Gender', 'Edu', 'JC2[AC]', 'SE1', 'SE2', 'MANI1', 'MANI2', 'JC1[ICJD1]',
    'SDT1[SDT1]', 'SDT1[SDT2]', 'SDT2[2SDT1]', 'SDT2[2SDT2]',
    'PROD1[SQ001]', 'TASKDIF1[SQ001]', 'PROD2[SQ001]', 'TASKDIF2[SQ001]',
]
# Reassign instead of using inplace=True: the frame is rebound to a new object
# rather than mutated. A label missing from the frame still raises KeyError,
# so a typo in the list above is not silently ignored.
df = df.drop(columns=columns_to_drop)

In [51]:
# Numeric codes (1-5) for the German response labels, grouped by answer scale.
# "Teils, teils" is the only label coded 3; the satisfaction and difficulty
# groups below define no label of their own for 3.
agreement_codes = {
    "Trifft gar nicht zu": 1,
    "Trifft eher nicht zu": 2,
    "Teils, teils": 3,
    "Trifft teilweise zu": 4,
    "Trifft voll zu": 5,
}
satisfaction_codes = {
    "Gar nicht zufriedenstellend": 1,
    "Eher nicht zufriedenstellend": 2,
    "Eher zufriedenstellend": 4,
    "Voll zufriedenstellend": 5,
}
difficulty_codes = {
    "Extrem schwierig": 1,
    "Eher schwierig": 2,
    "Eher leicht": 4,
    "Extrem leicht": 5,
}

# Single lookup table used to recode all Likert-type columns
likert_mapping = {**agreement_codes, **satisfaction_codes, **difficulty_codes}

In [52]:
# Recode every object-dtype column that contains at least one known Likert label.
# Series.map replaces values that have no entry in likert_mapping with NaN.
likert_labels = list(likert_mapping)
text_columns = df.select_dtypes(include='object').columns
for column in text_columns:
    contains_label = df[column].isin(likert_labels).any()
    if contains_label:
        df[column] = df[column].map(likert_mapping)

In [53]:
# List every participant (VPNCode) with at least one missing value, together
# with the affected columns. No printed line means the data is complete.
# Known cause: JC1 allows the answer "Keine Antwort", JC2 does not.
has_gap = df.isna().any(axis=1)
missing_info = df.loc[has_gap]
gap_matrix = missing_info.isna().to_numpy()
for vpn_code, gap_flags in zip(missing_info['VPNCode'], gap_matrix):
    # Boolean row mask -> labels of the columns that are empty for this participant
    missing_columns = missing_info.columns[gap_flags].tolist()
    print(f"VPNCode: {vpn_code}, Missing in columns: {missing_columns}")

VPNCode: OBA0612, Missing in columns: ['JC1[IStR2]', 'JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]']


In [54]:
# After discussion with Eva: for my MA I don't want to lose any data, so instead
# of dropping the row the gaps are filled with the mean value of the
# corresponding construct.
# NOTE(review): 4.25 and 4 are hard-coded; how they were derived is not shown
# in this notebook — confirm the source of these values.
is_oba0612 = df['VPNCode'] == 'OBA0612'

# JC1[IStR2] -> 4.25
df.loc[is_oba0612, 'JC1[IStR2]'] = 4.25

# JC1[HRJD1], JC1[HRJD2] and JC1[HRJD3] -> 4
hrjd_gap_columns = ['JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]']
df.loc[is_oba0612, hrjd_gap_columns] = 4

In [55]:
# AI literacy sub-scales: each score is the row-wise mean of its items
# NOTE(review): 'AILiteracy[Eth]' averages Eth1 and Eth3 only — if an Eth2 item
# exists, confirm that leaving it out is intended.
ai_literacy_items = {
    'AILiteracy[Use]': ['AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]',
                        'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]'],
    'AILiteracy[Kno]': ['AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]',
                        'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]'],
    'AILiteracy[Det]': ['AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]'],
    'AILiteracy[Eth]': ['AILiteracy[Eth1]', 'AILiteracy[Eth3]'],
}
for construct, items in ai_literacy_items.items():
    df[construct] = df[items].mean(axis=1)

In [56]:
# General attitudes towards AI (PGAT / NGAT): row-wise mean of three items each
attitude_items = {
    'PGAT': ['PGAT[PGAT1]', 'PGAT[PGAT2]', 'PGAT[PGAT3]'],
    'NGAT': ['NGAT[NGAT1]', 'NGAT[NGAT2]', 'NGAT[NGAT3]'],
}
for construct, items in attitude_items.items():
    df[construct] = df[items].mean(axis=1)

In [57]:
# Common method bias (CMV): row-wise mean of the three CMV items
cmv_items = ['CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]']
df['CMV'] = df[cmv_items].mean(axis=1)

In [58]:
# Big Five personality traits (NEO): each trait score is the row-wise mean of its items
# NOTE(review): items with an "R" suffix look reverse-keyed and are averaged here
# without any reversal — confirm they were already recoded before this step.
neo_items = {
    'NEO[E]': ['NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]'],
    'NEO[A]': ['NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]'],
    'NEO[C]': ['NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]'],
    'NEO[N]': ['NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]'],
    'NEO[O]': ['NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]'],
}
for construct, items in neo_items.items():
    df[construct] = df[items].mean(axis=1)

In [59]:
# Job crafting constructs for both measurements (JC1, JC2): row-wise item means
# NOTE(review): JC1[HRJD] uses items 1-3 while JC2[HRJD] uses items 1, 2, 5, 6 —
# confirm that the differing item sets are intended.
job_crafting_items = {
    'JC1[IStR]': ['JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]'],
    'JC1[HRJD]': ['JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]'],
    'JC2[IStR]': ['JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]'],
    'JC2[HRJD]': ['JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]'],
}
for construct, items in job_crafting_items.items():
    df[construct] = df[items].mean(axis=1)

In [60]:
# Preview the first five rows, including the newly added construct scores
df.head(n=5)

Unnamed: 0,VPNCode,AILiteracy[Use1],AILiteracy[Use2],AILiteracy[Use3],AILiteracy[Use4],AILiteracy[Use5],AILiteracy[Use6],AILiteracy[Kno1],AILiteracy[Kno2],AILiteracy[Kno3],...,CMV,NEO[E],NEO[A],NEO[C],NEO[N],NEO[O],JC1[IStR],JC1[HRJD],JC2[IStR],JC2[HRJD]
0,AHO1006,4,5,4,5,5,5,5,4,5,...,4.666667,3.0,4.0,3.75,1.75,3.4,2.4,2.333333,2.8,5.0
1,AWA2505,4,4,3,4,4,4,2,4,2,...,4.0,2.5,2.5,3.5,2.75,4.0,3.4,2.0,3.2,4.5
2,EZA0703,4,5,5,5,3,3,4,3,4,...,4.333333,3.25,3.0,3.5,3.0,3.8,4.2,4.333333,3.8,2.0
3,AGU2207,5,4,4,4,4,4,4,2,3,...,4.666667,3.0,2.75,3.25,3.75,4.0,3.6,3.666667,3.0,2.25
4,NPR1810,4,4,3,4,3,3,4,4,2,...,1.666667,3.25,3.75,4.0,2.75,4.0,2.8,2.666667,3.6,1.5


In [61]:
# Write the prepared data to a new CSV file: comma-separated, UTF-8 with BOM,
# without the DataFrame index
output_path = 'data_prep_validation2_pre-merge_survey.csv'
df.to_csv(output_path, sep=',', encoding='utf-8-sig', index=False)