# Pre-Processing of Data - Group without Randomization

Data for the Validation of Constructs

v1_12.04.2024

Cleaning and Coding

In [1]:
import warnings

import pandas as pd

In [2]:
# Read the raw survey export (Excel workbook) into a DataFrame
survey_xlsx = '20240308_results-JCsurvey_diff-headers-excel.xlsx'
df = pd.read_excel(survey_xlsx)

In [3]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,VPNCode,Gender,AGE,...,TASKDIF1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,2,2024-02-28 10:43:00,6,de,489428441,2024-02-28 10:08:00,2024-02-28 10:43:00,ABO2606,Männlich,22,...,,4.01,,690.58,,,,,,
1,3,2024-02-28 13:05:00,6,de,193910323,2024-02-28 12:42:00,2024-02-28 13:05:00,APE2704,Männlich,20,...,,10.81,,683.1,,,,,,
2,4,2024-02-28 14:12:00,6,de,29352144,2024-02-28 13:38:00,2024-02-28 14:12:00,SSG0102,Männlich,20,...,,894.66,,157.54,,,,,,
3,5,2024-02-28 15:32:00,6,de,1443700134,2024-02-28 14:46:00,2024-02-28 15:32:00,ECH2807,Männlich,21,...,,1174.46,,109.27,,,,,,
4,6,2024-02-29 09:15:00,6,de,2009257017,2024-02-29 08:47:00,2024-02-29 09:15:00,AST1210,Männlich,25,...,,386.73,,177.58,,,,,,


In [4]:
# Put "VPNCode" in front; every other column keeps its relative order
cols = ['VPNCode']
cols.extend(name for name in df.columns if name != 'VPNCode')
df = df[cols]
df.head(n=5)

Unnamed: 0,VPNCode,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,Gender,AGE,...,TASKDIF1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,ABO2606,2,2024-02-28 10:43:00,6,de,489428441,2024-02-28 10:08:00,2024-02-28 10:43:00,Männlich,22,...,,4.01,,690.58,,,,,,
1,APE2704,3,2024-02-28 13:05:00,6,de,193910323,2024-02-28 12:42:00,2024-02-28 13:05:00,Männlich,20,...,,10.81,,683.1,,,,,,
2,SSG0102,4,2024-02-28 14:12:00,6,de,29352144,2024-02-28 13:38:00,2024-02-28 14:12:00,Männlich,20,...,,894.66,,157.54,,,,,,
3,ECH2807,5,2024-02-28 15:32:00,6,de,1443700134,2024-02-28 14:46:00,2024-02-28 15:32:00,Männlich,21,...,,1174.46,,109.27,,,,,,
4,AST1210,6,2024-02-29 09:15:00,6,de,2009257017,2024-02-29 08:47:00,2024-02-29 09:15:00,Männlich,25,...,,386.73,,177.58,,,,,,


In [5]:
# Drop the columns that are not needed for the following steps
columns_to_drop = ['id', 'submitdate', 'lastpage', 'startlanguage', 'seed', 'LinktoTool', 'E2', 
                   'VPNCodeTime', 'GenderTime', 'AGETime', 'EduTime', 'WORKTime', 'AILiteracyTime', 
                   'PGATTime', 'NGATTime', 'CMVTime', 'NEOTime', 'ERKTime', 'LinktoToolTime', 
                   'TESTTime', 'JC1Time', 'SE1Time', 'SDT1Time', 'PROD1Time', 'TASKDIF1Time', 
                   'E2Time', 'JC2Time', 'SE2Time', 'SDT2Time', 'PROD2Time', 
                   'TASKDIF2Time', 'MANI2Time', 'datestamp', 'startdate', 'interviewtime', 'ERK', 'TEST',
                   'groupTime531', 'groupTime532', 'groupTime534', 'groupTime533', 'groupTime535', 'groupTime536',
                   'AGE', 'WORK', 'Gender', 'Edu', 'JC2[AC]', 'SE1', 'SE2', 'MANI2', 
                   'SDT1[SDT1]', 'SDT1[SDT2]', 'SDT2[2SDT1]', 'SDT2[2SDT2]', 'PROD1[SQ001]', 'TASKDIF1[SQ001]', 'PROD2[SQ001]','TASKDIF2[SQ001]',
                   'JC1[ICJD1]', 'JC1[ICJD2]'
                   ]
# Rebind `df` to the reduced frame instead of mutating it with inplace=True;
# DataFrame.drop raises KeyError if any listed column is absent.
df = df.drop(columns=columns_to_drop)

In [6]:
# Numeric codes for the Likert answer labels, grouped by response scale.
# "Teils, teils" is the only label coded 3; the satisfaction and difficulty
# scales define codes 1, 2, 4 and 5 only.
agreement_codes = {
    "Trifft gar nicht zu": 1,
    "Trifft eher nicht zu": 2,
    "Teils, teils": 3,
    "Trifft teilweise zu": 4,
    "Trifft voll zu": 5,
}
satisfaction_codes = {
    "Gar nicht zufriedenstellend": 1,
    "Eher nicht zufriedenstellend": 2,
    "Eher zufriedenstellend": 4,
    "Voll zufriedenstellend": 5,
}
difficulty_codes = {
    "Extrem schwierig": 1,
    "Eher schwierig": 2,
    "Eher leicht": 4,
    "Extrem leicht": 5,
}

# Single lookup table covering all three scales
likert_mapping = {**agreement_codes, **satisfaction_codes, **difficulty_codes}

In [7]:
# Convert Likert labels to their numeric codes in every text column that contains them.
# NOTE(review): text columns are selected via the 'object' dtype; confirm this still
# matches how the installed pandas version stores strings.
unmapped_labels = {}
for column in df.select_dtypes(include='object').columns:
    answers = df[column]
    if not answers.isin(likert_mapping.keys()).any():
        continue  # no Likert label in this column -> leave it untouched
    coded = answers.map(likert_mapping)
    # Series.map turns every value without a dictionary entry into NaN. Record the
    # answers that were present before but are missing now, so typos or unexpected
    # labels do not silently vanish from the row-wise means computed afterwards.
    lost = answers[answers.notna() & coded.isna()]
    if not lost.empty:
        unmapped_labels[column] = sorted(lost.astype(str).unique())
    df[column] = coded

# The recoded data are the same as before; the warning only makes the loss visible.
if unmapped_labels:
    warnings.warn(
        "Likert recoding set unmapped answers to NaN: "
        + "; ".join(f"{name}: {labels}" for name, labels in unmapped_labels.items())
    )

In [8]:
# AI literacy: each subscale score is the row-wise mean of its item columns
# (DataFrame.mean skips missing item responses by default).
# NOTE(review): the Eth score references only Eth1 and Eth3 — confirm that
# leaving out an Eth2 item is intentional.
ai_literacy_scales = {
    'AILiteracy[Use]': ['AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]',
                        'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]'],
    'AILiteracy[Kno]': ['AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]',
                        'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]'],
    'AILiteracy[Det]': ['AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]'],
    'AILiteracy[Eth]': ['AILiteracy[Eth1]', 'AILiteracy[Eth3]'],
}
for scale_name, item_columns in ai_literacy_scales.items():
    df[scale_name] = df[item_columns].mean(axis=1)

In [9]:
# General attitudes towards AI: PGAT and NGAT scores are the row-wise means of
# their three item columns each
pgat_items = ['PGAT[PGAT1]', 'PGAT[PGAT2]', 'PGAT[PGAT3]']
ngat_items = ['NGAT[NGAT1]', 'NGAT[NGAT2]', 'NGAT[NGAT3]']
df['PGAT'] = df[pgat_items].mean(axis=1)
df['NGAT'] = df[ngat_items].mean(axis=1)

In [10]:
# Common method bias (CMV): row-wise mean of the three CMV item columns
cmv_items = ['CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]']
df['CMV'] = df[cmv_items].mean(axis='columns')

In [11]:
# Big Five personality traits (NEO): each trait score is the row-wise mean of
# its item columns.
# NOTE(review): item names ending in 'R' look like reverse-keyed items; they are
# averaged here as-is — confirm they were already recoded before this step.
neo_scales = {
    'NEO[E]': ['NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]'],
    'NEO[A]': ['NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]'],
    'NEO[C]': ['NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]'],
    'NEO[N]': ['NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]'],
    'NEO[O]': ['NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]'],
}
for trait_name, item_columns in neo_scales.items():
    df[trait_name] = df[item_columns].mean(axis=1)

In [12]:
# Job crafting: IStR and HRJD scores for both measurements (JC1, JC2), each the
# row-wise mean of its item columns.
# NOTE(review): JC1[HRJD] averages items 1-3 while JC2[HRJD] averages items
# 1, 2, 5 and 6 — confirm the differing item sets are intended.
job_crafting_scales = {
    'JC1[IStR]': ['JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]'],
    'JC1[HRJD]': ['JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]'],
    'JC2[IStR]': ['JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]'],
    'JC2[HRJD]': ['JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]'],
}
for scale_name, item_columns in job_crafting_scales.items():
    df[scale_name] = df[item_columns].mean(axis=1)

In [13]:
# Preview the first five rows, including the newly added construct scores
df.head(n=5)

Unnamed: 0,VPNCode,AILiteracy[Use1],AILiteracy[Use2],AILiteracy[Use3],AILiteracy[Use4],AILiteracy[Use5],AILiteracy[Use6],AILiteracy[Kno1],AILiteracy[Kno2],AILiteracy[Kno3],...,CMV,NEO[E],NEO[A],NEO[C],NEO[N],NEO[O],JC1[IStR],JC1[HRJD],JC2[IStR],JC2[HRJD]
0,ABO2606,3,4,3,3,3,3,4,3,3,...,4.666667,2.75,3.0,3.75,2.5,3.6,2.8,2.333333,2.8,4.0
1,APE2704,5,5,5,5,5,5,5,5,4,...,2.0,2.25,3.25,4.0,2.25,3.6,3.0,2.333333,4.6,4.0
2,SSG0102,5,4,4,4,4,2,5,2,4,...,3.666667,3.75,3.5,3.5,2.25,4.0,3.8,3.666667,5.0,2.75
3,ECH2807,4,4,4,4,4,4,5,5,4,...,4.0,3.0,3.5,3.25,2.5,4.0,3.4,3.333333,3.8,3.75
4,AST1210,5,4,4,4,4,4,4,4,2,...,4.333333,3.0,4.0,3.75,3.0,4.2,4.4,3.666667,3.2,5.0


In [14]:
# Write the prepared data to a comma-separated CSV file (UTF-8 with BOM, row index omitted)
output_csv = 'data_prep_validation1_pre-merge_survey.csv'
df.to_csv(output_csv, sep=',', encoding='utf-8-sig', index=False)