# Pre-Processing of Data - Group without Randomization

The column names were changed in Excel (English teranslation). 
Not everything is the same as for the group with randomization:
- MANI1 is not measured
- The column JC1[ICJD2] is additonally measured

v1_23.03.2024

Cleaning and Coding

In [38]:
import pandas as pd

In [39]:
# Load dataset
df = pd.read_excel('20240308_results-JCsurvey_diff-headers-excel.xlsx')

In [40]:
# Look at the data
df.head()

Unnamed: 0,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,VPNCode,Gender,AGE,...,TASKDIF1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,2,2024-02-28 10:43:00,6,de,489428441,2024-02-28 10:08:00,2024-02-28 10:43:00,ABO2606,Männlich,22,...,,4.01,,690.58,,,,,,
1,3,2024-02-28 13:05:00,6,de,193910323,2024-02-28 12:42:00,2024-02-28 13:05:00,APE2704,Männlich,20,...,,10.81,,683.1,,,,,,
2,4,2024-02-28 14:12:00,6,de,29352144,2024-02-28 13:38:00,2024-02-28 14:12:00,SSG0102,Männlich,20,...,,894.66,,157.54,,,,,,
3,5,2024-02-28 15:32:00,6,de,1443700134,2024-02-28 14:46:00,2024-02-28 15:32:00,ECH2807,Männlich,21,...,,1174.46,,109.27,,,,,,
4,6,2024-02-29 09:15:00,6,de,2009257017,2024-02-29 08:47:00,2024-02-29 09:15:00,AST1210,Männlich,25,...,,386.73,,177.58,,,,,,


In [41]:
# Making "VPNCode" the first column
cols = ['VPNCode'] + [col for col in df if col != 'VPNCode']
df = df[cols]
df.head()

Unnamed: 0,VPNCode,id,submitdate,lastpage,startlanguage,seed,startdate,datestamp,Gender,AGE,...,TASKDIF1Time,groupTime535,E2Time,groupTime536,JC2Time,SE2Time,SDT2Time,PROD2Time,TASKDIF2Time,MANI2Time
0,ABO2606,2,2024-02-28 10:43:00,6,de,489428441,2024-02-28 10:08:00,2024-02-28 10:43:00,Männlich,22,...,,4.01,,690.58,,,,,,
1,APE2704,3,2024-02-28 13:05:00,6,de,193910323,2024-02-28 12:42:00,2024-02-28 13:05:00,Männlich,20,...,,10.81,,683.1,,,,,,
2,SSG0102,4,2024-02-28 14:12:00,6,de,29352144,2024-02-28 13:38:00,2024-02-28 14:12:00,Männlich,20,...,,894.66,,157.54,,,,,,
3,ECH2807,5,2024-02-28 15:32:00,6,de,1443700134,2024-02-28 14:46:00,2024-02-28 15:32:00,Männlich,21,...,,1174.46,,109.27,,,,,,
4,AST1210,6,2024-02-29 09:15:00,6,de,2009257017,2024-02-29 08:47:00,2024-02-29 09:15:00,Männlich,25,...,,386.73,,177.58,,,,,,


In [42]:
# Drop unuseful columns (there is a difference between first and second round)
columns_to_drop = ['id', 'submitdate', 'lastpage', 'startlanguage', 'seed', 'LinktoTool', 'E2', 
                   'VPNCodeTime', 'GenderTime', 'AGETime', 'EduTime', 'WORKTime', 'AILiteracyTime', 
                   'PGATTime', 'NGATTime', 'CMVTime', 'NEOTime', 'ERKTime', 'LinktoToolTime', 
                   'TESTTime', 'JC1Time', 'SE1Time', 'SDT1Time', 'PROD1Time', 'TASKDIF1Time', 
                   'E2Time', 'JC2Time', 'SE2Time', 'SDT2Time', 'PROD2Time', 
                   'TASKDIF2Time', 'MANI2Time']
df.drop(columns=columns_to_drop, inplace=True)

In [43]:
# Convert Likert scale responses to numerical codes
likert_mapping = {
    "Trifft gar nicht zu": 1,
    "Trifft eher nicht zu": 2,
    "Teils, teils": 3,
    "Trifft teilweise zu": 4,
    "Trifft voll zu": 5,
    "Gar nicht zufriedenstellend": 1,
    "Eher nicht zufriedenstellend": 2,
    "Eher zufriedenstellend": 4,
    "Voll zufriedenstellend": 5,
    "Extrem schwierig": 1,
    "Eher schwierig": 2,
    "Eher leicht": 4,
    "Extrem leicht": 5
}

In [44]:
# Apply mapping to all columns where this conversion is needed.
for column in df.select_dtypes(include='object').columns:  # Assuming only object-type columns need conversion
    if df[column].isin(likert_mapping.keys()).any():
        df[column] = df[column].map(likert_mapping)

In [45]:
# Create a column for total time in minutes
df['datestamp'] = pd.to_datetime(df['datestamp'])
df['startdate'] = pd.to_datetime(df['startdate'])
df['total_time'] = (df['datestamp'] - df['startdate']).dt.total_seconds() / 60

In [46]:
# Drop the columns afterwards
df.drop(columns=['datestamp', 'startdate'], inplace=True)

In [47]:
# Dividing  time-related columns by 60 to get minutes
time_columns = ['interviewtime', 'groupTime531', 'groupTime532', 'groupTime534', 'groupTime533', 'groupTime535', 'groupTime536']
df[time_columns] = df[time_columns].div(60)


In [48]:
# Replace spaces in 'Edu' with underscores
df['Gender'] = df['Gender'].str.replace(' ', '_')

# One-hot encode 'Gender' column -> Dummy for each Gender
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, gender_dummies], axis=1)

# Drop the original 'Gender' column as it is no longer needed
df.drop('Gender', axis=1, inplace=True)

In [49]:
# Replace spaces in 'Edu' with underscores
df['Edu'] = df['Edu'].str.replace(' ', '_')

# One-hot encode 'Edu' column -> Dummy for each Education
edu_dummies = pd.get_dummies(df['Edu'], prefix='Edu')
df = pd.concat([df, edu_dummies], axis=1)

# Drop the original 'Edu' column as it is no longer needed
df.drop('Edu', axis=1, inplace=True)

In [50]:
# Test for non-convertible values in WORK column and print affected VPNCode 
# Error because of wrong input in WORK column (someone did not put a numerical value)
# Error because of wrong input in WORK column (someone did not put an integer as value)
def check_convertible_to_float(x):
    try:
        float(x)  # Try to convert to float
        return True
    except ValueError:
        return False

# Apply the function to the WORK column to find non-convertible values
non_convertible_rows = df[~df['WORK'].replace('%', '', regex=True).apply(check_convertible_to_float)]

# Print the VPNCode of rows with non-convertible WORK values
print(non_convertible_rows['VPNCode'])


10    VBA1706
Name: VPNCode, dtype: object


In [51]:
# Locate the row with VPNCode 'VBA1706' and update the WORK column value to 15 (Reason: in between 10 and 20 as before it was 10/20)
df.loc[df['VPNCode'] == 'VBA1706', 'WORK'] = 15

# Check the update by displaying the row to ensure the change has been made
print(df[df['VPNCode'] == 'VBA1706'][['VPNCode', 'WORK']])


    VPNCode WORK
10  VBA1706   15


In [52]:
# To identify and print non-convertible values again after manual correction
non_convertible_after_correction = df[df['WORK'].isnull()]
print("Rows with non-convertible WORK values after correction:", non_convertible_after_correction['VPNCode'].tolist())

Rows with non-convertible WORK values after correction: []


In [53]:
# Convert decimal representations to whole numbers for percentages
df['WORK'] = df['WORK'].apply(lambda x: x*100 if x <= 1 else x)

# Verify the changes
print(df['WORK'])

0      25.0
1      15.0
2      20.0
3      30.0
4      60.0
5      20.0
6      50.0
7      50.0
8      60.0
9     100.0
10     15.0
11     10.0
12    100.0
Name: WORK, dtype: float64


In [54]:
# Check if ERK and TEST columns have "Ja" for all rows (not sure if they could continue the survey but check for safety)
# If all pass the two columns are dropped otherwise the VPNCodes are stated
if (df['ERK'] == "Ja").all() and (df['TEST'] == "Ja").all():
    print("Participants passed all tests")
    df.drop(columns=['ERK', 'TEST'], inplace=True)
else:
    failed_tests = df[(df['ERK'] != "Ja") | (df['TEST'] != "Ja")]['VPNCode']
    print(f"Not all participants passed the tests. Affected VPNCode rows: {failed_tests.tolist()}")

Participants passed all tests


In [55]:
# Check if all numbers in the WORK column are > 0
if (df['WORK'] <= 0).any():
    affected_vpn = df[df['WORK'] <= 0]['VPNCode']
    print(f"WORK column contains values <= 0. Affected VPNCode rows: {affected_vpn.tolist()}")
else:
    print("All WORK column numbers are > 0.")


All WORK column numbers are > 0.


In [56]:
# Manually drop the affected rows, if necessary
# df = df[df['VPNCode'] != "state all the VPNCodes"]
# df.reset_index(drop=True, inplace=True)

In [57]:
# Check if JC2[AC] column has 4 as an answer for all rows
# If this is the case the column is dropped otherwise the VPNCodes are stated
if (df['JC2[AC]'] == 4).all():
    print("Participants passed all AC tests")
    df.drop(columns=['JC2[AC]'], inplace=True)
else:
    failed_ac_test = df[df['JC2[AC]'] != 4]['VPNCode']
    print(f"Not all participants passed the AC test. Affected VPNCode rows: {failed_ac_test.tolist()}")
    

Participants passed all AC tests


In [58]:
# Manually drop the affected rows, if necessary
# df = df[df['VPNCode'] != "state all the VPNCodes"]
# df.reset_index(drop=True, inplace=True)
    
# After that, run again the cell above to drop the column 'JC2[AC]' as it is no longer needed

In [59]:
# Identifying rows with missing values and the specific columns affected (Problem: For JC1 it is possible to select "Keine Antwort". This is not possible for JC2)
# If no VPNCodes are listed, everything is fine otherwise data is missing
missing_info = df[df.isnull().any(axis=1)]
for index, row in missing_info.iterrows():
    missing_columns = row[row.isnull()].index.tolist()
    print(f"VPNCode: {row['VPNCode']}, Missing in columns: {missing_columns}")

In [60]:
# Manually drop the affected rows, if necessary (or handle the missing data in another way)
# df = df[df['VPNCode'] != "state all the VPNCodes"]
# df.reset_index(drop=True, inplace=True)

In [61]:
# Display the cleaned dataset
print(df.head())

   VPNCode  AGE  WORK  AILiteracy[Use1]  AILiteracy[Use2]  AILiteracy[Use3]  \
0  ABO2606   22  25.0                 3                 4                 3   
1  APE2704   20  15.0                 5                 5                 5   
2  SSG0102   20  20.0                 5                 4                 4   
3  ECH2807   21  30.0                 4                 4                 4   
4  AST1210   25  60.0                 5                 4                 4   

   AILiteracy[Use4]  AILiteracy[Use5]  AILiteracy[Use6]  AILiteracy[Kno1]  \
0                 3                 3                 3                 4   
1                 5                 5                 5                 5   
2                 4                 4                 2                 5   
3                 4                 4                 4                 5   
4                 4                 4                 4                 4   

   ...  groupTime534  groupTime533  groupTime535  groupTime536

In [62]:
# Save DF in a new CSV-File 
# df.to_csv('data_prep_cleaned.csv', index=False, encoding='utf-8-sig', sep=',')

Building Constructs

In [None]:
# Load dataset
# df = pd.read_csv('ddata_prep_cleaned.csv')

In [63]:
# AI Literacy Constructs
df['AILiteracy[Use]'] = df[['AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]', 'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]']].mean(axis=1)
df['AILiteracy[Kno]'] = df[['AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]', 'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]']].mean(axis=1)
df['AILiteracy[Det]'] = df[['AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]']].mean(axis=1)
df['AILiteracy[Eth]'] = df[['AILiteracy[Eth1]', 'AILiteracy[Eth3]']].mean(axis=1)

# Maybe create only one AILiteracy construct?

In [64]:
# General Attitudes towards AI Constructs
df['PGAT'] = df[['PGAT[PGAT1]', 'PGAT[PGAT2]', 'PGAT[PGAT3]']].mean(axis=1)
df['NGAT'] = df[['NGAT[NGAT1]', 'NGAT[NGAT2]', 'NGAT[NGAT3]']].mean(axis=1)

# Maybe create only one Attitudes towards AI construct?

In [65]:
# Common Method Bias Construct
df['CMV'] = df[['CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]']].mean(axis=1)

In [66]:
# Big Five Personality Traits (NEO) Constructs
df['NEO[E]'] = df[['NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]']].mean(axis=1)
df['NEO[A]'] = df[['NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]']].mean(axis=1)
df['NEO[C]'] = df[['NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]']].mean(axis=1)
df['NEO[N]'] = df[['NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]']].mean(axis=1)
df['NEO[O]'] = df[['NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]']].mean(axis=1)

In [67]:
# Job Crafting Constructs -> Changes necessary in comparison to second round
df['JC1[IStR]'] = df[['JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]']].mean(axis=1)
df['JC1[HRJD]'] = df[['JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]']].mean(axis=1)
df['JC1[ICJD]'] = df[['JC1[ICJD1]', 'JC1[ICJD2]']].mean(axis=1) #completely new in comparison to second round with randomization

df['JC2[IStR]'] = df[['JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]']].mean(axis=1)
df['JC2[HRJD]'] = df[['JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]']].mean(axis=1) #added JC2[2HRJD6] in comparison to second round with randomization

In [68]:
# Drop the original columns to clean up the DataFrame and reduce complexity
columns_to_drop = [
    'AILiteracy[Use1]', 'AILiteracy[Use2]', 'AILiteracy[Use3]', 'AILiteracy[Use4]', 'AILiteracy[Use5]', 'AILiteracy[Use6]',
    'AILiteracy[Kno1]', 'AILiteracy[Kno2]', 'AILiteracy[Kno3]', 'AILiteracy[Kno4]', 'AILiteracy[Kno5]', 'AILiteracy[Kno6]',
    'AILiteracy[Det1]', 'AILiteracy[Det2]', 'AILiteracy[Det3]', 
    'AILiteracy[Eth1]', 'AILiteracy[Eth3]',
    'PGAT[PGAT1]', 'PGAT[PGAT2]', 'PGAT[PGAT3]',
    'NGAT[NGAT1]', 'NGAT[NGAT2]', 'NGAT[NGAT3]',
    'CMV[SQ001]', 'CMV[SQ002]', 'CMV[SQ003]',
    'NEO[E1R]', 'NEO[E2]', 'NEO[E3R]', 'NEO[E4]',
    'NEO[V1R]', 'NEO[V2]', 'NEO[V3R]', 'NEO[V4R]',
    'NEO[G1]', 'NEO[G2R]', 'NEO[G3]', 'NEO[G4]',
    'NEO[N1]', 'NEO[N2R]', 'NEO[N3]', 'NEO[N4]',
    'NEO[O1]', 'NEO[O2]', 'NEO[O3]', 'NEO[O4]', 'NEO[O5R]',
    'JC1[IStR1]', 'JC1[IStR2]', 'JC1[IStR3]', 'JC1[IStR4]', 'JC1[IStR5]',
    'JC1[HRJD1]', 'JC1[HRJD2]', 'JC1[HRJD3]',
    'JC1[ICJD1]', 'JC1[ICJD2]', #added both
    'JC2[2IStR1]', 'JC2[2IStR2]', 'JC2[2IStR3]', 'JC2[2IStR4]', 'JC2[2IStR5]',
    'JC2[2HRJD1]', 'JC2[2HRJD2]', 'JC2[2HRJD5]', 'JC2[2HRJD6]' #added JC2[2HRJD6] 
]

df.drop(columns=columns_to_drop, inplace=True)

In [69]:
# Renaming rest of the columns
df.rename(columns={
    'SDT1[SDT1]': 'SDT1[AUT]',
    'SDT1[SDT2]': 'SDT1[COM]',
    'SDT2[2SDT1]': 'SDT2[AUT]',
    'SDT2[2SDT2]': 'SDT2[COM]'
}, inplace=True)

In [70]:
# Show all the columns in the dataset as list -> Quick check
pd.set_option('display.max_columns', None)
print(df.columns.tolist())

['VPNCode', 'AGE', 'WORK', 'SE1', 'SDT1[AUT]', 'SDT1[COM]', 'PROD1[SQ001]', 'TASKDIF1[SQ001]', 'SE2', 'SDT2[AUT]', 'SDT2[COM]', 'PROD2[SQ001]', 'TASKDIF2[SQ001]', 'MANI2', 'interviewtime', 'groupTime531', 'groupTime532', 'groupTime534', 'groupTime533', 'groupTime535', 'groupTime536', 'total_time', 'Gender_Männlich', 'Gender_Weiblich', 'Edu_Abgeschlossene_Berufsausbildung', 'Edu_Abitur_oder_Fachabitur', 'Edu_Bachelor', 'AILiteracy[Use]', 'AILiteracy[Kno]', 'AILiteracy[Det]', 'AILiteracy[Eth]', 'PGAT', 'NGAT', 'CMV', 'NEO[E]', 'NEO[A]', 'NEO[C]', 'NEO[N]', 'NEO[O]', 'JC1[IStR]', 'JC1[HRJD]', 'JC1[ICJD]', 'JC2[IStR]', 'JC2[HRJD]']


In [71]:
# Specifying the desired order of columns
# If any are missing or named differently, adjust the list accordingly.
new_column_order = [
    'VPNCode', 'AGE', 'WORK', 
    'Gender_Männlich', 'Gender_Weiblich', 'Edu_Abitur_oder_Fachabitur', 'Edu_Bachelor',
    'AILiteracy[Use]', 'AILiteracy[Kno]', 'AILiteracy[Det]', 'AILiteracy[Eth]',
    'PGAT', 'NGAT', 'CMV', 'NEO[E]', 'NEO[A]', 'NEO[C]', 'NEO[N]', 'NEO[O]',
    'JC1[IStR]', 'JC1[HRJD]', 'JC1[ICJD]', 'SE1', 'SDT1[AUT]', 'SDT1[COM]', 'PROD1[SQ001]', 'TASKDIF1[SQ001]',
    'JC2[IStR]', 'JC2[HRJD]', 'SE2', 'SDT2[AUT]', 'SDT2[COM]', 'PROD2[SQ001]', 'TASKDIF2[SQ001]', 'MANI2',
    'total_time', 'interviewtime', 'groupTime531', 'groupTime532', 'groupTime534', 'groupTime533', 'groupTime535', 'groupTime536'
]

df = df[new_column_order]

In [72]:
# Renaming time columns (taken from the German translation)
df.rename(columns={
    'interviewtime': 'syst_total_time',
    'groupTime531': 'time_demogr',
    'groupTime532': 'time_pers',
    'groupTime534': 'time_task1',
    'groupTime533': 'time_survey1',
    'groupTime535': 'time_task2',
    'groupTime536': 'time_survey1'
}, inplace=True)

In [73]:
# Display the dataset
print(df.head())

   VPNCode  AGE  WORK  Gender_Männlich  Gender_Weiblich  \
0  ABO2606   22  25.0                1                0   
1  APE2704   20  15.0                1                0   
2  SSG0102   20  20.0                1                0   
3  ECH2807   21  30.0                1                0   
4  AST1210   25  60.0                1                0   

   Edu_Abitur_oder_Fachabitur  Edu_Bachelor  AILiteracy[Use]  AILiteracy[Kno]  \
0                           0             1         3.166667         3.333333   
1                           1             0         5.000000         4.833333   
2                           1             0         3.833333         4.333333   
3                           1             0         4.000000         4.500000   
4                           0             0         4.166667         3.833333   

   AILiteracy[Det]  AILiteracy[Eth]      PGAT      NGAT       CMV  NEO[E]  \
0         4.000000              2.5  4.000000  2.666667  4.666667    2.75   
1   

In [74]:
# Save DF in a new CSV-File 
df.to_csv('data_prep_constructs.csv', index=False, encoding='utf-8-sig', sep=',')