# Merging all the Data from Lab and from Prolific

v2_18.04.2024

In [1]:
import pandas as pd

In [2]:
# Read the lab and the Prolific exports into df1 and df2 respectively
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data_prep_analysis_lab.csv', 'data_prep_analysis_prolific.csv')
)

In [3]:
# Alphabetically sorted union of the column names found in either dataset
all_columns = sorted({*df1.columns, *df2.columns})

In [4]:
# Give both dataframes the full combined set of columns: any column that a
# dataframe lacks is created in place and filled with the placeholder value 0.
# NOTE(review): 0 is used as filler for every missing column regardless of its
# meaning — confirm this is intended for non-indicator columns.
for frame in (df1, df2):
    for col in all_columns:
        if col not in frame.columns:
            frame[col] = 0

In [5]:
# Put the columns of both dataframes into the same (sorted) order given by all_columns
df1, df2 = (frame.reindex(columns=all_columns) for frame in (df1, df2))

In [6]:
# Stack the rows of df1 on top of the rows of df2 and renumber the index from 0
merged_df = pd.concat((df1, df2), axis=0, ignore_index=True)

In [7]:
# Count missing entries per column and show only the columns that have at least one
missing_values = merged_df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

MANI2    1
dtype: int64


In [8]:
# Replace every remaining missing value in the merged data with 0
merged_df = merged_df.fillna(0)

In [9]:
# Show all column names of the merged dataframe as a plain Python list
column_names = list(merged_df.columns)
print(column_names)

['AGE', 'AILiteracyDet', 'AILiteracyEth', 'AILiteracyKno', 'AILiteracyUse', 'All_Prompts', 'Average_Word_Count_Prompts', 'CMV', 'Edu_Abgeschlossene_Berufsausbildung', 'Edu_Abitur_oder_Fachabitur', 'Edu_Bachelor', 'Edu_Doktortitel', 'Edu_Master', 'Edu_Mittlere_Reife_Realschulabschluss', 'First_Prompt_Word_Count', 'Gender_Männlich', 'Gender_Weiblich', 'Group_Nr', 'ID', 'JC1HRJD', 'JC1IStR', 'JC2HRJD', 'JC2IStR', 'MANI1', 'MANI2', 'NEOA', 'NEOC', 'NEOE', 'NEON', 'NEOO', 'NGAT', 'PGAT', 'PROD1', 'PROD2', 'Prompt_Count', 'Prompt_Support', 'SDT1AUT', 'SDT1COM', 'SDT2AUT', 'SDT2COM', 'SE1', 'SE2', 'TASKDIF1', 'TASKDIF2', 'Total_Word_Count_Prompt', 'WORK', 'time_demogr', 'time_pers', 'time_survey1', 'time_survey2', 'time_task1', 'time_task2', 'time_total']


In [10]:
# Rearrange the columns into a hand-picked order that is easier to read than
# the alphabetical one. Selecting with this list raises a KeyError if any of
# the listed columns is absent from merged_df.
new_column_order = [
    # ID, AGE, WORK and the Gender_* columns
    "ID", "AGE", "WORK", "Gender_Männlich", "Gender_Weiblich",
    # Edu_* columns
    "Edu_Abgeschlossene_Berufsausbildung",
    "Edu_Mittlere_Reife_Realschulabschluss",
    "Edu_Abitur_oder_Fachabitur",
    "Edu_Bachelor",
    "Edu_Master",
    "Edu_Doktortitel",
    # AILiteracy*, PGAT, NGAT, CMV
    "AILiteracyUse", "AILiteracyKno", "AILiteracyDet", "AILiteracyEth",
    "PGAT", "NGAT", "CMV",
    # NEO* columns, Group_Nr, Prompt_Support
    "NEOE", "NEOA", "NEOC", "NEON", "NEOO", "Group_Nr", "Prompt_Support",
    # columns carrying the suffix/index 1
    "JC1IStR", "JC1HRJD", "SE1", "SDT1AUT", "SDT1COM", "PROD1", "TASKDIF1", "MANI1",
    # columns carrying the suffix/index 2
    "JC2IStR", "JC2HRJD", "SE2", "SDT2AUT", "SDT2COM", "PROD2", "TASKDIF2", "MANI2",
    # prompt-related columns
    "All_Prompts", "Prompt_Count", "Average_Word_Count_Prompts",
    "First_Prompt_Word_Count", "Total_Word_Count_Prompt",
    # time_* columns
    "time_total", "time_demogr", "time_pers",
    "time_task1", "time_survey1", "time_task2", "time_survey2",
]

merged_df = merged_df.loc[:, new_column_order]

In [11]:
# Gender_* and Edu_* indicator columns that must end up as 0/1 integers
columns_to_replace = [
    'Gender_Männlich', 'Gender_Weiblich',
    'Edu_Abgeschlossene_Berufsausbildung', 'Edu_Mittlere_Reife_Realschulabschluss',
    'Edu_Abitur_oder_Fachabitur', 'Edu_Bachelor', 'Edu_Master', 'Edu_Doktortitel'
]

# Map the strings "false"/"true" to 0/1 FIRST and only then cast to int.
# Casting first (as was done before) raises a ValueError as soon as one of the
# strings is present, and otherwise leaves the replacement with nothing to do.
# Boolean and numeric values pass through the replacement unchanged and are
# converted by the cast.
for column in columns_to_replace:
    merged_df[column] = (
        merged_df[column]
        .replace({"false": 0, "true": 1})
        .astype(int)
    )

In [12]:
# After the data was cleaned and ordered, the MANI checks were proofread again.
# All of the following IDs used GPT twice (either because of a technical problem
# or because they thought they needed to use it in the browser) or had other issues.

# 'ID' values with problems that need to be dropped.
# NOTE(review): 'ZZAGE607' ends in three digits while the other entries end in
# four — confirm it matches an ID in the data; IDs that match nothing are ignored silently.
ids_to_drop = [
    'ZZKO1608',
    'ZZSSA0301',
    'ZZEKI2929',
    'ZZEES0811',
    'ZZAWA0212',
    'ZZASE1301',
    'ZZAGE607',
    'ZZHMU0112',
    'ZZOHA2303',
    'ZZAWH2506',
    'ZZAKY2105',
]

# Keep only the rows whose 'ID' value is not in ids_to_drop
has_problem = merged_df['ID'].isin(ids_to_drop)
merged_df = merged_df.loc[~has_problem]

In [16]:
# Preview the first five rows of the merged dataset
merged_df.head(n=5)

Unnamed: 0,ID,AGE,WORK,Gender_Männlich,Gender_Weiblich,Edu_Abgeschlossene_Berufsausbildung,Edu_Mittlere_Reife_Realschulabschluss,Edu_Abitur_oder_Fachabitur,Edu_Bachelor,Edu_Master,...,Average_Word_Count_Prompts,First_Prompt_Word_Count,Total_Word_Count_Prompt,time_total,time_demogr,time_pers,time_task1,time_survey1,time_task2,time_survey2
0,ABO2606,22.0,25.0,1,0,0,0,0,1,0,...,39.333333,100.0,118.0,35.038667,9.134167,1.056333,11.980833,1.290833,11.509667,0.066833
1,APE2704,20.0,15.0,1,0,0,0,1,0,0,...,108.75,128.0,435.0,23.169333,2.530833,1.257667,6.3725,1.443167,11.385,0.180167
2,SSG0102,20.0,20.0,1,0,0,0,1,0,0,...,11.571429,5.0,81.0,34.525167,3.595667,1.3015,10.433,1.658333,14.911,2.625667
3,ECH2807,21.0,30.0,1,0,0,0,1,0,0,...,55.75,57.0,223.0,46.279833,1.717167,0.610833,21.664167,0.892167,19.574333,1.821167
4,AST1210,25.0,60.0,1,0,0,0,0,0,0,...,20.5,27.0,41.0,28.102,3.470833,1.178833,12.412667,1.6345,6.4455,2.959667


In [17]:
# Write the merged dataframe to a new comma-separated CSV file without the row index.
# The 'utf-8-sig' codec prepends a byte-order mark to the file.
merged_df.to_csv(
    'data_merged_all_analysis.csv',
    sep=',',
    encoding='utf-8-sig',
    index=False,
)