# Merging all the Data from Lab and from Prolific

v2_18.04.2024

In [1]:
import pandas as pd

In [2]:
# Read both prepared validation datasets: df1 = lab sample, df2 = Prolific sample
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('data_prep_validation_lab.csv', 'data_prep_validation_prolific.csv')
)

In [3]:
# Collect every column name that occurs in at least one of the two datasets,
# de-duplicated and in sorted order
all_columns = sorted({*df1.columns, *df2.columns})

In [4]:
# Give both dataframes the same set of columns: any column that exists in only
# one dataset is created in the other one, filled with 0 as a placeholder
for frame in (df1, df2):
    absent_columns = [name for name in all_columns if name not in frame.columns]
    for name in absent_columns:
        frame[name] = 0

In [5]:
# Bring the columns of both dataframes into the same (sorted) order so they line up
df1, df2 = (frame.reindex(columns=all_columns) for frame in (df1, df2))

In [6]:
# Stack the lab rows and the Prolific rows on top of each other and
# renumber the row index from 0
merged_df = pd.concat(
    (df1, df2),
    axis=0,
    ignore_index=True,
)

In [7]:
# Replace every missing value (NaN) that is still left after the alignment with 0
# TODO: maybe test first with a clearly recognizable placeholder value
merged_df = merged_df.fillna(0)

In [8]:
# After the data was cleaned and ordered, the MANI checks were proofread again.
# All of the following IDs used GPT twice (either because of a technical problem
# or because they thought they needed to use it in the browser) or had other issues.

# 'VPNCode' values with problems that need to be dropped
ids_to_drop = [
    'ZZKO1608', 'ZZSSA0301', 'ZZEKI2929', 'ZZEES0811', 'ZZAWA0212', 'ZZASE1301', 'ZZAGE607', 'ZZHMU0112', 'ZZOHA2303', 'ZZAWH2506', 'ZZAKY2105'
]

# isin() silently ignores IDs that match no row, so a typo in the list above would
# leave the affected participant in the data without any error. Report such IDs.
# NOTE(review): 'ZZAGE607' has fewer digits than the other codes - confirm it is spelled correctly.
# (When this cell is re-run on already filtered data, all IDs are reported as not found.)
ids_not_found = sorted(set(ids_to_drop) - set(merged_df['VPNCode']))
if ids_not_found:
    print(f"Warning: {len(ids_not_found)} ID(s) from ids_to_drop not found in 'VPNCode': {ids_not_found}")

# Keep only the rows whose 'VPNCode' is NOT in the ids_to_drop list
merged_df = merged_df[~merged_df['VPNCode'].isin(ids_to_drop)]

In [9]:
# Preview the first five rows of the merged dataset
merged_df.head(5)

Unnamed: 0,AILiteracyDet,AILiteracyDet1,AILiteracyDet2,AILiteracyDet3,AILiteracyEth,AILiteracyEth1,AILiteracyEth3,AILiteracyKno,AILiteracyKno1,AILiteracyKno2,...,NEOV4R,NGAT,NGATNGAT1,NGATNGAT2,NGATNGAT3,PGAT,PGATPGAT1,PGATPGAT2,PGATPGAT3,VPNCode
0,4.0,4,4,4,2.5,3,2,3.333333,4,3,...,3,2.666667,2,4,2,4.0,3,5,4,ABO2606
1,3.333333,3,3,4,3.5,4,3,4.833333,5,5,...,4,3.333333,4,2,4,4.0,3,4,5,APE2704
2,3.333333,4,2,4,2.0,2,2,4.333333,5,2,...,4,2.333333,3,1,3,3.333333,4,4,2,SSG0102
3,4.666667,5,5,4,3.0,2,4,4.5,5,5,...,4,3.333333,3,2,5,3.333333,2,4,4,ECH2807
4,4.0,4,5,3,3.0,2,4,3.833333,4,4,...,5,3.333333,2,4,4,3.0,1,4,4,AST1210


In [10]:
# Write the merged dataframe to a new comma-separated CSV file
# (UTF-8 with BOM, row index not included)
merged_df.to_csv(
    'data_merged_all_validation.csv',
    sep=',',
    encoding='utf-8-sig',
    index=False,
)