# Merging all the Data from the Groups with and without Randomization

v1_12.04.2024

In [1]:
import pandas as pd

In [8]:
# Read the two prepared validation CSV files into df1 and df2
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data_prep_validation1.csv', 'data_prep_validation2.csv')
)

In [9]:
# Sorted list of every column name that occurs in at least one of the two frames
all_columns = sorted({*df1.columns, *df2.columns})

In [10]:
# Give both frames the full column set: a column that a frame lacks is created and filled with 0
for frame in (df1, df2):
    for col in all_columns:
        if col not in frame.columns:
            frame[col] = 0

In [11]:
# Put the columns of both frames into the same (sorted) order given by all_columns
df1, df2 = (frame.reindex(columns=all_columns) for frame in (df1, df2))

In [12]:
# Stack df2 below df1 and renumber the rows 0..n-1
merged_df = pd.concat(objs=(df1, df2), axis=0, ignore_index=True)

In [14]:
# Replace any missing values that remain after the alignment with 0.
# TODO (from the original author): try a clearly recognizable filler value first to see where fills occur.
merged_df.fillna(value=0, inplace=True)

In [16]:
# Preview the top rows of the merged dataset to sanity-check the concatenation and the 0-fill
merged_df.head()

Unnamed: 0,AILiteracy[Det1],AILiteracy[Det2],AILiteracy[Det3],AILiteracy[Det],AILiteracy[Eth1],AILiteracy[Eth3],AILiteracy[Eth],AILiteracy[Kno1],AILiteracy[Kno2],AILiteracy[Kno3],...,NGAT,NGAT[NGAT1],NGAT[NGAT2],NGAT[NGAT3],PGAT,PGAT[PGAT1],PGAT[PGAT2],PGAT[PGAT3],VPNCode,group_nr
0,4,4,4,4.0,3,2,2.5,4,3,3,...,2.666667,2,4,2,4.0,3,5,4,ABO2606,1
1,3,3,4,3.333333,4,3,3.5,5,5,4,...,3.333333,4,2,4,4.0,3,4,5,APE2704,3
2,4,2,4,3.333333,2,2,2.0,5,2,4,...,2.333333,3,1,3,3.333333,4,4,2,SSG0102,3
3,5,5,4,4.666667,2,4,3.0,5,5,4,...,3.333333,3,2,5,3.333333,2,4,4,ECH2807,4
4,4,5,3,4.0,2,4,3.0,4,4,2,...,3.333333,2,4,4,3.0,1,4,4,AST1210,3
5,2,2,4,2.666667,3,1,2.0,3,1,4,...,2.0,2,1,3,5.0,5,5,5,IHE1103,2
6,4,3,2,3.0,4,4,4.0,2,4,5,...,3.333333,3,4,3,2.333333,1,2,4,OME0709,4
7,4,4,4,4.0,3,2,2.5,4,4,3,...,2.0,2,2,2,4.333333,4,4,5,NBA0408,2
8,4,3,4,3.666667,3,4,3.5,3,4,3,...,3.666667,4,4,3,4.333333,4,4,5,AOD2103,1
9,4,3,5,4.0,3,4,3.5,3,4,4,...,3.0,3,4,2,2.0,1,2,3,IME0910,4


In [17]:
# Address randomization: for rows whose 'group_nr' is 5, 6, 7 or 8, exchange the values of the
# paired columns below (for groups 1 to 4 the order is already correct).
# NOTE: this cell mutates merged_df in place and is not idempotent - running it a second time
# swaps the values back. Re-run the notebook from the top instead of re-running this cell alone.

# Define the columns to swap in pairs
swap_pairs = [
    ('JC1[IStR1]', 'JC2[2IStR1]'),
    ('JC1[IStR2]', 'JC2[2IStR2]'),
    ('JC1[IStR3]', 'JC2[2IStR3]'),
    ('JC1[IStR4]', 'JC2[2IStR4]'),
    ('JC1[IStR5]', 'JC2[2IStR5]'),
    ('JC1[HRJD1]', 'JC2[2HRJD1]'),
    ('JC1[HRJD2]', 'JC2[2HRJD2]'),
    # NOTE(review): HRJD3 is paired with 2HRJD5 (not 2HRJD3), unlike the other pairs -
    # confirm that this matches the actual column names of the survey export.
    ('JC1[HRJD3]', 'JC2[2HRJD5]'),
    ('JC1[HRJD]', 'JC2[HRJD]'),
    ('JC1[IStR]', 'JC2[IStR]'),
]

# Fail before touching any data if a swap column is absent, so that a wrong column name
# cannot leave merged_df with only some of the pairs swapped.
missing_swap_cols = sorted({col for pair in swap_pairs for col in pair} - set(merged_df.columns))
if missing_swap_cols:
    raise KeyError(f"Swap columns not found in merged_df: {missing_swap_cols}")

# The affected rows are the same for every pair, so the row mask is computed once
mask = merged_df['group_nr'].isin([5, 6, 7, 8])

# Swap the values of each pair for the masked rows
for col1, col2 in swap_pairs:
    # .values drops the column labels; assigning the DataFrame itself would be re-aligned
    # by column name and the values would not actually be exchanged.
    merged_df.loc[mask, [col1, col2]] = merged_df.loc[mask, [col2, col1]].values

In [18]:
# Display the dataset after the column swap for a visual check (group_nr is still present here)
merged_df

Unnamed: 0,AILiteracy[Det1],AILiteracy[Det2],AILiteracy[Det3],AILiteracy[Det],AILiteracy[Eth1],AILiteracy[Eth3],AILiteracy[Eth],AILiteracy[Kno1],AILiteracy[Kno2],AILiteracy[Kno3],...,NGAT,NGAT[NGAT1],NGAT[NGAT2],NGAT[NGAT3],PGAT,PGAT[PGAT1],PGAT[PGAT2],PGAT[PGAT3],VPNCode,group_nr
0,4,4,4,4.0,3,2,2.5,4,3,3,...,2.666667,2,4,2,4.0,3,5,4,ABO2606,1
1,3,3,4,3.333333,4,3,3.5,5,5,4,...,3.333333,4,2,4,4.0,3,4,5,APE2704,3
2,4,2,4,3.333333,2,2,2.0,5,2,4,...,2.333333,3,1,3,3.333333,4,4,2,SSG0102,3
3,5,5,4,4.666667,2,4,3.0,5,5,4,...,3.333333,3,2,5,3.333333,2,4,4,ECH2807,4
4,4,5,3,4.0,2,4,3.0,4,4,2,...,3.333333,2,4,4,3.0,1,4,4,AST1210,3
5,2,2,4,2.666667,3,1,2.0,3,1,4,...,2.0,2,1,3,5.0,5,5,5,IHE1103,2
6,4,3,2,3.0,4,4,4.0,2,4,5,...,3.333333,3,4,3,2.333333,1,2,4,OME0709,4
7,4,4,4,4.0,3,2,2.5,4,4,3,...,2.0,2,2,2,4.333333,4,4,5,NBA0408,2
8,4,3,4,3.666667,3,4,3.5,3,4,3,...,3.666667,4,4,3,4.333333,4,4,5,AOD2103,1
9,4,3,5,4.0,3,4,3.5,3,4,4,...,3.0,3,4,2,2.0,1,2,3,IME0910,4


In [19]:
# group_nr was only needed for the swap above, so remove the column from merged_df
merged_df.drop('group_nr', axis='columns', inplace=True)

In [20]:
# Strip every '[' and ']' character from the column names
bracket_table = str.maketrans('', '', '[]')
merged_df.columns = [name.translate(bracket_table) for name in merged_df.columns]

In [21]:
# Write the merged frame to a new comma-separated CSV file (UTF-8 with BOM, without the row index)
merged_df.to_csv(
    'data_merged_all_validation.csv',
    sep=',',
    encoding='utf-8-sig',
    index=False,
)