# Data cleaning
This script cleans the data to prepare it for the quantitative and qualitative analyses.

In [306]:
import pandas as pd
import os
import yaml

In [307]:
# Shared constants
# Attitude constructs: a column whose first word is one of these is grouped under it.
attitudes = [
    "Interest",
    "Value",
    "Self-efficacy",
    "Responsibility",
]
# Names of the one-hot columns derived from the "Major" question.
majors = [
    "CS/SWE major",
    "CS/SWE minor",
    "Other",
]

The data is stored in the `data/` directory, located inside the current working directory.

In [308]:
# Input file names: the two survey exports and the YAML column-name mapping.
ee_file, cg_file, col_file = "F2025_ee.csv", "F2025_cg.csv", "F2025_cols.yaml"

# All files live in the "data" folder of the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')

# Full paths of the two survey exports.
ee_data_path, cg_data_path = (
    os.path.join(data_dir, file_name) for file_name in (ee_file, cg_file)
)

In [309]:
# Read both survey exports into DataFrames (ee_data first, then cg_data).
ee_data, cg_data = (
    pd.read_csv(csv_path) for csv_path in (ee_data_path, cg_data_path)
)

### Remove and reformat columns
Replace the non-breaking space characters (U+00A0) found in the column names with regular spaces.

In [310]:
# Swap every non-breaking space (U+00A0) in the column labels of both
# surveys for a regular space (literal replacement, not a regex).
for survey in (ee_data, cg_data):
    survey.columns = survey.columns.str.replace("\u00A0", " ", regex=False)

Remove the following columns, which are not needed because the survey is anonymous:
- Name
- Email

In [311]:
# Drop the "Email" and "Name" columns from both surveys, in place.
identifying_cols = ["Email", "Name"]
for survey in (ee_data, cg_data):
    survey.drop(columns=identifying_cols, inplace=True)

Give shorter descriptive names to columns. The new column names are stored in the YAML file, which can be modified if the survey questions change, for reusability purposes. Opening this file creates a dictionary mapping the old column names to the new and shortened ones. 

In [312]:
# Load the {old column name: new column name} mapping from the YAML file.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters in
# the original survey question text so that the keys no longer match.
with open(os.path.join(data_dir, col_file), 'r', encoding="utf-8") as yaml_file:
    # safe_load returns None for an empty file; fall back to an empty mapping
    # so the renames below become a no-op instead of raising.
    new_cols = yaml.safe_load(yaml_file) or {}

# Apply the same mapping to both surveys; labels absent from a frame are ignored.
ee_data.rename(columns=new_cols, inplace=True)
cg_data.rename(columns=new_cols, inplace=True)

### Merge the year columns into one single column

In [313]:
# For each survey: fill the gaps of "Year undergraduate" with the value of
# "Year graduate", drop the graduate column, and call the merged column "Year".
for survey in (ee_data, cg_data):
    merged_year = survey["Year undergraduate"].fillna(survey["Year graduate"])
    survey["Year undergraduate"] = merged_year
    survey.drop(columns=["Year graduate"], inplace=True)
    survey.rename(columns={"Year undergraduate": "Year"}, inplace=True)

### Format multiselect questions
Multiselect questions are one-hot encoded, since someone can select multiple answers.

#### Major

In [314]:
# One-hot encode the ";"-separated "Major" answers of each survey
# (one indicator column per distinct answer).
ohe_major_ee, ohe_major_cg = (
    survey["Major"].str.get_dummies(sep=";") for survey in (ee_data, cg_data)
)

In [315]:
# Collapse the one-hot encoded majors into three indicator columns per survey:
# "CS/SWE major", "CS/SWE minor" and "Other" (any answer that is neither).
cs_major_label = "Computer Science or Software Engineering major"
cs_minor_label = "Computer Science or Software Engineering minor"

for survey, ohe_major in ((ee_data, ohe_major_ee), (cg_data, ohe_major_cg)):
    # get_dummies only creates a column for answers that actually occur, so a
    # survey in which nobody picked e.g. the minor option has no such column.
    # reindex adds the missing column filled with 0 instead of raising KeyError.
    cs_flags = ohe_major.reindex(columns=[cs_major_label, cs_minor_label], fill_value=0)
    survey["CS/SWE major"] = cs_flags[cs_major_label]
    survey["CS/SWE minor"] = cs_flags[cs_minor_label]

    # Keep "Other" a 0/1 indicator like the two columns above: a respondent
    # who listed several other majors is flagged once rather than counted.
    other_majors = ohe_major.drop(columns=[cs_major_label, cs_minor_label], errors="ignore")
    survey["Other"] = other_majors.any(axis=1).astype(int)

    survey.drop(columns=["Major"], inplace=True) # remove this line if we want to know exactly what the other majors are

#### EE course

In [316]:
ohe_eecourse_ee = ee_data["EE course"].str.get_dummies(sep=";")
ohe_eecourse_cg = cg_data["EE course"].str.get_dummies(sep=";")

# List of all EE course answers, taken from BOTH surveys (order of first
# appearance, no repeats) so that an answer given only in the control group
# is shortened as well.
eecourse_cols = list(dict.fromkeys([*ohe_eecourse_ee.columns, *ohe_eecourse_cg.columns]))
eecourse_cols_new = {}

# shorten the course column names in a general way:
# keep the first word when it contains "COMP", otherwise label the answer "None"
for col in eecourse_cols:
    course = col.split()[0]
    eecourse_cols_new[col] = course if "COMP" in course else "None"


def _rename_and_collapse(ohe):
    """Rename the one-hot columns of *ohe* and merge columns that end up with the same name.

    Several answers can map to the same short name (e.g. more than one
    non-"COMP" answer becomes "None"). Duplicate column labels would make
    ``ohe[name]`` return a DataFrame instead of a Series, so columns sharing a
    name are combined with a row-wise maximum (1 if any of them is 1).
    Column order follows the first appearance of each name.
    """
    renamed = ohe.rename(columns=eecourse_cols_new)
    return pd.DataFrame(
        {
            name: renamed.loc[:, renamed.columns == name].max(axis=1)
            for name in dict.fromkeys(renamed.columns)
        },
        index=renamed.index,
    )


ohe_eecourse_ee = _rename_and_collapse(ohe_eecourse_ee)
ohe_eecourse_cg = _rename_and_collapse(ohe_eecourse_cg)

In [317]:
# Copy every one-hot EE course column into its survey, then drop the
# original multiselect "EE course" column.
for survey, course_flags in ((ee_data, ohe_eecourse_ee), (cg_data, ohe_eecourse_cg)):
    for course in course_flags.columns:
        survey[course] = course_flags[course]
    survey.drop(columns=["EE course"], inplace=True)

#### CG course

In [318]:
# One-hot encode the ";"-separated "CG course" answers of the control group.
ohe_cgcourse = cg_data["CG course"].str.get_dummies(sep=";")

# Attach each indicator column to cg_data, then drop the original column.
for course, flags in ohe_cgcourse.items():
    cg_data[course] = flags
cg_data.drop(columns=["CG course"], inplace=True)

### Reorganize the data columns
- Columns are organized into a two-level hierarchy (group, sub-column), which makes the data easier to access and the analysis easier to generalize.

In [319]:
def _ee_column_key(col):
    """Return the (group, sub-column) tuple for the ee_data column *col*.

    The rules are tried in order and the first match wins; a column that
    matches none of them becomes its own group with an empty sub-column.
    """
    # ee course
    if col in ohe_eecourse_ee.columns:
        return ("EE course", col)

    # attitudes: the group is the first word of the column name
    first_word = col.split()[0]
    if first_word in attitudes:
        return (first_word, col)

    # EE feedback, then self assessment
    for keyword in ("Lecture feedback", "Self-assessment"):
        if keyword in col:
            return (keyword, col)

    # major
    if col in majors:
        return ("Field", col)

    # career, then reasoning
    for keyword in ("Career", "Reasoning"):
        if keyword in col:
            return (keyword, col)

    # the rest
    return (col, "")


multi_cols_ee = [_ee_column_key(col) for col in ee_data.columns]

ee_data.columns = pd.MultiIndex.from_tuples(multi_cols_ee)

In [320]:
def _cg_column_key(col):
    """Return the (group, sub-column) tuple for the cg_data column *col*.

    The rules are tried in order and the first match wins; a column that
    matches none of them becomes its own group with an empty sub-column.
    """
    # ee course
    if col in ohe_eecourse_cg.columns:
        return ("EE course", col)

    # cg course
    if col in ohe_cgcourse.columns:
        return ("CG course", col)

    # attitudes: the group is the first word of the column name
    first_word = col.split()[0]
    if first_word in attitudes:
        return (first_word, col)

    # major
    if col in majors:
        return ("Field", col)

    # career, then reasoning
    for keyword in ("Career", "Reasoning"):
        if keyword in col:
            return (keyword, col)

    # the rest
    return (col, "")


multi_cols_cg = [_cg_column_key(col) for col in cg_data.columns]

cg_data.columns = pd.MultiIndex.from_tuples(multi_cols_cg)

### Format the Likert values
Convert to numerical values only.

In [321]:
# Likert answer label -> numeric score; only the two end points and the
# midpoint carry a text description.
likert_map = dict(
    zip(
        [
            "1 - Strongly Disagree",
            "2",
            "3",
            "4 - Neutral",
            "5",
            "6",
            "7 - Strongly Agree",
        ],
        range(1, 8),
    )
)

# Top-level column groups that are converted with likert_map.
likert_cols = [*attitudes, "Lecture feedback", "Self-assessment", "Career"]

In [322]:
def _to_likert_score(answers):
    """Return *answers* converted to numeric Likert scores.

    Text labels are translated with ``likert_map``. Answers that are already
    numeric are kept when they are valid scores. This covers a column that
    read_csv parsed as numbers because no labelled end point or midpoint was
    chosen, and a column converted by an earlier run of this cell. Without
    this fallback such columns would silently turn into all-NaN. Anything
    else becomes NaN.
    """
    scores = answers.map(likert_map)
    already_numeric = pd.to_numeric(answers, errors="coerce")
    already_numeric = already_numeric.where(already_numeric.isin(list(likert_map.values())))
    return scores.fillna(already_numeric)


# Each survey is converted from its own columns, so a group or question that
# exists in only one of them (e.g. the lecture feedback and self-assessment
# questions, which the control group does not have) cannot raise a KeyError.
for survey in (ee_data, cg_data):
    groups_present = set(survey.columns.get_level_values(0))
    for att in likert_cols:
        if att not in groups_present:
            continue
        for col in survey[att].columns:
            survey[(att, col)] = _to_likert_score(survey[(att, col)])

### Rename subcolumns to remove duplication of the targeted variable

In [323]:
# Groups whose sub-columns are shortened: the Likert groups plus "Reasoning".
# A new list is built instead of extending likert_cols in place: mutating the
# shared list would add "Reasoning" again on every re-run of this cell and
# would make a later re-run of the Likert conversion treat the "Reasoning"
# group as Likert answers.
rename_groups = [*likert_cols, "Reasoning"]

# Each sub-column is renamed to its last word. The mapping is built per
# survey from that survey's own columns, so groups or questions present in
# only one of the two frames are handled too.
for survey in (ee_data, cg_data):
    groups_present = set(survey.columns.get_level_values(0))
    for att in rename_groups:
        if att not in groups_present:
            continue
        new_col_names = {
            col: col.split()[-1]
            for col in survey[att].columns
            if col.split()  # skip empty labels, which have no last word
        }
        survey.rename(columns=new_col_names, level=1, inplace=True)

### Save new file that can be used for data analysis

In [324]:
# Output names: the input file names with ".csv" replaced by "_clean.csv".
clean_ee_file, clean_cg_file = (
    raw_name.replace(".csv", "_clean.csv") for raw_name in (ee_file, cg_file)
)

# Write both cleaned surveys to the data directory, without the row index.
for survey, clean_name in ((ee_data, clean_ee_file), (cg_data, clean_cg_file)):
    survey.to_csv(os.path.join(data_dir, clean_name), index=False)