In [None]:
import pandas as pd

Unindo as tabelas de português e matemática

In [None]:
def merged_database(data_dir="student-db"):
    """Load the math and Portuguese student tables and inner-join them.

    Parameters
    ----------
    data_dir : str or path-like, default "student-db"
        Directory that contains ``student-mat.csv`` and ``student-por.csv``.
        Both files are read as ``;``-separated CSV.

    Returns
    -------
    pd.DataFrame
        One row per pair of rows (math, Portuguese) whose values agree on
        every column in ``id_cols``. Columns that exist in both files but are
        not join keys are kept twice, suffixed ``_mat`` and ``_por``.

    Notes
    -----
    The join keys are descriptive attributes rather than a real identifier.
    NOTE(review): if two rows in either file share the same key values, the
    merge yields one output row per matching pair - confirm this is acceptable.
    """
    df_mat = pd.read_csv(f"{data_dir}/student-mat.csv", sep=";")
    df_por = pd.read_csv(f"{data_dir}/student-por.csv", sep=";")

    # Attributes assumed to identify the same student in both files.
    id_cols = ["school","sex","age","address","famsize","Pstatus", "Medu","Fedu","Mjob","Fjob","reason","nursery","internet"]

    # how="inner" is pandas' default; spelled out so the row-dropping is visible.
    return pd.merge(df_mat, df_por, how="inner", on=id_cols, suffixes=("_mat","_por"))

# Build the joined math/Portuguese frame and display it; later cells rebind `merged`.
merged = merged_database()
display(merged)

Comparando divergências entre campos que assumimos como iguais

In [None]:
def compare_duplicate_columns(merged: pd.DataFrame) -> pd.DataFrame:
    """Count, per duplicated column, how many rows differ between the two sources.

    A column is considered duplicated when ``merged`` has a ``<name>_mat``
    column; its ``<name>_por`` counterpart is then compared row by row.

    Parameters
    ----------
    merged : pd.DataFrame
        Frame holding ``*_mat`` / ``*_por`` column pairs.

    Returns
    -------
    pd.DataFrame
        One row per duplicated column with:
        ``coluna`` (base name without suffix),
        ``total_diferencas`` (rows where exactly one side is null, plus rows
        where both sides are present and unequal) and
        ``valores_distintos`` (only the rows where both sides are present and
        unequal). The three columns exist even when no pair is found.

    Raises
    ------
    KeyError
        If a ``<name>_mat`` column has no ``<name>_por`` counterpart.
    """
    suffix_mat, suffix_por = "_mat", "_por"
    report_columns = ["coluna", "total_diferencas", "valores_distintos"]

    # Strip only the trailing suffix. str.replace would also remove "_mat"
    # inside the name (e.g. "score_math_mat" would become "scoreh").
    dup_cols = [c[: -len(suffix_mat)] for c in merged.columns if c.endswith(suffix_mat)]

    diff_report = []
    for col in dup_cols:
        mat = merged[col + suffix_mat]
        por = merged[col + suffix_por]
        mat_null, por_null = mat.isna(), por.isna()

        one_null = mat_null ^ por_null  # XOR: exactly one side is null
        # Rows null on both sides are deliberately not counted as different.
        both_present_diff = ~mat_null & ~por_null & (mat != por)

        n_distinct = int(both_present_diff.sum())
        diff_report.append({
            "coluna": col,
            "total_diferencas": int(one_null.sum()) + n_distinct,
            "valores_distintos": n_distinct,
        })

    # Passing `columns` keeps the schema stable when diff_report is empty.
    return pd.DataFrame(diff_report, columns=report_columns)

# Build the per-column difference report for the `_mat`/`_por` pairs and display it.
comparation = compare_duplicate_columns(merged)
display(comparation)

Excluindo registros com divergências

In [None]:
def delete_divergent_lines(merged:pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose math and Portuguese records disagree on shared attributes.

    For each attribute listed in ``shared_cols`` the ``<name>_mat`` and
    ``<name>_por`` values are compared; a row is removed if any pair differs.
    A pair that is missing on both sides is treated as equal (plain ``!=``
    would flag it, because NaN never compares equal to NaN).

    Parameters
    ----------
    merged : pd.DataFrame
        Frame containing both suffixed versions of every attribute in
        ``shared_cols``.

    Returns
    -------
    pd.DataFrame
        New frame with the divergent rows removed and a fresh 0..n-1 index.
        ``merged`` itself is not modified.

    Raises
    ------
    KeyError
        If any expected ``_mat`` / ``_por`` column is absent.
    """
    shared_cols = [
        "studytime", "traveltime", "freetime", "famrel", "famsup",
        "schoolsup", "health", "goout", "romantic", "activities",
        "Walc", "Dalc", "guardian", "higher",
    ]

    # Accumulates True for every row that diverges on at least one attribute.
    divergent = pd.Series(False, index=merged.index, dtype=bool)
    for col in shared_cols:
        mat = merged[col + "_mat"]
        por = merged[col + "_por"]
        both_null = mat.isna() & por.isna()
        divergent |= (mat != por) & ~both_null

    return merged.loc[~divergent].reset_index(drop=True)

# Rebind `merged` to the frame returned by delete_divergent_lines, then display it.
merged = delete_divergent_lines(merged)
display(merged)

Excluindo colunas redundantes

In [None]:
def drop_redundant_columns(merged: pd.DataFrame):
    """Return ``merged`` without the fourteen ``*_por`` columns listed below.

    The input frame is left untouched; a new frame is returned. Because
    ``DataFrame.drop`` raises on unknown labels by default, a ``KeyError`` is
    raised if any of the listed columns is absent.
    """
    por_duplicates = (
        "studytime_por",
        "traveltime_por",
        "freetime_por",
        "famrel_por",
        "famsup_por",
        "schoolsup_por",
        "health_por",
        "goout_por",
        "romantic_por",
        "activities_por",
        "Walc_por",
        "Dalc_por",
        "guardian_por",
        "higher_por",
    )
    return merged.drop(labels=list(por_duplicates), axis="columns")

# Rebind `merged` to the frame without the redundant `_por` columns, then display it.
# NOTE: running this cell twice raises KeyError, since the columns are already gone.
merged = drop_redundant_columns(merged)
display(merged)

Renomeando colunas que se aplicam tanto para português quanto para matemática

In [None]:
def rename_columns(merged:pd.DataFrame):
    """Return ``merged`` with the listed ``*_mat`` columns renamed without the suffix.

    Only the fourteen columns named below are affected (for example
    ``studytime_mat`` becomes ``studytime``); every other column keeps its
    name. Listed columns that are absent from ``merged`` are silently
    ignored, which is the default behaviour of ``DataFrame.rename``.
    """
    shared_mat_columns = (
        "studytime_mat",
        "traveltime_mat",
        "freetime_mat",
        "famrel_mat",
        "famsup_mat",
        "schoolsup_mat",
        "health_mat",
        "goout_mat",
        "romantic_mat",
        "activities_mat",
        "Walc_mat",
        "Dalc_mat",
        "guardian_mat",
        "higher_mat",
    )

    # Map each suffixed name to its bare form.
    new_names = {}
    for old_name in shared_mat_columns:
        new_names[old_name] = old_name.replace("_mat", "")

    return merged.rename(columns=new_names)

# Rebind `merged` to the frame with the shared columns renamed, then display it.
merged = rename_columns(merged)
display(merged)

Salvando dataframe em arquivo student.csv

In [None]:
# Write the final frame to disk without the index column.
# NOTE: the output uses to_csv's default "," separator, whereas the input files were read with sep=";".
merged.to_csv("student-db/student.csv", index=False)