In [85]:
import pandas as pd

Unindo as tabelas de português e matemática

In [86]:
def merged_database(
    mat_path="student-db/student-mat.csv",
    por_path="student-db/student-por.csv",
    sep=";",
):
    """Load the maths and Portuguese student tables and inner-join them.

    Parameters
    ----------
    mat_path : str or path-like
        CSV file with the maths table. Defaults to the path the notebook
        has always used.
    por_path : str or path-like
        CSV file with the Portuguese table. Defaults to the path the
        notebook has always used.
    sep : str
        Field separator used by both files.

    Returns
    -------
    pd.DataFrame
        One row per pair of records whose values agree on every column in
        ``id_cols``. Columns present in both files that are not join keys
        are kept twice, suffixed ``_mat`` and ``_por``.
    """
    df_mat = pd.read_csv(mat_path, sep=sep)
    df_por = pd.read_csv(por_path, sep=sep)

    # Columns used as the join key between the two tables.
    id_cols = [
        "school", "sex", "age", "address", "famsize", "Pstatus",
        "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet",
    ]

    # pd.merge defaults to an inner join, so unmatched rows are discarded.
    return pd.merge(df_mat, df_por, on=id_cols, suffixes=("_mat", "_por"))

# Build the merged table and render it as the cell output.
merged = merged_database()
display(merged)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_por,freetime_por,goout_por,Dalc_por,Walc_por,health_por,absences_por,G1_por,G2_por,G3_por
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
378,MS,F,18,U,GT3,T,1,1,other,other,...,3,4,4,2,2,5,3,7,8,7
379,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
380,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


Comparando divergências entre campos que assumimos como iguais

In [None]:
def compare_duplicate_columns(merged: pd.DataFrame):
    """Count, per duplicated column, how many rows disagree between sides.

    A column is considered duplicated when both ``<name>_mat`` and
    ``<name>_por`` exist in ``merged``. ``_mat`` columns without a ``_por``
    counterpart are skipped.

    Parameters
    ----------
    merged : pd.DataFrame
        Frame holding ``_mat`` / ``_por`` suffixed column pairs.

    Returns
    -------
    pd.DataFrame
        One row per duplicated column with:
        ``coluna`` (base name without suffix),
        ``total_diferencas`` (rows where exactly one side is null plus rows
        where both sides are present and differ) and
        ``valores_distintos`` (rows where both sides are present and differ).
        The three columns are always present, even when no pair is found.
    """
    mat_suffix, por_suffix = "_mat", "_por"
    report_columns = ["coluna", "total_diferencas", "valores_distintos"]
    diff_report = []

    for col_mat in merged.columns:
        if not (isinstance(col_mat, str) and col_mat.endswith(mat_suffix)):
            continue

        # Strip only the trailing suffix; str.replace would also remove any
        # "_mat" occurring earlier in the name (e.g. "x_math_mat").
        col = col_mat[: -len(mat_suffix)]
        col_por = col + por_suffix
        if col_por not in merged.columns:
            continue

        mat_null = merged[col_mat].isna()
        por_null = merged[col_por].isna()

        one_null = mat_null ^ por_null  # XOR: exactly one side is null
        both_present_diff = (~mat_null & ~por_null) & (merged[col_mat] != merged[col_por])

        diff_report.append({
            "coluna": col,
            "total_diferencas": one_null.sum() + both_present_diff.sum(),
            "valores_distintos": both_present_diff.sum(),
        })

    return pd.DataFrame(diff_report, columns=report_columns)

# Report per-column disagreement between the _mat and _por copies and show it.
comparation = compare_duplicate_columns(merged)
display(comparation)

Unnamed: 0,coluna,total_diferencas,valores_distintos
0,guardian,6,6
1,traveltime,5,5
2,studytime,9,9
3,failures,53,53
4,schoolsup,3,3
5,famsup,5,5
6,paid,171,171
7,activities,5,5
8,higher,4,4
9,romantic,6,6


Excluindo registros com divergências

In [88]:
def delete_divergent_lines(merged: pd.DataFrame):
    """Drop rows whose shared columns disagree between the two sides.

    For every name in ``shared_cols`` the ``<name>_mat`` and ``<name>_por``
    values are compared. A row is removed when any pair differs, including
    the case where only one side is missing. A pair that is missing on both
    sides is not treated as a divergence, matching how
    ``compare_duplicate_columns`` counts differences.

    Parameters
    ----------
    merged : pd.DataFrame
        Frame containing the ``_mat`` / ``_por`` pairs listed below.

    Returns
    -------
    pd.DataFrame
        The surviving rows with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the expected ``_mat`` / ``_por`` columns is absent.
    """
    shared_cols = [
        "studytime", "traveltime", "freetime", "famrel", "famsup",
        "schoolsup", "health", "goout", "romantic", "activities",
        "Walc", "Dalc", "guardian", "higher",
    ]

    divergent = pd.Series(False, index=merged.index, dtype=bool)
    for col in shared_cols:
        mat_values = merged[col + "_mat"]
        por_values = merged[col + "_por"]
        # `!=` reports NaN vs NaN as different; mask that case out explicitly.
        both_missing = mat_values.isna() & por_values.isna()
        divergent |= (mat_values != por_values) & ~both_missing

    return merged.loc[~divergent].reset_index(drop=True)

# Replace `merged` with the rows that passed the divergence check and show it.
merged = delete_divergent_lines(merged)
display(merged)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_por,freetime_por,goout_por,Dalc_por,Walc_por,health_por,absences_por,G1_por,G2_por,G3_por
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
366,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
367,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
368,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


Excluindo colunas redundantes

In [89]:
def drop_redundant_columns(merged: pd.DataFrame):
    """Return a copy of ``merged`` without the ``_por`` copies of shared columns.

    The ``_mat`` counterparts are left in place. ``DataFrame.drop`` raises
    ``KeyError`` if any of the listed columns is not present.
    """
    shared_names = (
        "studytime", "traveltime", "freetime", "famrel", "famsup",
        "schoolsup", "health", "goout", "romantic", "activities",
        "Walc", "Dalc", "guardian", "higher",
    )
    por_copies = [f"{name}_por" for name in shared_names]
    return merged.drop(columns=por_copies)

# Replace `merged` with the frame lacking the redundant _por columns and show it.
# NOTE: re-running this cell on the already-reduced `merged` raises KeyError,
# because the columns passed to drop() no longer exist.
merged = drop_redundant_columns(merged)
display(merged)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,absences_mat,G1_mat,G2_mat,G3_mat,failures_por,paid_por,absences_por,G1_por,G2_por,G3_por
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,6,5,6,6,0,no,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,4,5,5,6,0,no,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,10,7,8,10,0,no,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,15,14,15,0,no,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,6,10,10,0,no,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,MS,F,19,R,GT3,T,2,3,services,other,...,0,7,5,0,1,no,4,10,11,10
366,MS,F,18,U,LE3,T,3,1,teacher,services,...,0,7,9,8,0,no,4,15,15,16
367,MS,F,18,U,GT3,T,1,1,other,other,...,0,6,5,0,0,no,6,11,12,9
368,MS,M,17,U,LE3,T,3,1,services,services,...,3,14,16,16,0,no,6,10,10,10


Renomeando colunas que se aplicam tanto para português quanto para matemática

In [90]:
def rename_columns(merged: pd.DataFrame):
    """Return ``merged`` with the ``_mat`` suffix removed from shared columns.

    Only the columns listed below are renamed; any of them that is absent
    from ``merged`` is ignored by ``DataFrame.rename``.
    """
    shared_names = (
        "studytime", "traveltime", "freetime", "famrel", "famsup",
        "schoolsup", "health", "goout", "romantic", "activities",
        "Walc", "Dalc", "guardian", "higher",
    )

    # Map "<name>_mat" -> "<name>" for each shared column.
    mapping = {}
    for name in shared_names:
        mapping[name + "_mat"] = name

    return merged.rename(columns=mapping)

# Replace `merged` with the frame using the suffix-free column names and show it.
merged = rename_columns(merged)
display(merged)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,absences_mat,G1_mat,G2_mat,G3_mat,failures_por,paid_por,absences_por,G1_por,G2_por,G3_por
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,6,5,6,6,0,no,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,4,5,5,6,0,no,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,10,7,8,10,0,no,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,15,14,15,0,no,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,6,10,10,0,no,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,MS,F,19,R,GT3,T,2,3,services,other,...,0,7,5,0,1,no,4,10,11,10
366,MS,F,18,U,LE3,T,3,1,teacher,services,...,0,7,9,8,0,no,4,15,15,16
367,MS,F,18,U,GT3,T,1,1,other,other,...,0,6,5,0,0,no,6,11,12,9
368,MS,M,17,U,LE3,T,3,1,services,services,...,3,14,16,16,0,no,6,10,10,10


Salvando dataframe em arquivo student.csv

In [91]:
# Write the final frame to disk; index=False leaves the row index out of the file.
merged.to_csv("student-db/student.csv", index=False)