# Loading Data Sets

In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
def scan_directory(directory_path, file_extension=".xlsx"):
    """Recursively collect files under ``directory_path`` with a given extension.

    Args:
        directory_path: Root directory to walk (anything ``pathlib.Path`` accepts).
        file_extension: Filename suffix to match. The comparison uses
            ``str.endswith``, so it is case-sensitive.

    Returns:
        A list of matching file paths as strings, each relative to
        ``directory_path``. An empty list is returned when nothing matches, or
        when scanning raises (in which case the error is printed).
    """
    data_files = []
    try:
        directory = Path(directory_path)
        for root, _dirs, files in os.walk(directory):
            for file in files:
                if file.endswith(file_extension):
                    file_path = Path(root) / file
                    # Store the path relative to the scanned root, as a plain string.
                    data_files.append(str(file_path.relative_to(directory)))

        return data_files
    except Exception as e:
        # Best-effort: report the failure (with the actual exception text) and
        # fall back to an empty result instead of propagating.
        print(f"❌ Error scanning directory: {e}")
        return []


In [3]:
def read_files(files_list):
    """Load every path in ``files_list`` with ``pd.read_excel``.

    Args:
        files_list: Iterable of paths passed one by one to ``pd.read_excel``.

    Returns:
        A list holding the result of each read, in input order.
    """
    return [pd.read_excel(path) for path in files_list]

In [4]:
# Scan the current working directory for Excel workbooks, then load each one.
directory = os.getcwd()
xlsx_files = scan_directory(directory, file_extension=".xlsx")
dfs = read_files(xlsx_files)


# Data Cleanup

In [5]:
def create_clean_df(list_of_dfs):
    """Inner-join a list of DataFrames on ``"Name"`` and drop empty columns.

    The first frame is the starting point; every following frame in
    ``list_of_dfs`` is merged into it in order with
    ``pd.merge(..., on="Name", how="inner")``. Columns whose values are all
    missing are then removed.

    Args:
        list_of_dfs: Non-empty sequence of DataFrames, each with a ``"Name"``
            column.

    Returns:
        A new DataFrame with the merged rows and no all-NaN columns.

    Raises:
        IndexError: If ``list_of_dfs`` is empty.
    """
    merged_df = list_of_dfs[0]
    # Iterate over the argument itself, not a notebook-level global, so the
    # result depends only on what the caller passed in.
    for next_df in list_of_dfs[1:]:
        merged_df = pd.merge(merged_df, next_df, on="Name", how="inner")

    # dropna returns a new frame, so the caller's first DataFrame is not mutated.
    merged_df = merged_df.dropna(axis=1, how="all")

    return merged_df

In [6]:
# Build the combined table from the loaded frames (`dfs`) via create_clean_df.
all_df = create_clean_df(dfs)

# Bare last expression: the notebook renders the DataFrame as the cell output.
all_df

Unnamed: 0,Name,VH,VL,LC Class,Source,Source Detaileda,Disclaimers and Known Issues,Notes_x,HEK Titer (mg/L),Fab Tm by DSF (°C),...,CSI-BLI Delta Response (nm),ELISA,BVP ELISA,Light chain class,Type,Original mAb Isotype or Format,Clinical Status,Phagec,Year Name Proposed,Notes_y
0,abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,kappa,WHO-INN,PL109,,aPL and RL refer to WHO-INN publications for p...,89.555458,75.5,...,0.00,1.137375,2.720799,kappa,ZU,IgG2,Phase 2,No,2013,a Made with human isotype
1,abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,kappa,WHO-INN,PL111,,,100.223196,71.0,...,-0.02,1.124624,1.818303,kappa,HU,IgG2,Phase 2,No,2014,"b Mixed chimeric, humanized LC and HC"
2,adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,kappa,PDB,4NYL,,,134.928638,71.0,...,-0.01,1.075515,1.488186,kappa,HU,IgG1,Approved,Yes,1999,c Most as those molecules labeled Yes in the p...
3,alemtuzumab,QVQLQESGPGLVRPSQTLSLTCTVSGFTFTDFYMNWVRQPPGRGLE...,DIQMTQSPSSLSASVGDRVTITCKASQNIDKYLNWYQQKPGKAPKL...,kappa,PDB,1BEY,,,144.653543,74.5,...,-0.02,1.161491,1.464226,kappa,ZU,IgG1,Approved,No,2000,
4,alirocumab,EVQLVESGGGLVQPGGSLRLSCAASGFTFNNYAMNWVRQAPGKGLD...,DIVMTQSPDSLAVSLGERATINCKSSQSVLYRSNNRNFLGWYQQKP...,kappa,WHO-INN,PL107,,,69.232345,71.5,...,-0.01,1.196224,2.179700,kappa,HU,IgG1,Approved,No,2012,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,vedolizumab,QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLE...,DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPG...,kappa,WHO-INN,PL100,,,221.762037,80.5,...,-0.02,1.152671,1.584027,kappa,ZU,IgG1AA-mut,Approved,No,2008,
133,veltuzumab,QVQLQQSGAEVKKPGSSVKVSCKASGYTFTSYNMHWVKQAPGQGLE...,DIQLTQSPSSLSASVGDRVTMTCRASSSVSYIHWFQQKPGKAPKPW...,kappa,WHO-INN,PL98,,,224.953517,70.0,...,-0.02,0.888809,1.211236,kappa,ZU,IgG1,Phase 2,No,2007,
134,visilizumab,QVQLVQSGAEVKKPGASVKVSCKASGYTFISYTMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCSASSSVSYMNWYQQKPGKAPKRL...,kappa,US Patent,US7381803,,,242.006377,71.0,...,0.01,1.880772,4.799334,kappa,ZU,IgG2,Phase 3,No,2000,
135,zalutumumab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTYGMHWVRQAPGKGLE...,AIQLTQSPSSLSASVGDRVTITCRASQDISSALVWYQQKPGKAPKL...,kappa,US Patent,US8586041,,,200.506690,72.5,...,-0.03,1.284363,2.896506,kappa,HU,IgG1,Phase 3,No,2005,
