In [27]:
import pandas as pd
import numpy as np

from src.auxiliar_functions import AuxFunctions

In [28]:

path_ki67 = "../raw_data/KI67.xlsx"
path_vp = "../raw_data/viability_vs_proliferation.xlsx"

df_ki67 = pd.read_excel(path_ki67)
df_vp = pd.read_excel(path_vp)

print("KI67 shape:", df_ki67.shape)
print("Viability/Proliferation shape:", df_vp.shape)


KI67 shape: (60, 4)
Viability/Proliferation shape: (45, 5)


In [29]:
print("KI67 columns:")
display(df_ki67.columns)

print("\nViability/Proliferation columns:")
display(df_vp.columns)


KI67 columns:


Index(['Line', 'Enzyme', 'Treatment', 'ki67'], dtype='object')


Viability/Proliferation columns:


Index(['Celular line', 'Enzyme', 'Viability', 'Proliferation', 'Treatment'], dtype='object')

In [30]:
df_ki67 = AuxFunctions.normalize_columns(df_ki67)
df_vp = AuxFunctions.normalize_columns(df_vp)

print("Normalized KI67 columns:", df_ki67.columns.tolist())
print("Normalized VP columns:", df_vp.columns.tolist())

Normalized KI67 columns: ['line', 'enzyme', 'treatment', 'ki67']
Normalized VP columns: ['celular_line', 'enzyme', 'viability', 'proliferation', 'treatment']


In [31]:
df_ki67 = df_ki67.rename(columns={
    "line": "cell_line"
})

In [32]:
df_vp = df_vp.rename(columns={
    "celular_line": "cell_line"
})


In [33]:
print("KI67 columns:", df_ki67.columns.tolist())
print("VP columns:", df_vp.columns.tolist())

KI67 columns: ['cell_line', 'enzyme', 'treatment', 'ki67']
VP columns: ['cell_line', 'enzyme', 'viability', 'proliferation', 'treatment']


In [34]:
merge_keys = ["cell_line", "enzyme", "treatment"]

# Sanity check
for k in merge_keys:
    assert k in df_ki67.columns, f"{k} missing in KI67"
    assert k in df_vp.columns, f"{k} missing in VP"


In [35]:
df_ki67["cell_line"].value_counts()

cell_line
RQ       20
DFB      20
MCF-7    20
Name: count, dtype: int64

In [36]:
df_vp["cell_line"].value_counts()

cell_line
RQ       15
DFB      15
MCF-7    15
Name: count, dtype: int64

In [37]:
df_ki67["enzyme"].value_counts()

enzyme
L-Asparaginase        12
L-Glutaminase         12
Serine deaminase      12
Glycine oxidase       12
Arginine deiminase    12
Name: count, dtype: int64

In [47]:
df_ki67["enzyme"] = df_ki67["enzyme"].replace({
    "L-Asparaginase" : "ASP",
    "L-Glutaminase" : "GLU",
    "Serine deaminase" : "SDH",
    "Glycine oxidase" : "GO",
    "Arginine deiminase" : "ADI"
})

In [48]:
df_ki67["enzyme"].value_counts()

enzyme
ASP    12
GLU    12
SDH    12
GO     12
ADI    12
Name: count, dtype: int64

In [39]:
df_vp["enzyme"].value_counts()

enzyme
ASP    9
GLU    9
SDH    9
GO     9
ADI    9
Name: count, dtype: int64

In [40]:
df_ki67.dtypes

cell_line     object
enzyme        object
treatment     object
ki67         float64
dtype: object

In [41]:
df_ki67.head()

Unnamed: 0,cell_line,enzyme,treatment,ki67
0,RQ,L-Asparaginase,Control,97.9
1,RQ,L-Asparaginase,0.5,27.4
2,RQ,L-Asparaginase,0.7,12.0
3,RQ,L-Asparaginase,0.9,78.9
4,RQ,L-Glutaminase,Control,97.9


In [42]:
df_ki67["treatment"] = df_ki67["treatment"].replace({"Control":0})

In [43]:
df_vp.dtypes

cell_line         object
enzyme            object
viability        float64
proliferation    float64
treatment        float64
dtype: object

In [44]:
df_vp["treatment"].unique()

array([0.5, 0.7, 0.9])

In [49]:
df_merged = (
    df_ki67
    .merge(
        df_vp,
        on=merge_keys,
        how="inner",
        suffixes=("_ki67", "_vp")
    )
)

print("Merged shape:", df_merged.shape)
df_merged.head()


Merged shape: (0, 6)


Unnamed: 0,cell_line,enzyme,treatment,ki67,viability,proliferation
