### Preprocessing input data

The input dataset hast two samples collected from different sources.

This notebook facilitate the analysis of collected sample working on the following steps:

1. Processing columns and rename
2. Estimating category
3. Evaluating null examples
4. Concatenating samples
5. Generating a unique dataset

In [1]:
import warnings
warnings.filterwarnings("ignore")

- Load libraries

In [None]:
import pandas as pd

#### Read and preprocess dataset

The dataset has two samples from different sources

In [3]:
df_data_rs = pd.read_excel("../../raw_data/HCS.xlsx", sheet_name="cohorte_rs")
df_data_and = pd.read_excel("../../raw_data/HCS.xlsx", sheet_name="cohorte_and")

In [4]:
df_data_rs = df_data_rs[['edad', 'peso', 'talla', 'imc', 'glicemia', 'pas1t', 'pad1t',
       'ct1', 'tg1', 'hdl1', 'ldl1', 'tipo']]
df_data_rs.shape

(77, 12)

In [5]:
df_data_and = df_data_and[['edad', 'peso', 'talla', 'imc', 'glicemia', 'pas1t', 'pad1t',
       'ct1', 'tg1', 'hdl1', 'ldl1', 'ct3']]
df_data_and.shape

(83, 12)

In [6]:
df_data_and["ct3"] = (df_data_and["ct3"] > 290).astype(int)

In [7]:
df_data_and.rename(columns={"ct3":"tipo"}, inplace=True)
df_data_and

Unnamed: 0,edad,peso,talla,imc,glicemia,pas1t,pad1t,ct1,tg1,hdl1,ldl1,tipo
0,33,68.0,169,23.80,76.2,101,60,239,85.1,69.0,153.0,1
1,37,58.8,162,22.40,71.9,101,58,222,97.0,65.6,137.0,1
2,30,63.0,160,24.60,79.8,113,58,197,100.0,56.3,120.7,1
3,26,49.0,154,20.70,81.3,98,53,228,67.0,80.4,134.2,1
4,25,62.0,163,23.30,82.5,106,61,215,130.0,52.6,136.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
78,30,53.0,156,21.70,85.1,140,88,158,55.0,43.5,103.5,0
79,32,63.0,169,22.05,74.2,109,72,238,123.0,84.1,129.3,0
80,34,66.0,155,27.50,79.5,104,69,216,183.0,56.8,122.6,0
81,37,88.5,171,30.20,85.2,106,53,146,97.0,54.0,72.6,0


In [8]:
df_data_and.dtypes

edad          int64
peso        float64
talla         int64
imc         float64
glicemia    float64
pas1t         int64
pad1t         int64
ct1           int64
tg1         float64
hdl1        float64
ldl1        float64
tipo          int64
dtype: object

In [9]:
df_data_and["imc_check"] = df_data_and["peso"]/(df_data_and["talla"]/100 * df_data_and["talla"]/100)

df_data_and = df_data_and.round(decimals=1)

df_data_and["is_imc_correct"] = df_data_and["imc_check"] == df_data_and["imc"]
df_data_and["is_imc_correct"].value_counts()

is_imc_correct
True     70
False    13
Name: count, dtype: int64

In [10]:
df_data_rs["imc_check"] = df_data_rs["peso"]/(df_data_rs["talla"]/100 * df_data_rs["talla"]/100)

df_data_rs = df_data_rs.round(decimals=1)

df_data_rs["is_imc_correct"] = df_data_rs["imc_check"] == df_data_rs["imc"]
df_data_rs["is_imc_correct"].value_counts()

is_imc_correct
True     48
False    29
Name: count, dtype: int64

In [11]:
df_data_rs = df_data_rs.drop(columns=["imc", "is_imc_correct"])
df_data_and = df_data_and.drop(columns=["imc", "is_imc_correct"])

In [12]:
df_data_and.rename(columns={"imc_check": "imc"}, inplace=True)
df_data_rs.rename(columns={"imc_check": "imc"}, inplace=True)

In [13]:
df_data_rs.dropna().shape

(65, 12)

In [14]:
df_data_and.dropna().shape

(83, 12)

In [15]:
rs_null_check = df_data_rs.isna().astype(int)

matrix_summary = []

for column in rs_null_check.columns:
    row = [column, 0, 0]

    value_counts = rs_null_check[column].value_counts()

    if 1 in value_counts.index:
        row[1] = value_counts[1]
    if 0 in value_counts.index:
        row[2] = value_counts[0]
    
    matrix_summary.append(row)

df_summary_nulls_rs = pd.DataFrame(data=matrix_summary, columns=["descriptor", "Is Null", "Is not null"])
df_summary_nulls_rs

Unnamed: 0,descriptor,Is Null,Is not null
0,edad,0,77
1,peso,1,76
2,talla,2,75
3,glicemia,8,69
4,pas1t,4,73
5,pad1t,4,73
6,ct1,0,77
7,tg1,0,77
8,hdl1,0,77
9,ldl1,0,77


In [16]:
df_data_rs["null_example"] = rs_null_check.sum(axis=1, numeric_only=True)
df_data_rs["null_example"] = (df_data_rs["null_example"]>0).astype(int)
df_data_rs["null_example"].value_counts()

null_example
0    65
1    12
Name: count, dtype: int64

In [17]:
df_data_rs[df_data_rs["null_example"] == 1]["tipo"].value_counts()

tipo
0    10
1     2
Name: count, dtype: int64

In [18]:
df_data_rs[df_data_rs["null_example"] == 1]

Unnamed: 0,edad,peso,talla,glicemia,pas1t,pad1t,ct1,tg1,hdl1,ldl1,tipo,imc,null_example
3,35,68.0,165.0,68.0,,,146,37,64,36,0,25.0,1
11,37,72.0,165.0,,100.0,60.0,139,110,27,97,0,26.4,1
14,37,55.0,153.0,,100.0,60.0,188,198,45,104,0,23.5,1
32,13,,,86.0,90.0,50.0,147,83,35,95,0,,1
37,37,66.0,168.0,,,,175,81,52,107,0,23.4,1
39,28,86.5,162.0,,90.0,60.0,154,166,49,73,0,33.0,1
46,36,59.0,157.0,,120.0,80.0,244,102,76,143,0,23.9,1
50,31,61.0,,67.0,100.0,60.0,162,106,64,76,0,,1
54,26,68.0,170.0,,,,172,97,68,90,0,23.5,1
56,41,68.0,150.0,,105.0,60.0,184,131,52,142,0,30.2,1


In [19]:
df_data_rs["tipo"].value_counts()

tipo
0    61
1    16
Name: count, dtype: int64

In [20]:
df_data_and["tipo"].value_counts()

tipo
0    54
1    29
Name: count, dtype: int64

In [21]:
df_data_rs = df_data_rs.dropna()
df_data_rs = df_data_rs.drop(columns=["null_example"])
df_data_rs

Unnamed: 0,edad,peso,talla,glicemia,pas1t,pad1t,ct1,tg1,hdl1,ldl1,tipo,imc
0,21,63.0,158.0,70.0,96.0,64.0,140,75,61,64,0,25.2
1,20,112.0,164.0,77.0,80.0,60.0,126,81,36,74,0,41.6
2,19,77.8,165.0,93.0,102.0,64.0,149,105,56,73,0,28.6
4,29,92.0,165.0,90.0,110.0,70.0,149,144,49,71,0,33.8
5,18,79.0,160.0,66.0,106.0,60.0,167,150,22,115,0,30.9
...,...,...,...,...,...,...,...,...,...,...,...,...
71,33,78.0,168.0,134.0,110.0,70.0,188,121,48,108,1,27.6
73,29,66.0,147.0,92.0,120.0,80.0,254,170,81,139,1,30.5
74,36,62.0,155.0,93.0,90.0,60.0,234,240,57,129,1,25.8
75,23,53.0,167.0,72.0,100.0,50.0,184,79,56,112,1,19.0


In [22]:
df_concat = pd.concat([df_data_rs, df_data_and], axis=0)
df_concat

Unnamed: 0,edad,peso,talla,glicemia,pas1t,pad1t,ct1,tg1,hdl1,ldl1,tipo,imc
0,21,63.0,158.0,70.0,96.0,64.0,140,75.0,61.0,64.0,0,25.2
1,20,112.0,164.0,77.0,80.0,60.0,126,81.0,36.0,74.0,0,41.6
2,19,77.8,165.0,93.0,102.0,64.0,149,105.0,56.0,73.0,0,28.6
4,29,92.0,165.0,90.0,110.0,70.0,149,144.0,49.0,71.0,0,33.8
5,18,79.0,160.0,66.0,106.0,60.0,167,150.0,22.0,115.0,0,30.9
...,...,...,...,...,...,...,...,...,...,...,...,...
78,30,53.0,156.0,85.1,140.0,88.0,158,55.0,43.5,103.5,0,21.8
79,32,63.0,169.0,74.2,109.0,72.0,238,123.0,84.1,129.3,0,22.1
80,34,66.0,155.0,79.5,104.0,69.0,216,183.0,56.8,122.6,0,27.5
81,37,88.5,171.0,85.2,106.0,53.0,146,97.0,54.0,72.6,0,30.3


In [23]:
df_concat["tipo"].value_counts()

tipo
0    105
1     43
Name: count, dtype: int64

In [24]:
df_concat.columns

Index(['edad', 'peso', 'talla', 'glicemia', 'pas1t', 'pad1t', 'ct1', 'tg1',
       'hdl1', 'ldl1', 'tipo', 'imc'],
      dtype='object')

In [25]:
df_concat.columns = ['Age', 'Weight', 'Height', 'Clycemia', 'Systolic blood pressure', 'Diastolic blood pressure', 
                     'Cholesterol', 'Triglycerides', 'HDL', 'LDL', 'MSPH', 'BMI']

In [26]:
df_concat.to_csv("../../results/processed_dataset/1_processed_input_dataset.csv", index=False)