- Handling warnings

In [1]:
import warnings
# NOTE(review): with no category/module given, this suppresses every warning raised
# for the rest of the session, which can hide real problems in the cells below
# (presumably pandas SettingWithCopyWarning and numpy RuntimeWarning — confirm and
# narrow the filter if possible).
warnings.filterwarnings("ignore")

- Loading modules

In [2]:
import pandas as pd
import numpy as np
import json

- Implementation of auxiliary functions

In [3]:
def get_average_measure(morning_d1, morning_d2):
    """Combine two readings into one value, tolerating missing readings.

    Despite the parameter names, this notebook calls the function for both
    morning and afternoon readings.

    Parameters
    ----------
    morning_d1 : scalar
        First reading (may be missing according to ``pd.isna``).
    morning_d2 : scalar
        Second reading (may be missing according to ``pd.isna``).

    Returns
    -------
    scalar
        The mean of both readings when both are present, the single available
        reading when only one is present, and ``np.nan`` when both are missing.
    """
    first_missing = pd.isna(morning_d1)
    second_missing = pd.isna(morning_d2)

    # Nothing to average.
    if first_missing and second_missing:
        return np.nan

    # Exactly one reading available: return it unchanged.
    if second_missing:
        return morning_d1
    if first_missing:
        return morning_d2

    return np.mean([morning_d1, morning_d2])

In [4]:
def get_average_difference(row):
    """Average the per-day afternoon-minus-morning IgA change of one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'IgA D1-Morning', 'IgA D1-Afternoon',
        'IgA D2-Morning' and 'IgA D2-Afternoon'.

    Returns
    -------
    float
        Mean of the differences of the days on which both the morning and the
        afternoon reading are present; ``np.nan`` when no day is complete.
    """
    day_pairs = (
        ('IgA D1-Afternoon', 'IgA D1-Morning'),
        ('IgA D2-Afternoon', 'IgA D2-Morning'),
    )

    # Only days with both readings contribute a difference.
    daily_changes = [
        row[afternoon] - row[morning]
        for afternoon, morning in day_pairs
        if pd.notna(row[afternoon]) and pd.notna(row[morning])
    ]

    if not daily_changes:
        return np.nan
    return np.mean(daily_changes)

- Reading raw data

In [5]:
# Load the raw table and preview its first five rows.
df_data = pd.read_csv(filepath_or_buffer="../raw_data/sIgA_data/raw_data.csv")
df_data.head(n=5)

Unnamed: 0,m1,m2,m3,m4,N°_diada,BienestarV1,BienestarV2,BienestarAv,Género_bebé,Edad_bebé_1,...,ads_a_i_pv_2,ads_m_i_pv_2_r,ads_v_i_pv_r,ads_ta_i_pv_2_r,ads_sa_i_pv_r,ads_a_i_pv_2_r,ads_p_i_pv_r,countinsecurei_2020,countsecurei_2020,ADS2i_2020
0,20.076728,31.719207,23.453752,25.016903,1,5.0,4.0,4.5,1,5,...,3.0,0.0,1.0,0.0,1.0,1.0,0.0,3,3,0
1,28.58492,82.400761,28.361209,19.363161,4,,,,1,10,...,4.0,0.0,1.0,0.0,1.0,1.0,0.0,3,3,0
2,161.424693,46.867951,,,5,,,,0,5,...,3.0,0.0,1.0,0.0,1.0,1.0,0.0,3,3,0
3,26.747838,,21.599136,35.942378,7,4.0,4.0,4.0,0,12,...,3.0,0.0,1.0,0.0,0.0,1.0,0.0,4,2,0
4,118.901337,,37.125222,382.715268,11,4.0,4.0,4.0,1,8,...,,,,,,,,999,999,999


- Defining columns to use

In [6]:
# Raw columns kept for the analysis. The order matters: the rename cell below
# relabels them as ID, the four IgA readings (m1-m4), Gender and Attachment.
columns_to_use = [
    "N°_diada",
    "m1", "m2", "m3", "m4",
    "Género_bebé",
    "Apego_dic_SSP",
]

- Selecting data from raw dataset

In [7]:
# Take an explicit copy: later cells relabel and add columns on this frame, and
# doing that on a bare subset of df_data triggers pandas' SettingWithCopyWarning.
data_selection = df_data[columns_to_use].copy()
data_selection.head(5)

Unnamed: 0,N°_diada,m1,m2,m3,m4,Género_bebé,Apego_dic_SSP
0,1,20.076728,31.719207,23.453752,25.016903,1,0.0
1,4,28.58492,82.400761,28.361209,19.363161,1,1.0
2,5,161.424693,46.867951,,,0,1.0
3,7,26.747838,,21.599136,35.942378,0,0.0
4,11,118.901337,,37.125222,382.715268,1,


- Rename columns

In [8]:
# Relabel through an explicit old -> new mapping instead of assigning a
# positional list, so a column can never receive the wrong label if the order of
# the selected columns changes. rename() also returns a new frame, leaving the
# selection it was derived from untouched.
data_selection = data_selection.rename(columns={
    "N°_diada": "ID",
    "m1": "IgA D1-Morning",
    "m2": "IgA D1-Afternoon",
    "m3": "IgA D2-Morning",
    "m4": "IgA D2-Afternoon",
    "Género_bebé": "Gender",
    "Apego_dic_SSP": "Attachment",
})
data_selection.head(5)

Unnamed: 0,ID,IgA D1-Morning,IgA D1-Afternoon,IgA D2-Morning,IgA D2-Afternoon,Gender,Attachment
0,1,20.076728,31.719207,23.453752,25.016903,1,0.0
1,4,28.58492,82.400761,28.361209,19.363161,1,1.0
2,5,161.424693,46.867951,,,0,1.0
3,7,26.747838,,21.599136,35.942378,0,0.0
4,11,118.901337,,37.125222,382.715268,1,


- Get average morning and afternoon

In [9]:
# Per-row morning value across day 1 and day 2; get_average_measure decides what
# to return when one or both readings are missing.
data_selection["sIgA_Average_Morning"] = [
    get_average_measure(day1, day2)
    for day1, day2 in zip(
        data_selection["IgA D1-Morning"], data_selection["IgA D2-Morning"]
    )
]

In [10]:
# Per-row afternoon value across day 1 and day 2; get_average_measure decides
# what to return when one or both readings are missing.
data_selection["sIgA_Average_Afternoon"] = [
    get_average_measure(day1, day2)
    for day1, day2 in zip(
        data_selection["IgA D1-Afternoon"], data_selection["IgA D2-Afternoon"]
    )
]

- Get average differences

In [11]:
# Row-wise mean of the same-day afternoon-minus-morning differences.
data_selection["sIgA_Average_Difference"] = data_selection.apply(
    func=get_average_difference,
    axis="columns",
)

- Reorder columns (derived averages first, raw measurements last)

In [12]:
# Reorder the columns: identifiers and derived averages first, raw readings last.
# The explicit copy makes the result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data_selection = data_selection[[
    'ID', 'Gender',
    'sIgA_Average_Morning', 'sIgA_Average_Afternoon', 'sIgA_Average_Difference',
    'Attachment',
    'IgA D1-Morning', 'IgA D1-Afternoon',
    'IgA D2-Morning', 'IgA D2-Afternoon',
]].copy()

- Working with Gender

In [13]:
# Read the column descriptions. The encoding is given explicitly because the file
# holds non-ASCII keys (e.g. 'Género_bebé') and open() would otherwise fall back
# to the platform's locale encoding.
# NOTE(review): assumes the file is stored as UTF-8, the standard JSON encoding — confirm.
with open("../raw_data/sIgA_data/desc_columns.json", 'r', encoding='utf-8') as doc_open:
    definitions_variables_doc = json.load(doc_open)

In [14]:
# Find the gender description by its key rather than by a hard-coded list
# position, so the lookup keeps working if the entries of the file are reordered.
next(
    entry['Género_bebé']
    for entry in definitions_variables_doc
    if 'Género_bebé' in entry
)

[{'nominal_value': '0', 'description': 'Femenino'},
 {'nominal_value': '1', 'description': 'Masculino'}]

In [15]:
# Turn the numeric gender codes into labels (0 -> Female, 1 -> Male) and count them.
data_selection["Gender"] = data_selection["Gender"].replace(
    to_replace={0: 'Female', 1: 'Male'}
)
data_selection["Gender"].value_counts()

Gender
Female    20
Male      15
Name: count, dtype: int64

- Checking attachment

In [16]:
# Frequency of each attachment code; rows with a missing code are not counted.
data_selection["Attachment"].value_counts(dropna=True)

Attachment
1.0    18
0.0     9
Name: count, dtype: int64

In [17]:
# Locate the entry describing "Apego_dic_SSP". The for/else raises when no entry
# matches instead of silently falling through and displaying the last entry, and
# the membership test also tolerates entries that have no keys at all.
for index, entry in enumerate(definitions_variables_doc):
    keys = list(entry.keys())
    if "Apego_dic_SSP" in keys:
        break
else:
    raise KeyError("Apego_dic_SSP is not described in desc_columns.json")

definitions_variables_doc[index]

{'Apego_dic_SSP': [{'nominal_value': '0', 'description': 'insecure'},
  {'nominal_value': '1', 'description': 'secure'}]}

In [18]:
# Label the attachment codes (1 -> Secure, 0 -> Insecure); rows with no code
# become 'Unknown'.
attachment_labelled = data_selection['Attachment'].replace({0: 'Insecure', 1: 'Secure'})
data_selection['Attachment'] = attachment_labelled.fillna('Unknown')
data_selection['Attachment'].value_counts()

Attachment
Secure      18
Insecure     9
Unknown      8
Name: count, dtype: int64

- Transform data to log scale (natural logarithm)

In [19]:
# Natural logarithm of each raw reading, appended as four new columns
# (missing readings stay missing).
data_selection = data_selection.assign(
    D1_morning_log=np.log(data_selection['IgA D1-Morning']),
    D2_morning_log=np.log(data_selection['IgA D2-Morning']),
    D1_afternoon_log=np.log(data_selection['IgA D1-Afternoon']),
    D2_afternoon_log=np.log(data_selection['IgA D2-Afternoon']),
)

In [20]:
# Per-row averages of the log-transformed readings, using the same
# missing-value rules as the raw-scale averages (see get_average_measure).
data_selection["sIgA_Average_Afternoon_log"] = [
    get_average_measure(day1_log, day2_log)
    for day1_log, day2_log in zip(
        data_selection["D1_afternoon_log"], data_selection["D2_afternoon_log"]
    )
]

data_selection["sIgA_Average_Morning_log"] = [
    get_average_measure(day1_log, day2_log)
    for day1_log, day2_log in zip(
        data_selection["D1_morning_log"], data_selection["D2_morning_log"]
    )
]

data_selection

Unnamed: 0,ID,Gender,sIgA_Average_Morning,sIgA_Average_Afternoon,sIgA_Average_Difference,Attachment,IgA D1-Morning,IgA D1-Afternoon,IgA D2-Morning,IgA D2-Afternoon,D1_morning_log,D2_morning_log,D1_afternoon_log,D2_afternoon_log,sIgA_Average_Afternoon_log,sIgA_Average_Morning_log
0,1,Male,21.76524,28.368055,6.602815,Insecure,20.076728,31.719207,23.453752,25.016903,2.999561,3.15503,3.456922,3.219552,3.338237,3.077296
1,4,Male,28.473065,50.881961,22.408896,Secure,28.58492,82.400761,28.361209,19.363161,3.352879,3.345022,4.411595,2.963372,3.687484,3.348951
2,5,Female,161.424693,46.867951,-114.556742,Secure,161.424693,46.867951,,,5.084039,,3.847334,,3.847334,5.084039
3,7,Female,24.173487,35.942378,14.343241,Insecure,26.747838,,21.599136,35.942378,3.286454,3.072653,,3.581917,3.581917,3.179553
4,11,Male,78.013279,382.715268,345.590046,Unknown,118.901337,,37.125222,382.715268,4.778294,3.614297,,5.947291,5.947291,4.196295
5,12,Female,29.55487,45.078186,15.523316,Secure,24.10479,71.998771,35.00495,18.157601,3.182411,3.555489,4.276649,2.899089,3.587869,3.36895
6,14,Female,51.715304,43.156162,-8.559142,Secure,17.789599,55.793223,85.64101,30.519101,2.878614,4.450164,4.021652,3.418353,3.720003,3.664389
7,15,Female,19.116347,146.183135,127.066789,Unknown,12.355559,224.737565,25.877134,67.628706,2.514106,3.25336,5.414933,4.214033,4.814483,2.883733
8,16,Female,57.476955,73.891936,16.41498,Secure,79.173752,108.51294,35.780159,39.270931,4.371645,3.577394,4.686869,3.670485,4.178677,3.974519
9,17,Female,15.98167,28.1785,12.19683,Secure,16.30505,23.264672,15.65829,33.092328,2.791475,2.751001,3.146936,3.499301,3.323119,2.771238


In [21]:
# Log-scale change: averaged afternoon log minus averaged morning log.
# NOTE(review): unlike get_average_difference, which only uses days that have
# both readings, this subtracts the two averages directly, so a row with an
# unpaired reading mixes days (e.g. a two-day morning average against a one-day
# afternoon value). Confirm this difference in method is intended.
data_selection['sIgA_Average_Difference_log'] = data_selection['sIgA_Average_Afternoon_log'].sub(
    data_selection['sIgA_Average_Morning_log']
)

In [22]:
# List the column labels currently present in data_selection.
data_selection.columns

Index(['ID', 'Gender', 'sIgA_Average_Morning', 'sIgA_Average_Afternoon',
       'sIgA_Average_Difference', 'Attachment', 'IgA D1-Morning',
       'IgA D1-Afternoon', 'IgA D2-Morning', 'IgA D2-Afternoon',
       'D1_morning_log', 'D2_morning_log', 'D1_afternoon_log',
       'D2_afternoon_log', 'sIgA_Average_Afternoon_log',
       'sIgA_Average_Morning_log', 'sIgA_Average_Difference_log'],
      dtype='object')

In [23]:
# Keep only the identifier, the labels and the derived raw/log summaries for the
# export; the per-day readings and their logs are left out.
data_selection = data_selection[[
    'ID',
    'Gender',
    'sIgA_Average_Morning',
    'sIgA_Average_Afternoon',
    'sIgA_Average_Difference',
    'sIgA_Average_Morning_log',
    'sIgA_Average_Afternoon_log',
    'sIgA_Average_Difference_log',
    'Attachment',
]]
data_selection.head(n=5)

Unnamed: 0,ID,Gender,sIgA_Average_Morning,sIgA_Average_Afternoon,sIgA_Average_Difference,sIgA_Average_Morning_log,sIgA_Average_Afternoon_log,sIgA_Average_Difference_log,Attachment
0,1,Male,21.76524,28.368055,6.602815,3.077296,3.338237,0.260941,Insecure
1,4,Male,28.473065,50.881961,22.408896,3.348951,3.687484,0.338533,Secure
2,5,Female,161.424693,46.867951,-114.556742,5.084039,3.847334,-1.236705,Secure
3,7,Female,24.173487,35.942378,14.343241,3.179553,3.581917,0.402364,Insecure
4,11,Male,78.013279,382.715268,345.590046,4.196295,5.947291,1.750996,Unknown


- Exporting processed data

In [24]:
# Write the processed table to disk without the row index.
data_selection.to_csv(
    path_or_buf="../processed_data/1_processed_data_IgA.csv",
    index=False,
)