# 1. Preprocessing

This is the first Jupyter notebook that must be run: it preprocesses the raw database for the analysis. It also contains section 1.1, where simple demographic statistics are calculated.

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway, kruskal
from scipy.stats import shapiro, anderson
import scipy.stats as stats
from scipy.stats import ttest_rel
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from statsmodels.formula.api import ols


In [2]:
# Read the raw export; fields in this file are separated by ';'
raw_csv_path = '~/projects/6articulodbs/raw/data.csv'
df = pd.read_csv(raw_csv_path, sep=';')


In [3]:

# Build one frame per side (DERECHO / IZQUIERDO columns) with a shared column
# layout, so that each row describes a single electrode; the image/BrainSense
# correlation can then be done for individual electrodes.
right_columns = {
    'CIC': 'ID',
    'BETA DERECHO': 'BETA',
    'FRECUENCIA BETA DERECHO': 'FRECUENCIA',
    'MAGNITUD BETA DERECHO': 'MAGNITUD',
    'LEAD DER': 'LEAD',
    'IMAGEN DERECHO': 'IMAGEN',
    'CAMBIO DER': 'CAMBIO',
}
left_columns = {
    'CIC': 'ID',
    'BETA IZQUIERDO': 'BETA',
    'FRECUENCIA BETA IZQUIERDO': 'FRECUENCIA',
    'MAGNITUD BETA IZQUIERDO': 'MAGNITUD',
    'LEAD IZQ': 'LEAD',
    'IMAGEN IZQUIERDO': 'IMAGEN',
    'CAMBIO IZQ': 'CAMBIO',
}

# Select the source columns in mapping order, then rename them to the shared names
df_d = df[list(right_columns)].rename(columns=right_columns)
df_i = df[list(left_columns)].rename(columns=left_columns)


In [None]:

# Stack df_d and df_i row-wise into a single frame with a fresh 0..n-1 index
df_e = pd.concat(objs=[df_d, df_i], axis=0, ignore_index=True)


In [None]:

# Keep only rows where both measurements are present, then renumber the index
required_columns = ['FRECUENCIA', 'MAGNITUD']
df_e = df_e.dropna(axis=0, how='any', subset=required_columns)
df_e = df_e.reset_index(drop=True)

In [None]:

# Decimals in the raw data are written inconsistently (0.1 in some rows,
# 0,1 in others): go through text, turn commas into dots, then parse as float.
for decimal_column in ('FRECUENCIA', 'MAGNITUD'):
    as_text = df_e[decimal_column].astype(str)
    df_e[decimal_column] = as_text.str.replace(',', '.').astype(float)

# Reduce IMAGEN (possibly several contacts joined by '=') to a single contact per row
def _contact_label(value):
    """Render a contact identifier as a canonical string.

    Integral numbers and numeric strings collapse to the same label
    (9, 9.0, '9', ' 9 ' and '9.0' all become '9'); anything that is not an
    integral number is returned as its stripped string form.
    """
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text
    # nan/inf are not integral, so they fall through unchanged
    return str(int(number)) if number.is_integer() else text


def check_contact_in_image(row):
    """Pick the single IMAGEN contact to keep for one electrode row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'BETA' (one contact) and 'IMAGEN' (one contact, or
        several contacts separated by '=').

    Returns
    -------
    str
        The BETA contact when it is one of the IMAGEN contacts; otherwise the
        first IMAGEN contact as a fallback.

    Notes
    -----
    Both sides are normalised before comparing. Without this, a BETA stored
    as a float (9.0) is stringified as '9.0' and never matches the '9' found
    in IMAGEN, so the fallback would be taken for every row. A numeric IMAGEN
    is accepted as well; a missing one is rendered as 'nan'.
    """
    beta = _contact_label(row['BETA'])
    image_contacts = [_contact_label(part) for part in str(row['IMAGEN']).split('=')]

    # Prefer the contact that BETA and IMAGEN have in common
    if beta in image_contacts:
        return beta
    return image_contacts[0]  # Return first value as fallback

# Derive the per-electrode columns, then write the preprocessed table:
#   UIMAGEN    - contact returned by check_contact_in_image, stored as int
#   HEMISFERIO - 1 (right) when BETA is one of 8, 9, 10, 11; otherwise 0 (left)
#   COCANAL    - 1 when BETA equals UIMAGEN; otherwise 0
selected_contact = df_e.apply(check_contact_in_image, axis=1).astype(int)
beta_is_right = df_e['BETA'].isin([8, 9, 10, 11])

df_e['UIMAGEN'] = selected_contact
df_e['HEMISFERIO'] = np.where(beta_is_right, 1, 0)
df_e['COCANAL'] = np.where(df_e['BETA'] == selected_contact, 1, 0)

output_csv_path = '~/projects/6articulodbs/derivates/df_preprocessed.csv'
df_e.to_csv(output_csv_path, index=False)



# 1.1 Demographic statistics

In [None]:
# Age: summary statistics of the EDAD column
age = df['EDAD']
mean_age = age.mean()
std_age = age.std()
# Named range_age rather than `range` so the builtin range() is not shadowed
# for the rest of the kernel session
range_age = age.max() - age.min()
percentil_25_age = age.quantile(0.25)
percentil_75_age = age.quantile(0.75)

# Sex (women 1, man 2): count once; a code that never occurs counts as 0
sex_counts = df['SEXO'].value_counts()
fem_sex = sex_counts.get(1, 0)
masc_sex = sex_counts.get(2, 0)
total_sex = fem_sex + masc_sex
proportion_fem = (fem_sex / total_sex) * 100
proportion_masc = (masc_sex / total_sex) * 100

# Disease duration: summary statistics of the DD column
duration = df['DD']
mean_duration = duration.mean()
std_duration = duration.std()
range_duration = duration.max() - duration.min()
percentil_25_duration = duration.quantile(0.25)
percentil_75_duration = duration.quantile(0.75)


# print results
print('Age')
print('Mean:', mean_age)
print('Standard deviation:', std_age)
print('Range:', range_age)
print('Percentil 25:', percentil_25_age)
print('Percentil 75:', percentil_75_age)
print('Sex')
print('Female:', fem_sex)
print('Male:', masc_sex)
print('Proportion fem:', proportion_fem)
print('Proportion masc:', proportion_masc)
print('Disease duration')
print('Mean:', mean_duration)
print('Standard deviation:', std_duration)
print('Range:', range_duration)
print('Percentil 25:', percentil_25_duration)
print('Percentil 75:', percentil_75_duration)

Age
Mean: 61.672413793103445
Standard deviation: 8.236393204476931
Range: 33.0
Percentil 25: 56.25
Percentil 75: 68.0
Sex
Female: 23
Male: 36
Proportion fem: 38.983050847457626
Proportion masc: 61.016949152542374
Disease duration
Mean: 10.431034482758621
Standard deviation: 3.1628993414909865
Range: 13.0
Percentil 25: 8.0
Percentil 75: 12.75
