In [None]:
from sklearn.model_selection import train_test_split
from database import filter_hyposmia
import pandas as pd
from scipy.spatial import KDTree
import numpy as np

# Read the two cohorts: the large remote cohort (target = 0) and the
# smaller clinical cohort (target = 1).
data_large = pd.read_csv('data/processed/remote_processed.csv', low_memory=False)
data_small = pd.read_csv('data/processed/clinical_processed_imputed.csv', low_memory=False)

# Row counts before filtering
for cohort in (data_large, data_small):
    print(len(cohort))

# Apply the project's hyposmia filter (with the '2023' argument) to both cohorts
data_large, data_small = (
    filter_hyposmia(cohort, '2023') for cohort in (data_large, data_small)
)

# Row counts after filtering
for cohort in (data_large, data_small):
    print(len(cohort))

# Columns the matching step relies on
age_var, sex_var = 'AGE', 'BIRTHSEX'

# Both cohorts must expose the age and sex columns
for cohort in (data_large, data_small):
    assert {age_var, sex_var}.issubset(cohort.columns)

In [None]:
# TODO: pick one of the 10 nearest candidates instead of always the nearest.

# Age/sex matching: for every (age, sex) group of data_small, take up to
# len(group) not-yet-used rows of the same sex from data_large, nearest in age first.

# Index labels of data_large that were already handed out (no control is reused)
used_indices = set()

# One (candidates, KD-tree) pair per sex. The candidate pool of a sex never
# changes, so the tree is built once instead of once per age group.
trees_by_sex = {}

# Matched chunks are collected here and concatenated once after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
matched_chunks = []

# Number of requested controls that could not be supplied
shortfall = 0

for (age, sex), group in data_small.groupby([age_var, sex_var]):
    if sex not in trees_by_sex:
        pool = data_large[data_large[sex_var] == sex]
        pool_tree = KDTree(pool[[age_var]].values) if len(pool) > 0 else None
        trees_by_sex[sex] = (pool, pool_tree)
    candidates, tree = trees_by_sex[sex]

    if tree is None:
        # No row of this sex exists in the larger dataset
        print(f"No matching rows found for sex {sex}")
        shortfall += len(group)
        continue

    # Rank every candidate of this sex by age distance to the group's age
    _, order = tree.query([[age]], k=len(candidates))
    ranked = candidates.index[np.asarray(order).flatten()]

    # Skip controls already assigned to an earlier group, then keep at most
    # one control per case in the group
    available = [idx for idx in ranked if idx not in used_indices]
    chosen = available[:len(group)]
    shortfall += len(group) - len(chosen)

    used_indices.update(chosen)
    matched_chunks.append(data_large.loc[chosen])

# An empty slice of data_large keeps the column layout when nothing was matched
balanced_data = pd.concat(matched_chunks) if matched_chunks else data_large.iloc[0:0]
balanced_data = balanced_data.reset_index(drop=True)

print(f"Balanced dataset created with {len(balanced_data)*2} instances.")
if shortfall:
    # The message above assumes one control per case; make the gap visible
    print(f"Warning: {shortfall} cases have no matched control.")


In [None]:
# (rows, columns) of the matched controls, then of the clinical cases
for frame in (balanced_data, data_small):
    print(frame.shape)

In [None]:
# Compare matched controls (left) with clinical cases (right):
# mean then standard deviation, first for AGE and then for BIRTHSEX.
for column in ('AGE', 'BIRTHSEX'):
    for stat in ('mean', 'std'):
        controls_value = getattr(balanced_data[column], stat)()
        cases_value = getattr(data_small[column], stat)()
        print(controls_value, cases_value)


In [None]:
# Number of repeated PATNO values in each dataset (controls first, then cases)
for frame in (balanced_data, data_small):
    print(frame['PATNO'].duplicated().sum())

# Show the clinical rows whose PATNO already appeared earlier in the frame
repeated_patno = data_small['PATNO'].duplicated()
print(data_small[repeated_patno])

In [None]:

from tqdm import tqdm

# Sanity check: every matched control must still carry exactly the values stored
# for its PATNO in the remote file.
# low_memory=False matches how the file was read when balanced_data was built, so
# chunked dtype inference cannot make the two copies of a column differ.
remote = pd.read_csv('data/processed/remote_processed.csv', low_memory=False)

# First remote row per PATNO, indexed by PATNO. Built once, instead of
# re-filtering the whole frame up to three times for every compared cell.
# Rows without a PATNO are dropped so a missing PATNO is reported as unmatched.
remote_first = (
    remote.dropna(subset=['PATNO'])
    .drop_duplicates(subset='PATNO')
    .set_index('PATNO')
)
compared_columns = [col for col in balanced_data.columns if col != 'PATNO']

# For each PATNO in balanced_data: is it in remote, and are all other columns equal?
for i in tqdm(range(len(balanced_data))):
    patno = balanced_data['PATNO'].iloc[i]
    if patno not in remote_first.index:
        print(f"Row {i} has no matching PATNO {patno} in remote")
        continue
    for col in compared_columns:
        balanced_value = balanced_data[col].iloc[i]
        remote_value = remote_first.at[patno, col]
        # NaN != NaN, so two missing values are treated as equal explicitly
        if pd.isnull(balanced_value) and pd.isnull(remote_value):
            continue
        if balanced_value != remote_value:
            print(f"Row {i} has different value in column {col}")
            print(balanced_value, remote_value)

In [None]:
# Assemble the labelled dataset: matched remote controls (target = 0) plus
# clinical cases (target = 1), both restricted to HYPOSMIA == 1.
# Filtering followed by .assign() creates new frames, so no SettingWithCopyWarning
# is raised and balanced_data itself is left untouched.
data_large = balanced_data[balanced_data['HYPOSMIA'] == 1].assign(target=0)
# NOTE(review): the cases are re-read from disk and filtered on the HYPOSMIA column,
# while the matching step used filter_hyposmia(..., '2023') - confirm both select
# the same patients.
data_small = pd.read_csv('data/processed/clinical_processed_imputed.csv')
data_small = data_small[data_small['HYPOSMIA'] == 1].assign(target=1)

# Stack controls and cases under a fresh 0..n-1 index
combined_data = pd.concat([data_large, data_small], ignore_index=True)

# Define the columns for stratification
age_var = 'AGE'
sex_var = 'BIRTHSEX'
target_var = 'target'

# The combined frame must contain every stratification column
assert age_var in combined_data.columns and sex_var in combined_data.columns and target_var in combined_data.columns

# Bin age at its 20/40/60/80th percentiles; the outer edges are widened by 1 so
# that the minimum and maximum age fall strictly inside the first and last bin
percentiles = np.percentile(combined_data[age_var], [20, 40, 60, 80])
labels = ['0-20th', '21-40th', '41-60th', '61-80th', '81-100th']
combined_data['age_binned'] = pd.cut(combined_data[age_var], bins=[combined_data[age_var].min()-1, *percentiles, combined_data[age_var].max()+1], labels=labels, include_lowest=True)

# One stratum per (age bin, sex, target) combination
combined_data['stratify_key'] = combined_data.apply(lambda row: f"{row['age_binned']}_{row[sex_var]}_{row[target_var]}", axis=1)

# 80/20 split, stratified on the combined key
train, test = train_test_split(combined_data, test_size=0.2, stratify=combined_data['stratify_key'], random_state=42)

# The helper columns are only needed for the split itself
train = train.drop(columns=['stratify_key', 'age_binned'])
test = test.drop(columns=['stratify_key', 'age_binned'])

# mean, std and count of sex, age and target (train value first, test value second)
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Swap random patients between train and test until the target rates of the two
# splits differ by at most 0.01. The sampler is seeded so a fresh kernel run
# reproduces the same split, and the number of swaps is capped so the cell cannot
# loop forever.
# NOTE(review): patients are drawn without regard to the age/sex strata.
swap_rng = np.random.RandomState(42)
max_swaps = 10_000
swaps_done = 0
while abs(train['target'].mean() - test['target'].mean()) > 0.01:
    if swaps_done >= max_swaps:
        raise RuntimeError(f"Target still unbalanced after {max_swaps} swaps")
    # move one random patient from train to test ...
    moved_to_test = train.sample(random_state=swap_rng)
    test = pd.concat([test, moved_to_test])
    train = train.drop(moved_to_test.index)
    # ... and one random patient from test to train, keeping both sizes constant
    moved_to_train = test.sample(random_state=swap_rng)
    train = pd.concat([train, moved_to_train])
    test = test.drop(moved_to_train.index)
    swaps_done += 1
    # progress: target rate in train and test
    print(train['target'].mean(), test['target'].mean())
print ("Balanced target")

# Same summary again, after rebalancing
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Final shapes
print(train.shape)
print(test.shape)

In [None]:
# Repeated PATNO values inside each split (train first, then test)
for split in (train, test):
    print(split['PATNO'].duplicated().sum())

# Shape of each split
for split in (train, test):
    print(split.shape)

# Both splits are expected to have the same columns in the same order
train_columns, test_columns = train.columns, test.columns
same_layout = train_columns.equals(test_columns)
if same_layout:
    print("Columns are the same")

# List every column name, one per line
for name in train_columns:
    print(name)

In [None]:
# Summary of the TOTAL_CORRECT column (train value first, test value second)

# overall mean
print(train['TOTAL_CORRECT'].mean(), test['TOTAL_CORRECT'].mean())

# overall minimum, then overall maximum
for stat in ('min', 'max'):
    print(getattr(train['TOTAL_CORRECT'], stat)(), getattr(test['TOTAL_CORRECT'], stat)())

# mean per target class, for train and then for test
for split in (train, test):
    print(split.groupby('target')['TOTAL_CORRECT'].mean())

# minimum per HYPOSMIA value (train, test), then maximum per HYPOSMIA value (train, test)
for stat in ('min', 'max'):
    for split in (train, test):
        per_hyposmia = split.groupby('HYPOSMIA')['TOTAL_CORRECT']
        print(getattr(per_hyposmia, stat)())
 


In [None]:

# Write each split to its own CSV file, without the index column
output_files = {
    'data/processed/train_data_hyposmia2023.csv': train,
    'data/processed/test_data_hyposmia2023.csv': test,
}
for path, split in output_files.items():
    split.to_csv(path, index=False)

# Report how many rows went into each file
n_train, n_test = len(train), len(test)
print(f"Train set created with {n_train} instances.")
print(f"Test set created with {n_test} instances.")

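Clinical data without imputation: the same matching and split pipeline, run on `clinical_processed.csv` instead of the imputed file.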

In [None]:
# Read the two cohorts: the large remote cohort (target = 0) and the
# smaller, non-imputed clinical cohort (target = 1).
data_large = pd.read_csv('data/processed/remote_processed.csv', low_memory=False)
data_small = pd.read_csv('data/processed/clinical_processed.csv', low_memory=False)

# Row counts before filtering
for cohort in (data_large, data_small):
    print(len(cohort))

# Apply the project's hyposmia filter (with the '2023' argument) to both cohorts
data_large, data_small = (
    filter_hyposmia(cohort, '2023') for cohort in (data_large, data_small)
)

# Row counts after filtering
for cohort in (data_large, data_small):
    print(len(cohort))

# Columns the matching step relies on
age_var, sex_var = 'AGE', 'BIRTHSEX'

# Both cohorts must expose the age and sex columns
for cohort in (data_large, data_small):
    assert {age_var, sex_var}.issubset(cohort.columns)

In [None]:
# TODO: pick one of the 10 nearest candidates instead of always the nearest.

# Age/sex matching: for every (age, sex) group of data_small, take up to
# len(group) not-yet-used rows of the same sex from data_large, nearest in age first.

# Index labels of data_large that were already handed out (no control is reused)
used_indices = set()

# One (candidates, KD-tree) pair per sex. The candidate pool of a sex never
# changes, so the tree is built once instead of once per age group.
trees_by_sex = {}

# Matched chunks are collected here and concatenated once after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
matched_chunks = []

# Number of requested controls that could not be supplied
shortfall = 0

for (age, sex), group in data_small.groupby([age_var, sex_var]):
    if sex not in trees_by_sex:
        pool = data_large[data_large[sex_var] == sex]
        pool_tree = KDTree(pool[[age_var]].values) if len(pool) > 0 else None
        trees_by_sex[sex] = (pool, pool_tree)
    candidates, tree = trees_by_sex[sex]

    if tree is None:
        # No row of this sex exists in the larger dataset
        print(f"No matching rows found for sex {sex}")
        shortfall += len(group)
        continue

    # Rank every candidate of this sex by age distance to the group's age
    _, order = tree.query([[age]], k=len(candidates))
    ranked = candidates.index[np.asarray(order).flatten()]

    # Skip controls already assigned to an earlier group, then keep at most
    # one control per case in the group
    available = [idx for idx in ranked if idx not in used_indices]
    chosen = available[:len(group)]
    shortfall += len(group) - len(chosen)

    used_indices.update(chosen)
    matched_chunks.append(data_large.loc[chosen])

# An empty slice of data_large keeps the column layout when nothing was matched
balanced_data = pd.concat(matched_chunks) if matched_chunks else data_large.iloc[0:0]
balanced_data = balanced_data.reset_index(drop=True)

print(f"Balanced dataset created with {len(balanced_data)*2} instances.")
if shortfall:
    # The message above assumes one control per case; make the gap visible
    print(f"Warning: {shortfall} cases have no matched control.")


In [None]:
# (rows, columns) of the matched controls, then of the clinical cases
for frame in (balanced_data, data_small):
    print(frame.shape)

In [None]:
# Compare matched controls (left) with clinical cases (right):
# mean then standard deviation, first for AGE and then for BIRTHSEX.
for column in ('AGE', 'BIRTHSEX'):
    for stat in ('mean', 'std'):
        controls_value = getattr(balanced_data[column], stat)()
        cases_value = getattr(data_small[column], stat)()
        print(controls_value, cases_value)


In [None]:
# Number of repeated PATNO values in each dataset (controls first, then cases)
for frame in (balanced_data, data_small):
    print(frame['PATNO'].duplicated().sum())

# Show the clinical rows whose PATNO already appeared earlier in the frame
repeated_patno = data_small['PATNO'].duplicated()
print(data_small[repeated_patno])

In [None]:

from tqdm import tqdm

# Sanity check: every matched control must still carry exactly the values stored
# for its PATNO in the remote file.
# low_memory=False matches how the file was read when balanced_data was built, so
# chunked dtype inference cannot make the two copies of a column differ.
remote = pd.read_csv('data/processed/remote_processed.csv', low_memory=False)

# First remote row per PATNO, indexed by PATNO. Built once, instead of
# re-filtering the whole frame up to three times for every compared cell.
# Rows without a PATNO are dropped so a missing PATNO is reported as unmatched.
remote_first = (
    remote.dropna(subset=['PATNO'])
    .drop_duplicates(subset='PATNO')
    .set_index('PATNO')
)
compared_columns = [col for col in balanced_data.columns if col != 'PATNO']

# For each PATNO in balanced_data: is it in remote, and are all other columns equal?
for i in tqdm(range(len(balanced_data))):
    patno = balanced_data['PATNO'].iloc[i]
    if patno not in remote_first.index:
        print(f"Row {i} has no matching PATNO {patno} in remote")
        continue
    for col in compared_columns:
        balanced_value = balanced_data[col].iloc[i]
        remote_value = remote_first.at[patno, col]
        # NaN != NaN, so two missing values are treated as equal explicitly
        if pd.isnull(balanced_value) and pd.isnull(remote_value):
            continue
        if balanced_value != remote_value:
            print(f"Row {i} has different value in column {col}")
            print(balanced_value, remote_value)

In [None]:
# Assemble the labelled dataset: matched remote controls (target = 0) plus
# non-imputed clinical cases (target = 1), both restricted to HYPOSMIA == 1.
# Filtering followed by .assign() creates new frames, so no SettingWithCopyWarning
# is raised and balanced_data itself is left untouched.
data_large = balanced_data[balanced_data['HYPOSMIA'] == 1].assign(target=0)
# NOTE(review): the cases are re-read from disk and filtered on the HYPOSMIA column,
# while the matching step used filter_hyposmia(..., '2023') - confirm both select
# the same patients.
data_small = pd.read_csv('data/processed/clinical_processed.csv')
data_small = data_small[data_small['HYPOSMIA'] == 1].assign(target=1)

# Stack controls and cases under a fresh 0..n-1 index
combined_data = pd.concat([data_large, data_small], ignore_index=True)

# Define the columns for stratification
age_var = 'AGE'
sex_var = 'BIRTHSEX'
target_var = 'target'

# The combined frame must contain every stratification column
assert age_var in combined_data.columns and sex_var in combined_data.columns and target_var in combined_data.columns

# Bin age at its 20/40/60/80th percentiles; the outer edges are widened by 1 so
# that the minimum and maximum age fall strictly inside the first and last bin
percentiles = np.percentile(combined_data[age_var], [20, 40, 60, 80])
labels = ['0-20th', '21-40th', '41-60th', '61-80th', '81-100th']
combined_data['age_binned'] = pd.cut(combined_data[age_var], bins=[combined_data[age_var].min()-1, *percentiles, combined_data[age_var].max()+1], labels=labels, include_lowest=True)

# One stratum per (age bin, sex, target) combination
combined_data['stratify_key'] = combined_data.apply(lambda row: f"{row['age_binned']}_{row[sex_var]}_{row[target_var]}", axis=1)

# 80/20 split, stratified on the combined key
train, test = train_test_split(combined_data, test_size=0.2, stratify=combined_data['stratify_key'], random_state=42)

# The helper columns are only needed for the split itself
train = train.drop(columns=['stratify_key', 'age_binned'])
test = test.drop(columns=['stratify_key', 'age_binned'])

# mean, std and count of sex, age and target (train value first, test value second)
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Swap random patients between train and test until the target rates of the two
# splits differ by at most 0.01. The sampler is seeded so a fresh kernel run
# reproduces the same split, and the number of swaps is capped so the cell cannot
# loop forever.
# NOTE(review): patients are drawn without regard to the age/sex strata.
swap_rng = np.random.RandomState(42)
max_swaps = 10_000
swaps_done = 0
while abs(train['target'].mean() - test['target'].mean()) > 0.01:
    if swaps_done >= max_swaps:
        raise RuntimeError(f"Target still unbalanced after {max_swaps} swaps")
    # move one random patient from train to test ...
    moved_to_test = train.sample(random_state=swap_rng)
    test = pd.concat([test, moved_to_test])
    train = train.drop(moved_to_test.index)
    # ... and one random patient from test to train, keeping both sizes constant
    moved_to_train = test.sample(random_state=swap_rng)
    train = pd.concat([train, moved_to_train])
    test = test.drop(moved_to_train.index)
    swaps_done += 1
    # progress: target rate in train and test
    print(train['target'].mean(), test['target'].mean())
print ("Balanced target")

# Same summary again, after rebalancing
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Final shapes
print(train.shape)
print(test.shape)

In [None]:
# Repeated PATNO values inside each split (train first, then test)
for split in (train, test):
    print(split['PATNO'].duplicated().sum())

# Shape of each split
for split in (train, test):
    print(split.shape)

# Both splits are expected to have the same columns in the same order
train_columns, test_columns = train.columns, test.columns
same_layout = train_columns.equals(test_columns)
if same_layout:
    print("Columns are the same")

# List every column name, one per line
for name in train_columns:
    print(name)

In [None]:
# Summary of the TOTAL_CORRECT column (train value first, test value second)

# overall mean
print(train['TOTAL_CORRECT'].mean(), test['TOTAL_CORRECT'].mean())

# overall minimum, then overall maximum
for stat in ('min', 'max'):
    print(getattr(train['TOTAL_CORRECT'], stat)(), getattr(test['TOTAL_CORRECT'], stat)())

# mean per target class, for train and then for test
for split in (train, test):
    print(split.groupby('target')['TOTAL_CORRECT'].mean())

# minimum per HYPOSMIA value (train, test), then maximum per HYPOSMIA value (train, test)
for stat in ('min', 'max'):
    for split in (train, test):
        per_hyposmia = split.groupby('HYPOSMIA')['TOTAL_CORRECT']
        print(getattr(per_hyposmia, stat)())
 


In [None]:

# Write each split to its own CSV file, without the index column
output_files = {
    'data/processed/train_data_without_imput.csv': train,
    'data/processed/test_data_without_imput.csv': test,
}
for path, split in output_files.items():
    split.to_csv(path, index=False)

# Report how many rows went into each file
n_train, n_test = len(train), len(test)
print(f"Train set created with {n_train} instances.")
print(f"Test set created with {n_test} instances.")

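Not filtering for hyposmia: the same pipeline on the imputed clinical data, with the `filter_hyposmia` and `HYPOSMIA == 1` steps skipped.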

In [None]:
from sklearn.model_selection import train_test_split
from database import filter_hyposmia
import pandas as pd
from scipy.spatial import KDTree
import numpy as np

# Load the datasets
data_large = pd.read_csv('data/processed/remote_processed.csv', low_memory=False) #target = 0
data_small = pd.read_csv('data/processed/clinical_processed_imputed.csv', low_memory=False) #target = 1

print(len(data_large))
print(len(data_small))

# This variant keeps every patient: filter_hyposmia() is intentionally not applied.

# Keep one row per clinical patient *before* the matching step. Later cells drop
# duplicate PATNO rows from data_small; if that only happens after matching,
# controls are also selected for the rows that end up discarded and the two
# classes no longer have the same size.
rows_before_dedup = len(data_small)
data_small = data_small.drop_duplicates(subset='PATNO')
print(f"Dropped {rows_before_dedup - len(data_small)} duplicate PATNO rows from data_small")

age_var = 'AGE'
sex_var = 'BIRTHSEX'

# Ensure both datasets have age_var and sex_var columns
assert age_var in data_large.columns and sex_var in data_large.columns
assert age_var in data_small.columns and sex_var in data_small.columns

In [None]:
# TODO: pick one of the 10 nearest candidates instead of always the nearest.

# Age/sex matching: for every (age, sex) group of data_small, take up to
# len(group) not-yet-used rows of the same sex from data_large, nearest in age first.

# Index labels of data_large that were already handed out (no control is reused)
used_indices = set()

# One (candidates, KD-tree) pair per sex. The candidate pool of a sex never
# changes, so the tree is built once instead of once per age group.
trees_by_sex = {}

# Matched chunks are collected here and concatenated once after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
matched_chunks = []

# Number of requested controls that could not be supplied
shortfall = 0

for (age, sex), group in data_small.groupby([age_var, sex_var]):
    if sex not in trees_by_sex:
        pool = data_large[data_large[sex_var] == sex]
        pool_tree = KDTree(pool[[age_var]].values) if len(pool) > 0 else None
        trees_by_sex[sex] = (pool, pool_tree)
    candidates, tree = trees_by_sex[sex]

    if tree is None:
        # No row of this sex exists in the larger dataset
        print(f"No matching rows found for sex {sex}")
        shortfall += len(group)
        continue

    # Rank every candidate of this sex by age distance to the group's age
    _, order = tree.query([[age]], k=len(candidates))
    ranked = candidates.index[np.asarray(order).flatten()]

    # Skip controls already assigned to an earlier group, then keep at most
    # one control per case in the group
    available = [idx for idx in ranked if idx not in used_indices]
    chosen = available[:len(group)]
    shortfall += len(group) - len(chosen)

    used_indices.update(chosen)
    matched_chunks.append(data_large.loc[chosen])

# An empty slice of data_large keeps the column layout when nothing was matched
balanced_data = pd.concat(matched_chunks) if matched_chunks else data_large.iloc[0:0]
balanced_data = balanced_data.reset_index(drop=True)

print(f"Balanced dataset created with {len(balanced_data)*2} instances.")
if shortfall:
    # The message above assumes one control per case; make the gap visible
    print(f"Warning: {shortfall} cases have no matched control.")


In [None]:
# (rows, columns) of the matched controls, then of the clinical cases
for frame in (balanced_data, data_small):
    print(frame.shape)

In [None]:
# Compare matched controls (left) with clinical cases (right):
# mean then standard deviation, first for AGE and then for BIRTHSEX.
for column in ('AGE', 'BIRTHSEX'):
    for stat in ('mean', 'std'):
        controls_value = getattr(balanced_data[column], stat)()
        cases_value = getattr(data_small[column], stat)()
        print(controls_value, cases_value)


In [None]:
# Number of repeated PATNO values in each dataset (controls first, then cases)
for frame in (balanced_data, data_small):
    print(frame['PATNO'].duplicated().sum())

# Show the clinical rows whose PATNO already appeared earlier in the frame
repeated_patno = data_small['PATNO'].duplicated()
print(data_small[repeated_patno])

# Keep only the first row of every PATNO in the clinical data
data_small = data_small.drop_duplicates(subset='PATNO')

In [None]:

from tqdm import tqdm

# Sanity check: every matched control must still carry exactly the values stored
# for its PATNO in the remote file.
# low_memory=False matches how the file was read when balanced_data was built, so
# chunked dtype inference cannot make the two copies of a column differ.
remote = pd.read_csv('data/processed/remote_processed.csv', low_memory=False)

# First remote row per PATNO, indexed by PATNO. Built once, instead of
# re-filtering the whole frame up to three times for every compared cell.
# Rows without a PATNO are dropped so a missing PATNO is reported as unmatched.
remote_first = (
    remote.dropna(subset=['PATNO'])
    .drop_duplicates(subset='PATNO')
    .set_index('PATNO')
)
compared_columns = [col for col in balanced_data.columns if col != 'PATNO']

# For each PATNO in balanced_data: is it in remote, and are all other columns equal?
for i in tqdm(range(len(balanced_data))):
    patno = balanced_data['PATNO'].iloc[i]
    if patno not in remote_first.index:
        print(f"Row {i} has no matching PATNO {patno} in remote")
        continue
    for col in compared_columns:
        balanced_value = balanced_data[col].iloc[i]
        remote_value = remote_first.at[patno, col]
        # NaN != NaN, so two missing values are treated as equal explicitly
        if pd.isnull(balanced_value) and pd.isnull(remote_value):
            continue
        if balanced_value != remote_value:
            print(f"Row {i} has different value in column {col}")
            print(balanced_value, remote_value)

In [None]:
# Assemble the labelled dataset: matched remote controls (target = 0) plus
# clinical cases (target = 1). No HYPOSMIA filter is applied in this variant.
# .assign() returns a new frame; a plain `data_large = balanced_data` would only
# alias it, and adding the target column would then silently modify balanced_data.
data_large = balanced_data.assign(target=0)
data_small = pd.read_csv('data/processed/clinical_processed_imputed.csv')

# One row per clinical patient
data_small = data_small.drop_duplicates(subset='PATNO').assign(target=1)

# Stack controls and cases under a fresh 0..n-1 index
combined_data = pd.concat([data_large, data_small], ignore_index=True)

# Define the columns for stratification
age_var = 'AGE'
sex_var = 'BIRTHSEX'
target_var = 'target'

# The combined frame must contain every stratification column
assert age_var in combined_data.columns and sex_var in combined_data.columns and target_var in combined_data.columns

# Bin age at its 20/40/60/80th percentiles; the outer edges are widened by 1 so
# that the minimum and maximum age fall strictly inside the first and last bin
percentiles = np.percentile(combined_data[age_var], [20, 40, 60, 80])
labels = ['0-20th', '21-40th', '41-60th', '61-80th', '81-100th']
combined_data['age_binned'] = pd.cut(combined_data[age_var], bins=[combined_data[age_var].min()-1, *percentiles, combined_data[age_var].max()+1], labels=labels, include_lowest=True)

# One stratum per (age bin, sex, target) combination
combined_data['stratify_key'] = combined_data.apply(lambda row: f"{row['age_binned']}_{row[sex_var]}_{row[target_var]}", axis=1)

# 80/20 split, stratified on the combined key
train, test = train_test_split(combined_data, test_size=0.2, stratify=combined_data['stratify_key'], random_state=42)

# The helper columns are only needed for the split itself
train = train.drop(columns=['stratify_key', 'age_binned'])
test = test.drop(columns=['stratify_key', 'age_binned'])

# mean, std and count of sex, age and target (train value first, test value second)
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Swap random patients between train and test until the target rates of the two
# splits differ by at most 0.01. The sampler is seeded so a fresh kernel run
# reproduces the same split, and the number of swaps is capped so the cell cannot
# loop forever.
# NOTE(review): patients are drawn without regard to the age/sex strata.
swap_rng = np.random.RandomState(42)
max_swaps = 10_000
swaps_done = 0
while abs(train['target'].mean() - test['target'].mean()) > 0.01:
    if swaps_done >= max_swaps:
        raise RuntimeError(f"Target still unbalanced after {max_swaps} swaps")
    # move one random patient from train to test ...
    moved_to_test = train.sample(random_state=swap_rng)
    test = pd.concat([test, moved_to_test])
    train = train.drop(moved_to_test.index)
    # ... and one random patient from test to train, keeping both sizes constant
    moved_to_train = test.sample(random_state=swap_rng)
    train = pd.concat([train, moved_to_train])
    test = test.drop(moved_to_train.index)
    swaps_done += 1
    # progress: target rate in train and test
    print(train['target'].mean(), test['target'].mean())
print ("Balanced target")

# Same summary again, after rebalancing
for column in ('BIRTHSEX', 'AGE', 'target'):
    for stat in ('mean', 'std', 'count'):
        print(getattr(train[column], stat)(), getattr(test[column], stat)())

# Final shapes
print(train.shape)
print(test.shape)

In [None]:
# Repeated PATNO values inside each split (train first, then test)
for split in (train, test):
    print(split['PATNO'].duplicated().sum())

# Shape of each split
for split in (train, test):
    print(split.shape)

# Both splits are expected to have the same columns in the same order
train_columns, test_columns = train.columns, test.columns
same_layout = train_columns.equals(test_columns)
if same_layout:
    print("Columns are the same")

# List every column name, one per line
for name in train_columns:
    print(name)

In [None]:
# Summary of the TOTAL_CORRECT column (train value first, test value second)

# overall mean
print(train['TOTAL_CORRECT'].mean(), test['TOTAL_CORRECT'].mean())

# overall minimum, then overall maximum
for stat in ('min', 'max'):
    print(getattr(train['TOTAL_CORRECT'], stat)(), getattr(test['TOTAL_CORRECT'], stat)())

# mean per target class, for train and then for test
for split in (train, test):
    print(split.groupby('target')['TOTAL_CORRECT'].mean())

# minimum per HYPOSMIA value (train, test), then maximum per HYPOSMIA value (train, test)
for stat in ('min', 'max'):
    for split in (train, test):
        per_hyposmia = split.groupby('HYPOSMIA')['TOTAL_CORRECT']
        print(getattr(per_hyposmia, stat)())
 


In [None]:
# Write each split to its own CSV file, without the index column
output_files = {
    'data/processed/train_data_all.csv': train,
    'data/processed/test_data_all.csv': test,
}
for path, split in output_files.items():
    split.to_csv(path, index=False)

# Report how many rows went into each file
n_train, n_test = len(train), len(test)
print(f"Train set created with {n_train} instances.")
print(f"Test set created with {n_test} instances.")