## 1- Imports and reading data


In [43]:
import sys

sys.path.append('../../../scripts/')
from utilities.helper_functions import read_files
from data_preprocessing.data_cleaning import *

In [44]:
# Directory holding the raw CSV exports, relative to this notebook.
base_path = '../../../data/raw_data/'

# read_files returns one object per file name, in the order the names are given.
(
    df_demographic,
    df_diet,
    df_examination,
    df_labs,
    df_questionnaire,
) = read_files(
    'demographic.csv',
    'diet.csv',
    'examination.csv',
    'labs.csv',
    'questionnaire.csv',
    base_path=base_path,
)

## 2- Handling missing values

**2.1**  Deleting entire columns that have many missing values
**2.2**  Imputing the Missing Value


<span style="color:orange">2.1- Deleting the entire column </span>
>when:
 >>1 - the missing values are of type Missing At Random (MAR) or Missing Completely At Random (MCAR)
 >>2 - the column has many missing values

In [45]:
# Cut-off (in percent) handed to extract_columns_by_threshold; presumably the
# share of missing values above which a column is discarded — TODO confirm
# the exact comparison in data_preprocessing.data_cleaning.
threshold_percentage = 50

df_demographic_columns_to_drop = extract_columns_by_threshold(
    df_demographic,
    threshold_percentage,
)
df_demographic = df_demographic.drop(
    labels=df_demographic_columns_to_drop,
    axis="columns",
)
# Remaining (rows, columns) after the drop.
df_demographic.shape

(10175, 41)

In [46]:
# Apply the same threshold-based column selection to the diet table.
df_diet_columns_to_drop = extract_columns_by_threshold(
    df_diet,
    threshold_percentage,
)
df_diet = df_diet.drop(
    labels=df_diet_columns_to_drop,
    axis="columns",
)
# Remaining (rows, columns) after the drop.
df_diet.shape

(9813, 112)

In [47]:
# Apply the same threshold-based column selection to the examination table.
df_examination_columns_to_drop = extract_columns_by_threshold(
    df_examination,
    threshold_percentage,
)
df_examination = df_examination.drop(
    labels=df_examination_columns_to_drop,
    axis="columns",
)
# Remaining (rows, columns) after the drop.
df_examination.shape

(9813, 113)

In [48]:
# Apply the same threshold-based column selection to the labs table.
df_labs_columns_to_drop = extract_columns_by_threshold(
    df_labs,
    threshold_percentage,
)
df_labs = df_labs.drop(
    labels=df_labs_columns_to_drop,
    axis="columns",
)
# Remaining (rows, columns) after the drop.
df_labs.shape

(9813, 152)

In [49]:
# Apply the same threshold-based column selection to the questionnaire table.
df_questionnaire_columns_to_drop = extract_columns_by_threshold(
    df_questionnaire,
    threshold_percentage,
)
df_questionnaire = df_questionnaire.drop(
    labels=df_questionnaire_columns_to_drop,
    axis="columns",
)
# Remaining (rows, columns) after the drop.
df_questionnaire.shape

(10175, 3)

<span style="color:orange">2.2- Imputing the Missing Value </span>
>when:
 >>1 - the missing values are of type Missing Not At Random (MNAR); in this case, cells whose values are made up only of 7s or only of 9s (e.g. 7, 77, 999) express this type
 >>2 - the cells were not removed by the threshold applied in the previous step

In [50]:
import pandas as pd
import re
# Small hand-made frame used to demonstrate detection of values that consist
# solely of 7s or solely of 9s.
data = dict(
    Patient_ID=[1, 2, 3, 4, 5],
    Compound_7=[7, 77, 777, 8, 9],
    Compound_9=[9, 99, 999, 888, 777],
    Usability=[
        'acceptable',
        'not_acceptable',
        'acceptable',
        'not_acceptable',
        'acceptable',
    ],
)
df = pd.DataFrame(data)

# Define a function to check if a value is a sequence of repeating 7s or 9s
def is_repeating_sequence(value):
    """Return True when ``value`` is written using only 7s or only 9s.

    Examples that match: 7, 77, 777, 9, 99, 999 (also as strings, and as
    integral floats such as 77.0). Mixed values such as 79 do not match.

    Parameters
    ----------
    value : any
        A single cell value. It is converted with ``str()`` before matching.

    Returns
    -------
    bool
        True if the textual form is one or more 7s, or one or more 9s.
    """
    # pandas stores numeric columns that contain NaN as floats, so a code
    # such as 77 arrives as 77.0 and its string form "77.0" would never
    # match. Collapse integral floats to int first. NaN and +/-inf are not
    # integral, so they fall through unchanged and yield False.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let "77\n" through.
    return bool(re.fullmatch(r'7+|9+', str(value)))

# Apply the function to each cell in 'Compound_7' and 'Compound_9'.
# is_repeating_sequence accepts runs of 7s as well as runs of 9s, so each
# filter keeps rows holding either kind of value, whatever the column name.
compound_7_cells = df[df['Compound_7'].apply(is_repeating_sequence)]
compound_9_cells = df[df['Compound_9'].apply(is_repeating_sequence)]

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Display rows whose 'Compound_7' value is a repeating sequence of 7s or 9s
# (the label names both digits because the predicate matches both).
print("\nCells with a repeating sequence of 7s or 9s in 'Compound_7':")
print(compound_7_cells)

# Display rows whose 'Compound_9' value is a repeating sequence of 7s or 9s
print("\nCells with a repeating sequence of 7s or 9s in 'Compound_9':")
print(compound_9_cells)

Original DataFrame:
   Patient_ID  Compound_7  Compound_9       Usability
0           1           7           9      acceptable
1           2          77          99  not_acceptable
2           3         777         999      acceptable
3           4           8         888  not_acceptable
4           5           9         777      acceptable

Cells with a repeating sequence of 7s in 'Compound_7':
   Patient_ID  Compound_7  Compound_9       Usability
0           1           7           9      acceptable
1           2          77          99  not_acceptable
2           3         777         999      acceptable
4           5           9         777      acceptable

Cells with a repeating sequence of 9s in 'Compound_9':
   Patient_ID  Compound_7  Compound_9       Usability
0           1           7           9      acceptable
1           2          77          99  not_acceptable
2           3         777         999      acceptable
4           5           9         777      acceptable
