### Main dataset creation for Patient Similarity Trajectory Plot 

In [None]:
"""
Script to create main dataset for patient similarity trajectory plot.
The dataset is created by merging the original dataset with the computed cluster labels after clustering.
And further this will be used to create 12 sub dataset based on age group, gender and presence/absence of ADHD in patient's diagnostic history
"""
from dotenv import load_dotenv
import os
import pandas as pd

import ast

# Pull the file locations used below from the .env file into the environment
load_dotenv()

# Episode-level cluster assignments produced by the clustering step
cluster_df = pd.read_csv(os.getenv('cluster_df_path'))
# The original dataset the clustering was computed from
original_df = pd.read_csv(os.getenv('orginal_df_path'))

# Keep only the columns needed for the trajectory plot and attach the
# cluster label of each episode; the inner join keeps only episode_ids
# that are present in both frames
trajectory_df = pd.merge(
    original_df.loc[
        :,
        [
            "pasient",
            "episode_id",
            "episode_start_date",
            "episode_end_date",
            "gender",
            "age_group",
            "diagnosis",
            "actual_med_Full_ATC",
            "Length_of_Episode",
            "Count_visit",
            "Therapy_ratio",
            "tillnextepisode",
        ],
    ],
    cluster_df.loc[:, ["episode_id", "cluster"]],
    how="inner",
    on="episode_id",
)

# Dataset that supplies the 'fdt' column used below to compute the patient's age
age_dataset = pd.read_csv(os.getenv('age_dataset_path'))

# Attach 'fdt' to every episode; the inner join drops episodes that have no
# matching episode_id in the age dataset
trajectory_df = pd.merge(
    trajectory_df,
    age_dataset.loc[:, ["episode_id", "fdt"]],
    how="inner",
    on="episode_id",
)


# Parse both date columns so they can be subtracted from each other
trajectory_df = trajectory_df.assign(
    episode_start_date=pd.to_datetime(trajectory_df["episode_start_date"]),
    fdt=pd.to_datetime(trajectory_df["fdt"]),
)

# Age at episode start: whole days between 'fdt' and the episode start,
# converted to years (365.2425 days per year) and rounded to 2 decimals
trajectory_df["age"] = (
    trajectory_df["episode_start_date"]
    .sub(trajectory_df["fdt"])
    .dt.days.div(365.2425)
    .round(2)
)

# 'fdt' was only needed to derive the age, so it is removed again
trajectory_df = trajectory_df.drop(columns="fdt")


# Quick visual check of the merged result: first 20 rows, shape and columns
display(trajectory_df.head(20))
print(trajectory_df.shape)
print(trajectory_df.columns)

# Persist the main dataset (without the index) for the following cells
trajectory_df.to_csv(os.getenv('trajectory_df_path'), index=False)

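#### Create the main dataset with an additional column `ADHD`:
#### If the diagnosis of any of the patient's episodes contains F90, encode it as 1.
#### If the diagnosis is NaN in all of the patient's episodes, encode it as 2.
#### If no episode contains F90 and the diagnoses are not all NaN, encode it as 0.

Note: the `encode_adhd` function in the cell below only returns 1 or 0, so patients whose diagnoses are all NaN are encoded as 0 together with the other patients without F90.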

In [None]:

# Reload the main dataset that the previous cell wrote to disk
trajectory_df = pd.read_csv(os.getenv('trajectory_df_path'))


# Function to determine the encoding for ADHD
def encode_adhd(diagnoses):
    """Return the ADHD code for a collection of diagnosis entries.

    Args:
        diagnoses: Iterable of diagnosis values. Each entry is converted
            with ``str`` before it is inspected, so non-string entries
            (e.g. NaN) are accepted.

    Returns:
        1 if the text of at least one entry contains ``"F90"``,
        otherwise 0 (this includes an empty iterable).
    """
    for entry in diagnoses:
        # Substring match on the text form of the entry
        if "F90" in str(entry):
            return 1
    # No entry mentioned F90
    return 0

# Evaluate encode_adhd once per 'pasient' over that patient's 'diagnosis'
# values; transform repeats the resulting code on every row of the patient
trajectory_df = trajectory_df.assign(
    ADHD=trajectory_df.groupby("pasient")["diagnosis"].transform(encode_adhd)
)

# Show the dataframe with the new ADHD column
display(trajectory_df)

# Number of distinct patients overall and per ADHD code
print(f"Total Patients: {trajectory_df['pasient'].nunique()}")
print(
    f"Patients with ADHD: {trajectory_df[trajectory_df['ADHD'] == 1]['pasient'].nunique()}"
)
print(
    f"Patients without ADHD or NaN: {trajectory_df[trajectory_df['ADHD'] == 0]['pasient'].nunique()}"
)

# Persist the dataset including the ADHD column
trajectory_df.to_csv(os.getenv('updated_trajectory_df_path'), index=False)

# Read the file back and print summary figures to verify what was written
print("\n")
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))
print(f"On new Main Dataset, Total Episodes: {trajectory_df['episode_id'].nunique()}")

print(
    f"On new Main Dataset Total Number of Episodes: {trajectory_df['episode_id'].nunique()}"
)
print(
    f"On new Main Dataset Total Number of Pasients: {trajectory_df['pasient'].nunique()}"
)

# Row count per gender value
print(
    f"Gender Distribution (As gender 0 is excluded in the study):{trajectory_df['gender'].value_counts()}"
)

### The 12 possible combinations of AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"] || Gender = ["M", "F"] || ADHD = [0, 1] are:

1. Preschooler, Male, ADHD: (Preschooler, M, 1)

2. Preschooler, Male, No ADHD: (Preschooler, M, 0)

3. Preschooler, Female, ADHD: (Preschooler, F, 1)

4. Preschooler, Female, No ADHD: (Preschooler, F, 0)

5. MiddleChildhood, Male, ADHD: (MiddleChildhood, M, 1)

6. MiddleChildhood, Male, No ADHD: (MiddleChildhood, M, 0)

7. MiddleChildhood, Female, ADHD: (MiddleChildhood, F, 1)

8. MiddleChildhood, Female, No ADHD: (MiddleChildhood, F, 0)

9. Teenager, Male, ADHD: (Teenager, M, 1)

10. Teenager, Male, No ADHD: (Teenager, M, 0)

11. Teenager, Female, ADHD: (Teenager, F, 1)

12. Teenager, Female, No ADHD: (Teenager, F, 0)

In [None]:
# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))
# Number of rows per ADHD code (shown as the cell output)
trajectory_df.loc[:, ['ADHD']].value_counts()

# Output noted down by the author:
# 0       10436
# 2        6495
# 1        5745
# NOTE(review): the code 2 above cannot be produced by encode_adhd as it is
# written in this notebook (it returns only 0 or 1) - presumably recorded
# from an earlier version; re-run to refresh these numbers.

#### 1. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = PreSchooler and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

# Number of missing values in 'diagnosis', followed by a preview of the data
print(trajectory_df.loc[:, ["diagnosis"]].isna().sum())
display(trajectory_df.head(15))

# Number of rows (not distinct patients) whose ADHD code is 1
count_df = trajectory_df.loc[trajectory_df["ADHD"].eq(1)]
print(len(count_df))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group Preschooler, ADHD code 1
preSchooler_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD"].eq(1)
]
print(preSchooler_M_ADHD.shape)
display(preSchooler_M_ADHD.head(5))

# Write the sub-dataset to disk without the index
preSchooler_M_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_PreSchooler_M_ADHD.csv",
    index=False,
)

#### 2. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = PreSchooler and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group Preschooler, ADHD code 0
preSchooler_M_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD"].eq(0)
]
print(preSchooler_M_NoADHD.shape)
display(preSchooler_M_NoADHD.head(5))

# Write the sub-dataset to disk without the index
preSchooler_M_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_PreSchooler_M_NoADHD.csv",
    index=False,
)

# Reference lists of the values the three grouping columns can take.
# "MiddleChildhood" is the age_group label that the sub-dataset filters in
# this notebook compare against (the list previously said "MiddleSchool",
# which matches none of those filters).
AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"]
# "0" is listed here, but the notebook's summary output states that gender 0
# is excluded in the study and no sub-dataset is created for it.
Gender = ["M", "F", "0"]
# ADHD codes as returned by encode_adhd
ADHD = [0, 1]

#### 3. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = PreSchooler and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group Preschooler, ADHD code 1
preSchooler_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD"].eq(1)
]
print(preSchooler_F_ADHD.shape)
display(preSchooler_F_ADHD.head(5))

# Write the sub-dataset to disk without the index
preSchooler_F_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_PreSchooler_F_ADHD.csv",
    index=False,
)

#### 4. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = PreSchooler and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group Preschooler, ADHD code 0
preSchooler_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD"].eq(0)
]
print(preSchooler_F_NoADHD.shape)
display(preSchooler_F_NoADHD.head(5))

# Write the sub-dataset to disk without the index
preSchooler_F_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_PreSchooler_F_NoADHD.csv",
    index=False,
)

#### 5. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = MiddleChildhood and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group MiddleChildhood, ADHD code 1
MiddleChildhood_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD"].eq(1)
]
print(MiddleChildhood_M_ADHD.shape)
display(MiddleChildhood_M_ADHD.head(5))

# Write the sub-dataset to disk without the index
MiddleChildhood_M_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_MiddleChildhood_M_ADHD.csv",
    index=False,
)

#### 6. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = MiddleChildhood and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group MiddleChildhood, ADHD code 0
MiddleChildhood_M_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD"].eq(0)
]
print(MiddleChildhood_M_NoADHD.shape)
display(MiddleChildhood_M_NoADHD.head(5))

# Write the sub-dataset to disk without the index
MiddleChildhood_M_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_MiddleChildhood_M_NoADHD.csv",
    index=False,
)

#### 7. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = MiddleChildhood and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group MiddleChildhood, ADHD code 1
MiddleChildhood_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD"].eq(1)
]
print(MiddleChildhood_F_ADHD.shape)
display(MiddleChildhood_F_ADHD.head(5))

# Write the sub-dataset to disk without the index
MiddleChildhood_F_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_MiddleChildhood_F_ADHD.csv",
    index=False,
)

#### 8. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = MiddleChildhood and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group MiddleChildhood, ADHD code 0
MiddleChildhood_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD"].eq(0)
]
print(MiddleChildhood_F_NoADHD.shape)
display(MiddleChildhood_F_NoADHD.head(5))

# Write the sub-dataset to disk without the index
MiddleChildhood_F_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_MiddleChildhood_F_NoADHD.csv",
    index=False,
)

#### 9. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = Teenager and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group Teenager, ADHD code 1
Teenager_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD"].eq(1)
]
print(Teenager_M_ADHD.shape)
display(Teenager_M_ADHD.head(5))

# Write the sub-dataset to disk without the index
Teenager_M_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_Teenager_M_ADHD.csv",
    index=False,
)

#### 10. Create a Dataset containing only those rows which satisfy this condition Gender = M, AgeGroup = Teenager and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender M, age group Teenager, ADHD code 0
Teenager_M_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD"].eq(0)
]
print(Teenager_M_NoADHD.shape)
display(Teenager_M_NoADHD.head(5))

# Write the sub-dataset to disk without the index
Teenager_M_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_Teenager_M_NoADHD.csv",
    index=False,
)

#### 11. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = Teenager and ADHD = 1

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group Teenager, ADHD code 1
Teenager_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD"].eq(1)
]
print(Teenager_F_ADHD.shape)
display(Teenager_F_ADHD.head(5))

# Write the sub-dataset to disk without the index
Teenager_F_ADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_Teenager_F_ADHD.csv",
    index=False,
)

#### 12. Create a Dataset containing only those rows which satisfy this condition Gender = F, AgeGroup = Teenager and ADHD = 0

In [None]:
import pandas as pd

# Load the dataset that carries the ADHD column
trajectory_df = pd.read_csv(os.getenv('updated_trajectory_df_path'))

print(trajectory_df.columns)

# Sub-dataset: gender F, age group Teenager, ADHD code 0
Teenager_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD"].eq(0)
]
print(Teenager_F_NoADHD.shape)
display(Teenager_F_NoADHD.head(5))

# Write the sub-dataset to disk without the index
Teenager_F_NoADHD.to_csv(
    "/home/kabank/work/workbench/dipendrp/NetworkGraph/outputs/SubDataset_Teenager_F_NoADHD.csv",
    index=False,
)

* Total Number of records in dataframes of all combinations = 386+1269+120+942+2158+3294+745+2062+1409+3382+927+5935 = 22629 
* Total Number of records in the initial dataframe = 22676 - 47(Gender 0 which is excluded) = 22629