### Main dataset creation for patient similarity trajectory plot 

In [None]:
"""
Script to create main dataset for patient similarity trajectory plot.
The dataset is created by merging the original dataset with the computed cluster labels after clustering.
And further this will be used to create 12 sub dataset based on age category, gender and presence and absence of ADHD in primary diagnosis.
"""

import pandas as pd

# Input and output locations for this step.
CLUSTER_LABELS_CSV = "/mnt/work/workbench/dipendrp/Paper2/Github/ClusteredDataset/Cluster3Label_Full_ICD10_ATC_Dummies_ICD10_ATC_20.csv"
ORIGINAL_DATA_CSV = "/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv"
TRAJECTORY_CSV = "/mnt/work/workbench/dipendrp/Paper3/Data/Trajectory_Full_ICD10_ATC_Cluster3Label.csv"

# Columns carried over from the original dataset.
ORIGINAL_COLUMNS = [
    "pasient",
    "episode_id",
    "episode_start_date",
    "episode_end_date",
    "gender",
    "age",
    "age_group",
    "diagnosis",
    "actual_med_Full_ATC",
    "Length_of_Episode",
    "Count_visit",
    "Therapy_ratio",
    "tillnextepisode",
]
# Columns taken from the clustering output; `episode_id` is the join key.
CLUSTER_COLUMNS = ["episode_id", "cluster", "cluster_distances"]

# Load both inputs and show which columns each one provides.
cluster_df = pd.read_csv(CLUSTER_LABELS_CSV)
original_df = pd.read_csv(ORIGINAL_DATA_CSV)
for loaded_frame in (cluster_df, original_df):
    print(loaded_frame.columns)

# Inner join on `episode_id`: only episodes present in both inputs are kept.
trajectory_df = pd.merge(
    original_df.loc[:, ORIGINAL_COLUMNS],
    cluster_df.loc[:, CLUSTER_COLUMNS],
    how="inner",
    on="episode_id",
)

# Inspect the merged result.
display(trajectory_df.head(5))
print(trajectory_df.shape)
print(trajectory_df.columns)

# Write the merged dataset without the index column.
trajectory_df.to_csv(TRAJECTORY_CSV, index=False)

#### Using the Trajectory_Full_ICD10_ATC_Cluster3Label.csv created above, encode the `ADHD_NoADHD` column: 1 if an `F90` diagnosis is present in the patient's first episode, 0 if it is absent.


In [None]:
"""
Create a new column `ADHD_NoADHD`, if `F90` diagnosis is present in first episode of contact then encode all episodes of that patient `ADHD_NoADHD` column as 1. 
But if no `F90` or first diagnosis has `NaN` then encode it as 0.
After this dataset and all the required columns are created, we will create 12 sub datasets based on .....
"""

import pandas as pd
import ast

trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/Trajectory_Full_ICD10_ATC_Cluster3Label.csv"
)

# Overview of the input: distinct patients, frame shape and column names.
print(trajectory_df["pasient"].nunique())
print(trajectory_df.shape)
print(trajectory_df.columns)

# Parse the start dates so the sort below is chronological rather than lexical.
trajectory_df["episode_start_date"] = pd.to_datetime(
    trajectory_df["episode_start_date"]
)

# Order episodes chronologically. The stable sort keeps the file order for
# episodes that share a start date, so which episode counts as a patient's
# "first" does not depend on the sort implementation.
trajectory_df = trajectory_df.sort_values(by="episode_start_date", kind="stable")


def _has_f90(raw_diagnosis):
    """Tell whether a stringified diagnosis list holds a code starting with "F90".

    `raw_diagnosis` is the raw cell value of the `diagnosis` column: either a
    string that `ast.literal_eval` turns into a list of codes, or a missing
    value (NaN), which counts as "no F90".
    """
    if pd.isna(raw_diagnosis):
        return False
    return any(code.startswith("F90") for code in ast.literal_eval(raw_diagnosis))


# One row per patient: the chronologically first episode.
first_episodes = trajectory_df.drop_duplicates(subset="pasient", keep="first")

# patient id -> 1 if the first episode carries an F90 diagnosis, else 0.
first_diagnosis = (
    first_episodes.set_index("pasient")["diagnosis"]
    .map(_has_f90)
    .astype(int)
    .to_dict()
)

# Give every episode of a patient that patient's first-episode flag.
# astype(int) stores the flag as integer 0/1 and raises if any episode ended
# up without a flag instead of silently writing NaN.
trajectory_df["ADHD_NoADHD"] = (
    trajectory_df["pasient"].map(first_diagnosis).astype(int)
)

print(trajectory_df.shape)
print(trajectory_df.columns)
# set_options to display all columns and rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# Visual check of the encoding: show only patients with more than one episode,
# where all rows of a patient must carry the same ADHD_NoADHD value.
verify_ADHD_NO_ADHD_Encoding = trajectory_df[
    trajectory_df["pasient"].duplicated(keep=False)
]
display(
    verify_ADHD_NO_ADHD_Encoding[
        [
            "pasient",
            "episode_id",
            "episode_start_date",
            "episode_end_date",
            "diagnosis",
            "ADHD_NoADHD",
        ]
    ].head(105)
)

# Save the dataset with ADHD_NoADHD column
trajectory_df.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv",
    index=False,
)

### The 12 possible combinations of AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"] || Gender = ["M", "F"] || ADHD_NoADHD = [0, 1] are:

1. Preschooler, Male, ADHD: (Preschooler, M, 1)

2. Preschooler, Male, No ADHD: (Preschooler, M, 0)

3. Preschooler, Female, ADHD: (Preschooler, F, 1)

4. Preschooler, Female, No ADHD: (Preschooler, F, 0)

5. MiddleChildhood, Male, ADHD: (MiddleChildhood, M, 1)

6. MiddleChildhood, Male, No ADHD: (MiddleChildhood, M, 0)

7. MiddleChildhood, Female, ADHD: (MiddleChildhood, F, 1)

8. MiddleChildhood, Female, No ADHD: (MiddleChildhood, F, 0)

9. Teenager, Male, ADHD: (Teenager, M, 1)

10. Teenager, Male, No ADHD: (Teenager, M, 0)

11. Teenager, Female, ADHD: (Teenager, F, 1)

12. Teenager, Female, No ADHD: (Teenager, F, 0)

#### 1. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = Preschooler and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

display(trajectory_df.head(15))

# Number of rows flagged ADHD_NoADHD == 1 (this counts rows, not distinct patients).
count_df = trajectory_df.loc[trajectory_df["ADHD_NoADHD"].eq(1)]
print(count_df.shape[0])

print(trajectory_df.columns)

# Subgroup: gender "M", age group "Preschooler", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
preSchooler_M_ADHD = trajectory_df.loc[subgroup_mask]

print(preSchooler_M_ADHD.shape)
display(preSchooler_M_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
preSchooler_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_M_ADHD.csv",
    index=False,
)

#### 2. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = Preschooler and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "M", age group "Preschooler", ADHD_NoADHD equal to 0.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(0)
)
preSchooler_M_NoADHD = trajectory_df.loc[subgroup_mask]

print(preSchooler_M_NoADHD.shape)
display(preSchooler_M_NoADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
preSchooler_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_M_NoADHD.csv",
    index=False,
)

AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"]
Gender = ["M", "F", "0"]
ADHD_NoADHD = [0, 1]

#### 3. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = Preschooler and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "F", age group "Preschooler", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
preSchooler_F_ADHD = trajectory_df.loc[subgroup_mask]

print(preSchooler_F_ADHD.shape)
display(preSchooler_F_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
preSchooler_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_F_ADHD.csv",
    index=False,
)

#### 4. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = Preschooler and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subgroup: gender "F", age group "Preschooler", ADHD_NoADHD equal to 0.
preSchooler_F_NoADHD = trajectory_df[
    (trajectory_df["gender"] == "F")
    & (trajectory_df["age_group"] == "Preschooler")
    & (trajectory_df["ADHD_NoADHD"] == 0)
]
print(preSchooler_F_NoADHD.shape)
display(preSchooler_F_NoADHD.head(5))
# Save the NoADHD subgroup selected in this cell. The frame written here must
# be preSchooler_F_NoADHD; writing preSchooler_F_ADHD would put the ADHD
# subgroup into the NoADHD file.
preSchooler_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_F_NoADHD.csv",
    index=False,
)

#### 5. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = MiddleChildhood and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "M", age group "MiddleChildhood", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
MiddleChildhood_M_ADHD = trajectory_df.loc[subgroup_mask]

print(MiddleChildhood_M_ADHD.shape)
display(MiddleChildhood_M_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
MiddleChildhood_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_M_ADHD.csv",
    index=False,
)

#### 6. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = MiddleChildhood and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "M", age group "MiddleChildhood", ADHD_NoADHD equal to 0.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(0)
)
MiddleChildhood_M_NoADHD = trajectory_df.loc[subgroup_mask]

print(MiddleChildhood_M_NoADHD.shape)
display(MiddleChildhood_M_NoADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
MiddleChildhood_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_M_NoADHD.csv",
    index=False,
)

#### 7. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = MiddleChildhood and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "F", age group "MiddleChildhood", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
MiddleChildhood_F_ADHD = trajectory_df.loc[subgroup_mask]

print(MiddleChildhood_F_ADHD.shape)
display(MiddleChildhood_F_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
MiddleChildhood_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_F_ADHD.csv",
    index=False,
)

#### 8. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = MiddleChildhood and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "F", age group "MiddleChildhood", ADHD_NoADHD equal to 0.
subgroup_mask = (
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(0)
)
MiddleChildhood_F_NoADHD = trajectory_df.loc[subgroup_mask]

print(MiddleChildhood_F_NoADHD.shape)
display(MiddleChildhood_F_NoADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
MiddleChildhood_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_F_NoADHD.csv",
    index=False,
)

#### 9. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = Teenager and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "M", age group "Teenager", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
Teenager_M_ADHD = trajectory_df.loc[subgroup_mask]

print(Teenager_M_ADHD.shape)
display(Teenager_M_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
Teenager_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_M_ADHD.csv",
    index=False,
)

#### 10. Create a dataset containing only the rows that satisfy: Gender = M, AgeGroup = Teenager and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "M", age group "Teenager", ADHD_NoADHD equal to 0.
subgroup_mask = (
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(0)
)
Teenager_M_NoADHD = trajectory_df.loc[subgroup_mask]

print(Teenager_M_NoADHD.shape)
display(Teenager_M_NoADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
Teenager_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_M_NoADHD.csv",
    index=False,
)

#### 11. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = Teenager and ADHD_NoADHD = 1

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "F", age group "Teenager", ADHD_NoADHD equal to 1.
subgroup_mask = (
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(1)
)
Teenager_F_ADHD = trajectory_df.loc[subgroup_mask]

print(Teenager_F_ADHD.shape)
display(Teenager_F_ADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
Teenager_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_F_ADHD.csv",
    index=False,
)

#### 12. Create a dataset containing only the rows that satisfy: Gender = F, AgeGroup = Teenager and ADHD_NoADHD = 0

In [None]:
import pandas as pd

# Reload the main trajectory dataset from disk.
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(trajectory_df.columns)

# Subgroup: gender "F", age group "Teenager", ADHD_NoADHD equal to 0.
subgroup_mask = (
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(0)
)
Teenager_F_NoADHD = trajectory_df.loc[subgroup_mask]

print(Teenager_F_NoADHD.shape)
display(Teenager_F_NoADHD.head(5))

# Write the subgroup to its own CSV, without the index column.
Teenager_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_F_NoADHD.csv",
    index=False,
)

***Total sum = 256 + 1399 + 86 + 976 + 1791 + 3661 + 593 + 2214 + 1183 + 3608 + 724 + 6138 = 22629 (As Gender 0 is excluded)***