### Main dataset creation for patient similarity trajectory plot 

In [1]:
"""
Build the main dataset for the patient similarity trajectory plot.

The original dataset is joined with the cluster labels computed by the
clustering step and with the patient age information.
The result is later split into 12 sub-datasets based on age category, gender
and presence/absence of ADHD in the diagnosis.
"""

import pandas as pd

# Load the dataset that carries the cluster labels
cluster_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper2/Github/ClusteredDataset/Cluster3Label_Full_ICD10_ATC_Dummies_ICD10_ATC_20.csv"
)
# Load the original dataset
original_df = pd.read_csv("/mnt/work/workbench/dipendrp/new-data/Full_ICD10_ATC.csv")

# Keep the columns of interest from the original dataset and attach the
# cluster columns. The inner join keeps only episode_ids found in both frames.
trajectory_df = pd.merge(
    original_df.loc[
        :,
        [
            "pasient",
            "episode_id",
            "episode_start_date",
            "episode_end_date",
            "gender",
            "age_group",
            "diagnosis",
            "actual_med_Full_ATC",
            "Length_of_Episode",
            "Count_visit",
            "Therapy_ratio",
            "tillnextepisode",
        ],
    ],
    cluster_df.loc[:, ["episode_id", "cluster", "cluster_distances"]],
    how="inner",
    on="episode_id",
)

# Load the dataset used to derive the actual age of the patient
age_dataset = pd.read_csv(
    "/mnt/work/workbench/dipendrp/new-data/Patient_episode_fdt.csv"
)

# Attach patient_age and fdt to the trajectory data (inner join on episode_id)
trajectory_df = pd.merge(
    trajectory_df,
    age_dataset.loc[:, ["episode_id", "patient_age", "fdt"]],
    how="inner",
    on="episode_id",
)


# Parse both date columns, then compute the age in years at episode start as
# (episode_start_date - fdt) in whole days divided by 365.2425.
# The value is NOT rounded. The helper column fdt is dropped afterwards.
# NOTE(review): fdt is presumably the patient's date of birth - confirm.
trajectory_df = (
    trajectory_df.assign(
        episode_start_date=lambda frame: pd.to_datetime(frame["episode_start_date"]),
        fdt=lambda frame: pd.to_datetime(frame["fdt"]),
    )
    .assign(
        age=lambda frame: (frame["episode_start_date"] - frame["fdt"]).dt.days
        / 365.2425
    )
    .drop(columns="fdt")
)


display(trajectory_df.head(20))
print(trajectory_df.shape)
print(trajectory_df.columns)
# Save the dataset
trajectory_df.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/Trajectory_Full_ICD10_ATC_Cluster3Label.csv",
    index=False,
)

Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age
0,2,3836.0,2013-07-02,2015-07-03,F,Teenager,"['F930', 'F431']",,732,49,0.5,,1,3.075655,12,12.238444
1,4,25641.0,2015-09-28,2017-10-04,M,Teenager,"['F429', 'F422']",,738,160,0.612,,1,3.245776,13,13.738817
2,6,24660.0,2004-10-01,2007-07-06,M,MiddleChildhood,,,1009,98,0.5,,1,3.218743,6,6.00423
3,7,22380.0,2015-09-28,2016-10-06,M,MiddleChildhood,['H932'],,375,80,0.293,,1,2.978143,8,8.739399
4,8,28188.0,2002-12-30,2005-01-09,F,Teenager,['F321'],,742,52,0.553,,1,3.281965,16,16.983237
5,9,17749.0,2016-07-05,2018-01-04,F,Teenager,"['G430', 'F4322', 'G431']",,549,97,0.556,,1,3.72014,15,15.745703
6,10,7441.0,2008-12-29,2009-03-30,F,Teenager,,,92,41,0.357,,0,7.384782,17,17.492488
7,11,29685.0,2009-03-30,2012-07-06,M,MiddleChildhood,['F845'],,1195,72,0.306,,1,3.471296,6,6.494315
8,12,195.0,2013-07-02,2014-12-30,F,Preschooler,,,547,45,0.533,,1,3.088366,3,3.482618
9,13,23747.0,2003-09-30,2003-12-30,M,MiddleChildhood,['C910'],,92,3,0.667,,1,4.178732,6,6.236952


(22676, 16)
Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age'],
      dtype='object')


#### Create the main dataset with a column ADHD_NoADHD (one value per patient): encode it as 2 if the diagnosis is NaN in all episodes of the patient; as 1 if the diagnosis of any of the patient's episodes contains F90; and as 0 otherwise (at least one recorded diagnosis, but none containing F90)

In [4]:
import pandas as pd
import ast

# Load the trajectory dataset (original columns + cluster labels + age)
trajectory_df = pd.read_csv(
    filepath_or_buffer="/mnt/work/workbench/dipendrp/Paper3/Data/Trajectory_Full_ICD10_ATC_Cluster3Label.csv"
)


# Function to determine the ADHD_NoADHD code of one patient
def encode_adhd(diagnoses):
    """Return the ADHD_NoADHD code for a collection of diagnosis values.

    Parameters
    ----------
    diagnoses : iterable
        Diagnosis values of one group (here: all episodes of one patient).
        Each value is either missing (NaN) or something whose string form
        is searched for the substring 'F90'.

    Returns
    -------
    int
        2 if every value is missing (also for an empty collection),
        1 if the string form of any value contains 'F90',
        0 otherwise (some diagnosis recorded, none containing 'F90').
    """
    # Missing values can never match 'F90', so only recorded values matter
    recorded = [entry for entry in diagnoses if not pd.isna(entry)]
    if not recorded:
        return 2
    for entry in recorded:
        if "F90" in str(entry):
            return 1
    return 0


# Give every row the code of its patient: the 'diagnosis' values are grouped
# by 'pasient' and the result of encode_adhd is broadcast to the group's rows
trajectory_df = trajectory_df.assign(
    ADHD_NoADHD=trajectory_df.groupby("pasient")["diagnosis"].transform(encode_adhd)
)

# Display the resulting dataframe
display(trajectory_df)

# Number of unique patients in total and per ADHD_NoADHD code (1, 0 and 2)
print(f"Total Patients: {trajectory_df['pasient'].nunique()}")
print(
    f"Patients with ADHD: {trajectory_df.loc[trajectory_df['ADHD_NoADHD'] == 1, 'pasient'].nunique()}"
)
print(
    f"Patients without ADHD: {trajectory_df.loc[trajectory_df['ADHD_NoADHD'] == 0, 'pasient'].nunique()}"
)
print(
    f"Patients with NaN: {trajectory_df.loc[trajectory_df['ADHD_NoADHD'] == 2, 'pasient'].nunique()}"
)

# Save the main dataset
trajectory_df.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv",
    index=False,
)

# Verify the dataset by reading it back from disk
print("\n")
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
print(f"On new Main Dataset, Toal Episodes: {trajectory_df['episode_id'].nunique()}")
# Keep only the rows with 0 or 1 in the ADHD_NoADHD column.
# Note: this rebinds trajectory_df to the filtered rows.
trajectory_df = trajectory_df.loc[trajectory_df["ADHD_NoADHD"].isin([0, 1])]
print(
    f"On new Main Dataset Total Episodes with ADHD and Other diagnosis only: {trajectory_df['episode_id'].nunique()}"
)
print(
    f"On new Main Dataset Total Pasient with ADHD and Other diagnosis only: {trajectory_df['pasient'].nunique()}"
)

# Rows per gender value among the remaining rows
print(
    f"Gender Distribution (As gender 0 is excluded in the study):{trajectory_df['gender'].value_counts()}"
)

Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
0,2,3836.0,2013-07-02,2015-07-03,F,Teenager,"['F930', 'F431']",,732,49,0.500,,1,3.075655,12,12.238444,0
1,4,25641.0,2015-09-28,2017-10-04,M,Teenager,"['F429', 'F422']",,738,160,0.612,,1,3.245776,13,13.738817,0
2,6,24660.0,2004-10-01,2007-07-06,M,MiddleChildhood,,,1009,98,0.500,,1,3.218743,6,6.004230,2
3,7,22380.0,2015-09-28,2016-10-06,M,MiddleChildhood,['H932'],,375,80,0.293,,1,2.978143,8,8.739399,0
4,8,28188.0,2002-12-30,2005-01-09,F,Teenager,['F321'],,742,52,0.553,,1,3.281965,16,16.983237,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22671,22553,7950.0,2007-01-01,2010-07-06,F,Teenager,"['E66', 'E669', 'F331', 'F432', 'F332', 'F6030...",,1283,390,0.523,,2,7.030308,15,15.250142,0
22672,22595,16793.0,2010-10-05,2011-10-05,F,Teenager,,,366,18,0.556,,1,3.524008,15,15.742965,2
22673,22603,18636.0,2005-10-04,2008-10-01,F,Teenager,,,1094,522,0.428,,0,7.937477,15,15.740227,2
22674,22604,7521.0,2002-12-30,2003-04-06,M,Teenager,,,98,31,0.414,,0,7.191046,12,12.736743,2


Total Patients: 19248
Patients with ADHD: 4411
Patients without ADHD: 8731
Patients with NaN: 6106


On new Main Dataset, Toal Episodes: 22676
On new Main Dataset Total Episodes with ADHD and Other diagnosis only: 16181
On new Main Dataset Total Pasient with ADHD and Other diagnosis only: 13142
Gender Distribution (As gender 0 is excluded in the study):gender
M    8524
F    7628
0      29
Name: count, dtype: int64


### The 12 possible combinations of AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"] || Gender = ["M", "F"] || ADHD_NoADHD = [0, 1] are:

1. Preschooler, Male, ADHD: (Preschooler, M, 1)

2. Preschooler, Male, No ADHD: (Preschooler, M, 0)

3. Preschooler, Female, ADHD: (Preschooler, F, 1)

4. Preschooler, Female, No ADHD: (Preschooler, F, 0)

5. MiddleChildhood, Male, ADHD: (MiddleChildhood, M, 1)

6. MiddleChildhood, Male, No ADHD: (MiddleChildhood, M, 0)

7. MiddleChildhood, Female, ADHD: (MiddleChildhood, F, 1)

8. MiddleChildhood, Female, No ADHD: (MiddleChildhood, F, 0)

9. Teenager, Male, ADHD: (Teenager, M, 1)

10. Teenager, Male, No ADHD: (Teenager, M, 0)

11. Teenager, Female, ADHD: (Teenager, F, 1)

12. Teenager, Female, No ADHD: (Teenager, F, 0)

#### 1. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = Preschooler and ADHD_NoADHD = 1

In [5]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)
# Number of rows with a missing diagnosis
print(trajectory_df.loc[:, ["diagnosis"]].isna().sum())
display(trajectory_df.head(15))
# Number of rows (not unique patients) with ADHD_NoADHD equal to 1
count_df = trajectory_df.loc[trajectory_df["ADHD_NoADHD"].eq(1)]
print(count_df.shape[0])

print(trajectory_df.columns)
# Subset: gender "M", age_group "Preschooler", ADHD_NoADHD 1
preSchooler_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(preSchooler_M_ADHD.shape)
display(preSchooler_M_ADHD.head(5))
# Write the subset to its own CSV file
preSchooler_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_M_ADHD.csv",
    index=False,
)

diagnosis    7833
dtype: int64


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
0,2,3836.0,2013-07-02,2015-07-03,F,Teenager,"['F930', 'F431']",,732,49,0.5,,1,3.075655,12,12.238444,0
1,4,25641.0,2015-09-28,2017-10-04,M,Teenager,"['F429', 'F422']",,738,160,0.612,,1,3.245776,13,13.738817,0
2,6,24660.0,2004-10-01,2007-07-06,M,MiddleChildhood,,,1009,98,0.5,,1,3.218743,6,6.00423,2
3,7,22380.0,2015-09-28,2016-10-06,M,MiddleChildhood,['H932'],,375,80,0.293,,1,2.978143,8,8.739399,0
4,8,28188.0,2002-12-30,2005-01-09,F,Teenager,['F321'],,742,52,0.553,,1,3.281965,16,16.983237,0
5,9,17749.0,2016-07-05,2018-01-04,F,Teenager,"['G430', 'F4322', 'G431']",,549,97,0.556,,1,3.72014,15,15.745703,0
6,10,7441.0,2008-12-29,2009-03-30,F,Teenager,,,92,41,0.357,,0,7.384782,17,17.492488,2
7,11,29685.0,2009-03-30,2012-07-06,M,MiddleChildhood,['F845'],,1195,72,0.306,,1,3.471296,6,6.494315,0
8,12,195.0,2013-07-02,2014-12-30,F,Preschooler,,,547,45,0.533,,1,3.088366,3,3.482618,2
9,13,23747.0,2003-09-30,2003-12-30,M,MiddleChildhood,['C910'],,92,3,0.667,,1,4.178732,6,6.236952,0


5745
Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(386, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
34,45,10696.0,2006-10-04,2010-10-07,M,Preschooler,['F900'],"['N06BA04', 'N06BA09']",1465,168,0.442,,2,3.512981,5,5.243092,1
118,138,2426.0,2008-12-30,2016-07-06,M,Preschooler,['F901'],"[nan, 'N06B A12', 'N06BA04', 'N05AX08', 'N06BA...",2746,529,0.594,,2,6.129504,5,5.741391,1
226,256,2282.0,2007-10-02,2010-01-06,M,Preschooler,,,828,58,0.607,2464.0,1,3.297322,0,0.002738,1
327,359,22281.0,2011-01-03,2013-01-02,M,Preschooler,['F900'],"['N06BA04', nan, 'A06B A04', 'N06B A04']",731,98,0.343,181.0,2,4.038424,4,4.988467,1
359,394,7144.0,2008-03-31,2009-06-29,M,Preschooler,['F900'],,456,111,0.555,462.0,1,3.509009,5,5.492241,1


#### 2. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = Preschooler and ADHD_NoADHD = 0

In [6]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "M", age_group "Preschooler", ADHD_NoADHD 0
preSchooler_M_NoADHD = trajectory_df[
    (trajectory_df["gender"] == "M")
    & (trajectory_df["age_group"] == "Preschooler")
    & (trajectory_df["ADHD_NoADHD"] == 0)
]
print(preSchooler_M_NoADHD.shape)
display(preSchooler_M_NoADHD.head(5))
# Write the subset to its own CSV file
preSchooler_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_M_NoADHD.csv",
    index=False,
)

# Reference lists of the category values used for the sub-datasets.
# The age_group label is "MiddleChildhood" (the previous "MiddleSchool" does
# not occur in the age_group filters or outputs of this notebook and would
# match no rows).
AgeGroup = ["Preschooler", "MiddleChildhood", "Teenager"]
# Gender "0" is listed here although no sub-dataset is created for it
Gender = ["M", "F", "0"]
ADHD_NoADHD = [0, 1]

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(648, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
60,75,24227.0,2008-03-31,2013-04-02,M,Preschooler,['F802'],,1829,102,0.321,,1,3.185217,4,4.249232,0
77,92,23785.0,2003-10-01,2004-10-01,M,Preschooler,"['F800', 'F802', 'F801']",,367,28,0.179,,1,3.221068,5,5.492241,0
95,113,5954.0,2015-06-29,2016-01-07,M,Preschooler,"['T751', 'I639']",,193,19,0.85,,1,4.297443,3,3.734505,0
110,130,5173.0,2009-06-29,2009-07-01,M,Preschooler,['P073'],,3,8,0.714,,1,4.599696,0,0.246412,0
195,218,1877.0,2001-07-03,2003-01-03,M,Preschooler,,,550,70,0.6,3104.0,1,3.838374,4,4.241018,0


#### 3. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = Preschooler and ADHD_NoADHD = 1

In [7]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "Preschooler", ADHD_NoADHD 1
preSchooler_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(preSchooler_F_ADHD.shape)
display(preSchooler_F_ADHD.head(5))
# Write the subset to its own CSV file
preSchooler_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_F_ADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(120, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
19,23,25034.0,2005-04-04,2017-04-07,F,Preschooler,"['E669', 'F900']","['N06BA04', 'C02A C02', 'N06B A12']",4387,195,0.462,,2,5.193396,4,4.241018,1
72,89,19066.0,2010-10-04,2015-04-03,F,Preschooler,['F900'],,1643,165,0.412,459.0,1,3.603729,3,3.994606,1
168,190,3979.0,2012-01-02,2013-07-02,F,Preschooler,,[nan],548,54,0.404,917.0,1,3.151793,5,5.002156,1
296,325,2786.0,2013-12-30,2018-01-01,F,Preschooler,"['F809', 'P073', 'F900', 'F431']",,1464,256,0.402,,2,4.472112,5,4.999418,1
533,585,20579.0,2003-12-29,2015-01-02,F,Preschooler,"['F320', 'F900', 'S626', 'F3200']","['N06BA04', 'N06B A04']",4023,512,0.498,,2,6.297763,4,4.974777,1


#### 4. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = Preschooler and ADHD_NoADHD = 0

In [8]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "Preschooler", ADHD_NoADHD 0
preSchooler_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Preschooler")
    & trajectory_df["ADHD_NoADHD"].eq(0)
]
print(preSchooler_F_NoADHD.shape)
display(preSchooler_F_NoADHD.head(5))
# Write the subset to its own CSV file
preSchooler_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_PreSchooler_F_NoADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(493, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
31,38,14508.0,2009-09-28,2018-01-05,F,Preschooler,['F438'],,3022,4,0.5,,1,5.101219,4,4.99668,0
119,141,20055.0,2008-06-30,2009-03-30,F,Preschooler,['T740'],,274,5,0.6,1008.0,1,3.991845,3,3.994606,0
166,188,2385.0,2014-12-30,2015-01-02,F,Preschooler,['J219'],,4,7,1.0,,1,5.360441,2,2.245084,0
177,198,18813.0,2006-10-02,2009-04-02,F,Preschooler,['F940'],,914,43,0.476,1096.0,1,2.999606,4,4.495643,0
308,339,13564.0,2011-07-04,2011-10-03,F,Preschooler,"['N394', 'K590']",,92,11,0.273,1831.0,1,3.997127,4,4.747531,0


#### 5. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = MiddleChildhood and ADHD_NoADHD = 1

In [9]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "M", age_group "MiddleChildhood", ADHD_NoADHD 1
MiddleChildhood_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(MiddleChildhood_M_ADHD.shape)
display(MiddleChildhood_M_ADHD.head(5))
# Write the subset to its own CSV file
MiddleChildhood_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_M_ADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(2158, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
40,56,4564.0,2002-01-02,2007-01-03,M,MiddleChildhood,['F900'],,1828,69,0.156,1552.0,1,3.801527,7,7.745539,1
46,61,21502.0,2005-04-05,2008-07-03,M,MiddleChildhood,"['F901', 'F900', 'F402']",,1186,94,0.429,550.0,1,3.817561,7,7.263667,1
55,71,21212.0,2012-01-02,2016-04-05,M,MiddleChildhood,"['F901', 'F928', 'F431']",,1556,330,0.472,,2,4.788727,9,9.243174,1
57,73,10527.0,2017-10-02,2018-01-03,M,MiddleChildhood,['F900'],,94,30,0.533,,1,3.852103,10,10.004312,1
74,90,1880.0,2009-06-29,2016-04-07,M,MiddleChildhood,"['F941', 'F900', 'F952']","['N06BA04', nan, 'N05CH01']",2475,582,0.569,270.0,2,5.993091,7,7.23355,1


#### 6. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = MiddleChildhood and ADHD_NoADHD = 0

In [10]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "M", age_group "MiddleChildhood", ADHD_NoADHD 0
MiddleChildhood_M_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(0)
]
print(MiddleChildhood_M_NoADHD.shape)
display(MiddleChildhood_M_NoADHD.head(5))
# Write the subset to its own CSV file
MiddleChildhood_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_M_NoADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(1917, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
3,7,22380.0,2015-09-28,2016-10-06,M,MiddleChildhood,['H932'],,375,80,0.293,,1,2.978143,8,8.739399,0
7,11,29685.0,2009-03-30,2012-07-06,M,MiddleChildhood,['F845'],,1195,72,0.306,,1,3.471296,6,6.494315,0
9,13,23747.0,2003-09-30,2003-12-30,M,MiddleChildhood,['C910'],,92,3,0.667,,1,4.178732,6,6.236952,0
18,21,30919.0,2014-07-01,2015-06-30,M,MiddleChildhood,"['S064', 'S068', 'F430']",,365,6,0.857,,1,3.909657,7,7.244502,0
48,62,27370.0,2010-04-05,2011-07-07,M,MiddleChildhood,['F930'],,459,68,0.382,,1,3.09418,7,7.994688,0


#### 7. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = MiddleChildhood and ADHD_NoADHD = 1

In [11]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "MiddleChildhood", ADHD_NoADHD 1
MiddleChildhood_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(MiddleChildhood_F_ADHD.shape)
display(MiddleChildhood_F_ADHD.head(5))
# Write the subset to its own CSV file
MiddleChildhood_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_F_ADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(745, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
20,24,29718.0,2016-07-04,2017-10-06,F,MiddleChildhood,['F900'],"['A06B A04', 'N06B A04']",460,170,0.257,,2,3.734809,9,9.243174,1
73,89,29303.0,2016-07-05,2017-04-05,F,MiddleChildhood,['F900'],['N06BA04'],275,19,0.35,,1,3.56154,9,9.746949,1
104,122,9201.0,2011-01-04,2013-12-30,F,MiddleChildhood,"['F810', 'F900', 'F811']","['N06BA04', 'A06B A04', 'N06B A04']",1092,83,0.212,,2,3.67473,11,11.989295,1
169,190,11694.0,2016-01-05,2018-01-05,F,MiddleChildhood,"['F900', 'F83']",,732,146,0.361,,1,3.443416,9,9.010452,1
283,311,29116.0,2007-10-01,2018-01-05,F,MiddleChildhood,"['F938', 'F900', 'F813', 'F444', 'F942']",['N06BA04'],3750,938,0.55,,2,8.272947,7,7.986475,1


#### 8. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = MiddleChildhood and ADHD_NoADHD = 0

In [12]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "MiddleChildhood", ADHD_NoADHD 0
MiddleChildhood_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("MiddleChildhood")
    & trajectory_df["ADHD_NoADHD"].eq(0)
]
print(MiddleChildhood_F_NoADHD.shape)
display(MiddleChildhood_F_NoADHD.head(5))
# Write the subset to its own CSV file
MiddleChildhood_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_MiddleChildhood_F_NoADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(1287, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
44,59,26617.0,2016-07-07,2017-01-06,F,MiddleChildhood,['Q763'],,184,60,0.414,,1,2.996707,9,9.506013,0
54,70,18925.0,2017-04-03,2018-01-03,F,MiddleChildhood,['F4323'],,276,128,0.5,,1,3.274589,11,11.233632,0
81,96,21172.0,2012-12-31,2013-01-04,F,MiddleChildhood,['M911'],,5,3,1.0,,1,5.282097,9,9.495062,0
117,137,18698.0,2007-12-31,2008-01-02,F,MiddleChildhood,"['C710', 'H540']",,3,4,0.25,,1,5.468341,10,10.237034,0
120,141,16545.0,2012-01-02,2016-10-06,F,MiddleChildhood,['F431'],,1740,138,0.765,,1,3.579674,7,7.501865,0


#### 9. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = Teenager and ADHD_NoADHD = 1

In [13]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "M", age_group "Teenager", ADHD_NoADHD 1
Teenager_M_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(Teenager_M_ADHD.shape)
display(Teenager_M_ADHD.head(5))
# Write the subset to its own CSV file
Teenager_M_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_M_ADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(1409, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
16,19,9632.0,2010-01-06,2014-04-04,M,Teenager,"['J459', 'F900']","['N06BA04', 'A06B A04', 'N06B A04']",1550,91,0.345,,2,3.57501,12,12.512235,1
23,28,8994.0,2009-03-30,2012-10-03,M,Teenager,['F900'],['N06BA04'],1284,61,0.226,,1,4.209794,12,12.495808,1
41,56,17034.0,2011-04-04,2015-04-02,M,Teenager,['F900'],,1460,36,0.314,,1,3.139767,16,16.996927,1
47,61,17553.0,2010-01-04,2011-10-07,M,Teenager,['F900'],,642,42,0.349,,1,3.160132,12,12.013936,1
52,66,20326.0,2011-01-03,2015-09-28,M,Teenager,"['F845', 'F951', 'F900']",['N05AX08'],1730,227,0.436,,2,3.933213,13,13.259684,1


#### 10. Create a dataset containing only those rows which satisfy the condition Gender = M, AgeGroup = Teenager and ADHD_NoADHD = 0

In [14]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "M", age_group "Teenager", ADHD_NoADHD 0
Teenager_M_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("M")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(0)
]
print(Teenager_M_NoADHD.shape)
display(Teenager_M_NoADHD.head(5))
# Write the subset to its own CSV file
Teenager_M_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_M_NoADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(2006, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
1,4,25641.0,2015-09-28,2017-10-04,M,Teenager,"['F429', 'F422']",,738,160,0.612,,1,3.245776,13,13.738817,0
25,30,28483.0,2014-07-01,2014-12-30,M,Teenager,['F845'],,183,26,0.259,,1,3.272122,16,16.739563,0
45,60,6519.0,2017-04-03,2017-04-07,M,Teenager,['M321'],,5,10,0.583,,1,4.249278,16,16.249478,0
63,78,30371.0,2007-12-31,2011-10-06,M,Teenager,['F952'],['N05AX08'],1376,237,0.606,,2,3.66436,17,16.999665,0
64,80,7444.0,1999-07-06,2000-10-06,M,Teenager,['F952'],,459,11,0.6,,1,3.337967,12,12.509497,0


#### 11. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = Teenager and ADHD_NoADHD = 1

In [15]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "Teenager", ADHD_NoADHD 1
Teenager_F_ADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(1)
]
print(Teenager_F_ADHD.shape)
display(Teenager_F_ADHD.head(5))
# Write the subset to its own CSV file
Teenager_F_ADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_F_ADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(927, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
13,16,16500.0,2014-12-30,2017-01-06,F,Teenager,['F900'],"['N06BA04', 'N06B A04']",739,80,0.525,,1,3.430326,15,15.233715,1
38,51,21292.0,2010-01-04,2014-01-01,F,Teenager,['F900'],"[nan, 'N06BA04', 'N06BA09']",1459,378,0.511,,2,4.088345,14,14.743629,1
42,57,15134.0,2013-01-01,2013-10-04,F,Teenager,['F900'],,277,56,0.268,,1,3.345563,17,17.48975,1
66,82,21098.0,2014-07-03,2015-10-02,F,Teenager,['F900'],"['A06B A04', 'N06B A12', 'N06B A04']",457,124,0.597,,2,3.772939,13,13.733342,1
138,159,28648.0,2016-04-04,2017-04-05,F,Teenager,"['F900', 'X6n0']",,367,188,0.489,,1,3.947959,14,14.754581,1


#### 12. Create a dataset containing only those rows which satisfy the condition Gender = F, AgeGroup = Teenager and ADHD_NoADHD = 0

In [16]:
import pandas as pd

# Load the main dataset (includes the ADHD_NoADHD column)
trajectory_df = pd.read_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/MainDataset_Patient_Trajectory_ICD10_3_Cluster.csv"
)

print(trajectory_df.columns)
# Subset: gender "F", age_group "Teenager", ADHD_NoADHD 0
Teenager_F_NoADHD = trajectory_df.loc[
    trajectory_df["gender"].eq("F")
    & trajectory_df["age_group"].eq("Teenager")
    & trajectory_df["ADHD_NoADHD"].eq(0)
]
print(Teenager_F_NoADHD.shape)
display(Teenager_F_NoADHD.head(5))
# Write the subset to its own CSV file
Teenager_F_NoADHD.to_csv(
    "/mnt/work/workbench/dipendrp/Paper3/Data/SubDataset_Teenager_F_NoADHD.csv",
    index=False,
)

Index(['pasient', 'episode_id', 'episode_start_date', 'episode_end_date',
       'gender', 'age_group', 'diagnosis', 'actual_med_Full_ATC',
       'Length_of_Episode', 'Count_visit', 'Therapy_ratio', 'tillnextepisode',
       'cluster', 'cluster_distances', 'patient_age', 'age', 'ADHD_NoADHD'],
      dtype='object')
(4056, 17)


Unnamed: 0,pasient,episode_id,episode_start_date,episode_end_date,gender,age_group,diagnosis,actual_med_Full_ATC,Length_of_Episode,Count_visit,Therapy_ratio,tillnextepisode,cluster,cluster_distances,patient_age,age,ADHD_NoADHD
0,2,3836.0,2013-07-02,2015-07-03,F,Teenager,"['F930', 'F431']",,732,49,0.5,,1,3.075655,12,12.238444,0
4,8,28188.0,2002-12-30,2005-01-09,F,Teenager,['F321'],,742,52,0.553,,1,3.281965,16,16.983237,0
5,9,17749.0,2016-07-05,2018-01-04,F,Teenager,"['G430', 'F4322', 'G431']",,549,97,0.556,,1,3.72014,15,15.745703,0
11,15,7847.0,2009-06-30,2011-07-08,F,Teenager,['F500'],,739,329,0.56,361.0,1,4.291713,13,13.987967,0
12,15,26114.0,2012-07-03,2013-04-01,F,Teenager,,,273,31,0.484,,1,3.467001,16,16.996927,0



***Total sum = 386 + 648 + 120 + 493 + 2158 + 1917 + 745 + 1287 + 1409 + 2006 + 927 + 4056 = 16152 (Gender 0, which has 29 rows, is excluded; 16152 + 29 = 16181, the number of episodes with ADHD_NoADHD in [0, 1])***
# Row counts of the 12 saved sub-datasets, in the order they were created
subset_row_counts = [386, 648, 120, 493, 2158, 1917, 745, 1287, 1409, 2006, 927, 4056]
# Add the 29 rows with gender "0", for which no sub-dataset is created.
# The result is stored under its own name instead of `sum`, so the built-in
# sum() is not shadowed for the rest of the kernel session.
total_rows = sum(subset_row_counts) + 29
# Expected to equal the 16181 episodes with ADHD_NoADHD in [0, 1]
total_rows