In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import csv,os

import warnings
# warnings.filterwarnings('ignore')

In [None]:
# Columns the rest of the notebook works with; the input file is expected to
# already use exactly these names.
needed_columns = [
    'patient_ID',
    'drug_date',
    'drug',
    'days_of_supply',
    'class',
    'drug_start_date',
    'drug_end_date',
]

# Read the raw file, then keep an independent copy restricted to the needed
# columns so later edits never touch `data`.
data = pd.read_csv('CancerData0.csv')
cancer_data = data.loc[:, needed_columns].copy()

# Show the dtype pandas inferred for each selected column.
print(cancer_data.dtypes)

patient_ID         object
drug_date          object
drug               object
days_of_supply     object
class              object
drug_start_date    object
drug_end_date      object
dtype: object


In [None]:
# Display the selected columns for a visual check of the loaded values.
cancer_data

Unnamed: 0,patient_ID,drug_date,drug,days_of_supply,class,drug_start_date,drug_end_date
0,PATID_001,5-Dec-13,PACLITAXEL,.,CHEMO,12/5/13,12/5/13
1,PATID_001,5-Dec-13,CARBOPLATIN,.,CHEMO,12/5/13,12/5/13
2,PATID_001,31-Dec-13,CARBOPLATIN,.,CHEMO,12/31/13,12/31/13
3,PATID_001,31-Dec-13,PACLITAXEL,.,CHEMO,12/31/13,12/31/13
4,PATID_001,30-Mar-14,CARBOPLATIN,.,CHEMO,3/30/14,3/30/14
5,PATID_001,10-Jan-14,LETROZOLE,30,HORMO,1/10/14,1/10/14
6,PATID_001,30-Mar-14,PACLITAXEL,.,CHEMO,3/30/14,3/30/14
7,PATID_001,25-Oct-14,PACLITAXEL,.,CHEMO,10/25/14,10/25/14
8,PATID_001,25-Oct-14,CARBOPLATIN,.,CHEMO,10/25/14,10/25/14
9,PATID_001,30-Jan-15,CARBOPLATIN,.,CHEMO,1/30/15,1/30/15


In [None]:
def fill_missing_drug_dates(frame):
  """Fill start/end dates for rows where BOTH dates are missing.

  For such a row the start date becomes the prescribed date (``drug_date``)
  and the end date becomes ``drug_date + days_of_supply`` days. With a supply
  of 0 days (drugs recorded without a supply, e.g. intravenous ones) the end
  date therefore equals the prescribed date.

  Rows that already carry a start or an end date are left untouched; the
  rule is applied row by row, so one incomplete row can never overwrite the
  recorded dates of another row.

  Parameters
  ----------
  frame : pandas.DataFrame
      Needs datetime columns ``drug_date``, ``drug_start_date``,
      ``drug_end_date`` and a numeric ``days_of_supply`` column.

  Returns
  -------
  pandas.DataFrame
      A modified copy; ``frame`` itself is not changed.
  """
  filled = frame.copy()
  missing_both = filled['drug_start_date'].isna() & filled['drug_end_date'].isna()
  supply = pd.to_timedelta(filled['days_of_supply'], unit = 'D')

  # Series.mask replaces values only where the condition is True.
  filled['drug_start_date'] = filled['drug_start_date'].mask(missing_both, filled['drug_date'])
  filled['drug_end_date'] = filled['drug_end_date'].mask(missing_both, filled['drug_date'] + supply)
  return filled


def align_end_dates_with_supply(frame):
  """Force ``drug_end_date`` to be ``drug_start_date + days_of_supply``.

  The end date is rewritten on every row whose whole-day difference between
  end and start differs from ``days_of_supply``. A missing difference (either
  date is NaT) counts as a mismatch, so a missing end date is derived from
  the start date, and a missing start date yields a missing end date.

  Parameters
  ----------
  frame : pandas.DataFrame
      Needs datetime columns ``drug_start_date``, ``drug_end_date`` and a
      numeric ``days_of_supply`` column. Any index is accepted.

  Returns
  -------
  pandas.DataFrame
      A modified copy; ``frame`` itself is not changed.
  """
  aligned = frame.copy()
  supply = pd.to_timedelta(aligned['days_of_supply'], unit = 'D')
  covered_days = (aligned['drug_end_date'] - aligned['drug_start_date']).dt.days

  # NaN != x evaluates to True, so rows with a missing date are included.
  mismatch = covered_days.ne(aligned['days_of_supply'])
  aligned['drug_end_date'] = aligned['drug_end_date'].mask(mismatch, aligned['drug_start_date'] + supply)
  return aligned


try:

  # Parse the prescribed date; the raw strings look like '5-Dec-13' (day-month abbreviation-2 digit year).
  cancer_data['drug_date'] = pd.to_datetime(cancer_data['drug_date'], format = '%d-%b-%y', dayfirst = True)

  # A '.' in days_of_supply marks a missing supply: store it as 0 and make the column integer.
  cancer_data.loc[cancer_data['days_of_supply'] == '.', 'days_of_supply'] = '0'
  cancer_data['days_of_supply'] = cancer_data['days_of_supply'].astype(int)

  print(f'\n------------------------------\ncancer_data after replacing . in days of supply and converting drug_date to type 19NOV2015 is\n ')
  print(f'{cancer_data}\n\n\n')

  # Parse start/end dates (raw strings look like '12/5/13'); values that do not
  # match the format become NaT and are handled by the fill step below.
  date_cols = ['drug_start_date', 'drug_end_date']
  for date_col in date_cols:
    print(f'i = {date_col}\n')
    cancer_data[date_col] = pd.to_datetime(cancer_data[date_col], format = '%m/%d/%y', errors = 'coerce')

  print(f'\n------------------------------\ncancer_data after Converting date strings to datetime date is\n ')
  print(f'{cancer_data}\n\n\n')

  # Rows that only have a prescribed date get start = prescribed date and
  # end = prescribed date + days of supply (same day when the supply is 0).
  cancer_data = fill_missing_drug_dates(cancer_data)

  print('\n*******************************')
  print('\ndrug_start_date and drug_end_date after converting to null rows to dates with reference to prescribed date are\n')
  print(cancer_data[['drug_start_date', 'drug_end_date']])

  # Wherever (end - start) in days is not the days of supply, the end date is
  # recomputed as start date + days of supply.
  cancer_data = align_end_dates_with_supply(cancer_data)

  print('\ndrug_start_date and drug_end_date after checking if the diff of drug end date and drug start dates is not equal to days of supply are\n')
  print(cancer_data[['drug_start_date', 'drug_end_date']])


except Exception as e:
  # Best-effort cell: report the problem instead of stopping the notebook.
  print('\n*******************************************************************************')
  print(f'exceptin is\n{e}')


------------------------------
cancer_data after replacing . in days of supply and converting drug_date to type 19NOV2015 is
 
   patient_ID  drug_date         drug  days_of_supply  class drug_start_date  \
0   PATID_001 2013-12-05   PACLITAXEL               0  CHEMO         12/5/13   
1   PATID_001 2013-12-05  CARBOPLATIN               0  CHEMO         12/5/13   
2   PATID_001 2013-12-31  CARBOPLATIN               0  CHEMO        12/31/13   
3   PATID_001 2013-12-31   PACLITAXEL               0  CHEMO        12/31/13   
4   PATID_001 2014-03-30  CARBOPLATIN               0  CHEMO         3/30/14   
5   PATID_001 2014-01-10    LETROZOLE              30  HORMO         1/10/14   
6   PATID_001 2014-03-30   PACLITAXEL               0  CHEMO         3/30/14   
7   PATID_001 2014-10-25   PACLITAXEL               0  CHEMO        10/25/14   
8   PATID_001 2014-10-25  CARBOPLATIN               0  CHEMO        10/25/14   
9   PATID_001 2015-01-30  CARBOPLATIN               0  CHEMO         1/3

In [None]:
def assign_lines_of_therapy(cancer_data, gap_days=45, window_days=29):
    """Annotate every drug record with its line of therapy (LOT).

    Records are sorted by patient, start date and end date and then walked
    per patient:

    * the first record of a patient opens line 1;
    * a record whose start date lies more than ``gap_days`` days after the
      end date of the previous record opens a new line (``type1 = 1``);
    * otherwise, a record starting within ``window_days`` days of the line
      start whose drug has not yet been seen in that line is marked as an
      added drug (``type2 = 1``);
    * ``flag`` and ``type`` are 1 when the record belongs to a line after
      the first one or when ``type2`` is 1;
    * ``cumulative_drug_names`` lists, sorted and joined with ' + ', the
      drugs of the current line that started within the window so far.

    NOTE(review): the gap is measured against the end date of the
    immediately preceding record only, not the latest end date seen in the
    line; confirm this is the intended rule.

    Parameters
    ----------
    cancer_data : pandas.DataFrame
        Needs ``patient_ID``, ``drug``, ``drug_start_date`` and
        ``drug_end_date`` (datetime) columns. Other columns are carried over.
    gap_days : int, default 45
        A gap strictly larger than this starts a new line.
    window_days : int, default 29
        Size, in days from the line start, of the regimen-defining window.

    Returns
    -------
    pandas.DataFrame
        One row per input record (rows with a missing ``patient_ID`` are
        skipped by the grouping) with the added columns ``lot_num``,
        ``type1``, ``type2``, ``gap``, ``flag``, ``type`` and
        ``cumulative_drug_names``. An empty input gives an empty frame that
        still has these columns.
    """
    added_columns = ['lot_num', 'type1', 'type2', 'gap', 'flag', 'type', 'cumulative_drug_names']
    ordered = cancer_data.sort_values(by=['patient_ID', 'drug_start_date', 'drug_end_date'])

    records = []
    # sort=False keeps the patient order produced by the sort above.
    for _, patient_rows in ordered.groupby('patient_ID', sort=False):
        lot_num = 1
        lot_start_date = None
        lot_drugs = set()
        prev_end_date = None

        for position, row in enumerate(patient_rows.to_dict('records')):
            start_date = row['drug_start_date']

            if position == 0:
                # The first record opens line 1 and is never flagged.
                lot_start_date = start_date
                gap, type1, type2, flag = 0, 0, 0, 0
            else:
                gap = (start_date - prev_end_date).days
                if gap > gap_days:
                    # Treatment break: open a new line with an empty regimen.
                    lot_num += 1
                    lot_start_date = start_date
                    lot_drugs = set()
                    type1, type2 = 1, 0
                else:
                    type1 = 0
                    in_window = (start_date - lot_start_date).days <= window_days
                    # Compare the DRUG name: the regimen set holds drug names,
                    # so testing the class would always look like a new drug.
                    type2 = int(in_window and row['drug'] not in lot_drugs)
                flag = int(lot_num > 1 or type2 == 1)

            # Only drugs started inside the window extend the line's regimen.
            if (start_date - lot_start_date).days <= window_days:
                lot_drugs.add(row['drug'])

            row.update({
                'lot_num': lot_num,
                'type1': type1,
                'type2': type2,
                'gap': gap,
                'flag': flag,
                'type': flag,
                'cumulative_drug_names': ' + '.join(sorted(lot_drugs)),
            })
            records.append(row)
            prev_end_date = row['drug_end_date']

    if not records:
        return pd.DataFrame(columns=list(cancer_data.columns) + added_columns)
    return pd.DataFrame(records)


def summarize_lines_of_therapy(output_data):
    """Collapse the per-record LOT output to one row per patient and line.

    Parameters
    ----------
    output_data : pandas.DataFrame
        Needs the columns ``patient_ID``, ``lot_num``, ``drug``, ``class``
        and ``drug_start_date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``patient_ID``, ``line_start_date`` (earliest start date of
        the line), ``line_regimen`` and ``trgt_category_regimen`` (distinct
        drug / class names of the line, sorted so that the same combination
        always yields the same label, joined with ' + ') and ``line_num``.
    """
    summary_columns = ['patient_ID', 'line_start_date', 'line_regimen', 'trgt_category_regimen', 'line_num']
    rows = []
    for (patient_id, lot_num), group in output_data.groupby(by=['patient_ID', 'lot_num']):
        rows.append({
            'patient_ID': patient_id,
            'line_start_date': group['drug_start_date'].min(),
            'line_regimen': ' + '.join(sorted(group['drug'].unique())),
            'trgt_category_regimen': ' + '.join(sorted(group['class'].unique())),
            'line_num': lot_num,
        })
    return pd.DataFrame(rows, columns=summary_columns)


try:
  # Set the gap and window days
  gap_days = 45
  window_days = 29

  # Rebinding keeps `cancer_data` sorted for any cell that runs after this one.
  cancer_data = cancer_data.sort_values(by = ['patient_ID', 'drug_start_date', 'drug_end_date'])

  # Per-record line-of-therapy annotations, saved to CSV.
  output_data = assign_lines_of_therapy(cancer_data, gap_days = gap_days, window_days = window_days)
  output_data.to_csv('output_data.csv', index=False)

  # One row per patient and line, saved to CSV.
  summary_df = summarize_lines_of_therapy(output_data)
  summary_df.to_csv('summary_details.csv', index=False)

  # Print the summary_df to check the result
  print(summary_df)

except Exception as e:
  # Best-effort cell: report the problem instead of stopping the notebook.
  print('\n*******************************************************************************')
  print(f'exceptin is\n{e}')

   patient_ID line_start_date                          line_regimen  \
0   PATID_001      2013-12-05  PACLITAXEL + CARBOPLATIN + LETROZOLE   
1   PATID_001      2014-03-30              CARBOPLATIN + PACLITAXEL   
2   PATID_001      2014-10-25              PACLITAXEL + CARBOPLATIN   
3   PATID_001      2015-01-30              CARBOPLATIN + PACLITAXEL   
4   PATID_001      2015-03-19              PACLITAXEL + CARBOPLATIN   
5   PATID_001      2015-07-14              CARBOPLATIN + PACLITAXEL   
6   PATID_001      2015-09-08              CARBOPLATIN + PACLITAXEL   
7   PATID_001      2015-12-29              CARBOPLATIN + PACLITAXEL   
8   PATID_001      2016-06-14              CARBOPLATIN + PACLITAXEL   
9   PATID_001      2016-10-08              CARBOPLATIN + PACLITAXEL   
10  PATID_001      2016-12-29              CARBOPLATIN + PACLITAXEL   
11  PATID_001      2017-03-19              PACLITAXEL + CARBOPLATIN   
12  PATID_001      2017-05-19              PACLITAXEL + CARBOPLATIN   
13  PA

In [None]:
# Print the per-record output built in the previous cell: the input columns plus
# lot_num, type1, type2, gap, flag, type and cumulative_drug_names.
print(output_data)

   patient_ID  drug_date         drug  days_of_supply  class drug_start_date  \
0   PATID_001 2013-12-05   PACLITAXEL               0  CHEMO      2013-12-05   
1   PATID_001 2013-12-05  CARBOPLATIN               0  CHEMO      2013-12-05   
2   PATID_001 2013-12-31  CARBOPLATIN               0  CHEMO      2013-12-31   
3   PATID_001 2013-12-31   PACLITAXEL               0  CHEMO      2013-12-31   
4   PATID_001 2014-01-10    LETROZOLE              30  HORMO      2014-01-10   
5   PATID_001 2014-03-30  CARBOPLATIN               0  CHEMO      2014-03-30   
6   PATID_001 2014-03-30   PACLITAXEL               0  CHEMO      2014-03-30   
7   PATID_001 2014-10-25   PACLITAXEL               0  CHEMO      2014-10-25   
8   PATID_001 2014-10-25  CARBOPLATIN               0  CHEMO      2014-10-25   
9   PATID_001 2015-01-30  CARBOPLATIN               0  CHEMO      2015-01-30   
10  PATID_001 2015-01-30   PACLITAXEL               0  CHEMO      2015-01-30   
11  PATID_001 2015-03-19   PACLITAXEL   