In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
from datetime import datetime

# 1) Loading the dataset
Loading the dataset and removing columns that are not needed.

In [None]:
# Load the ED visits event log with all the features.
# No date parsing is requested here, so timestamp columns are not converted at load time.
df_event_log_visits = pd.read_csv('df_visits_dur_timeofday-randomorder.csv')

In [None]:
df_event_log_visits.info()

In [None]:
# 'Earliest_Time' was only added earlier for analytics and is not needed any more
df_event_log_visits.drop(columns=['Earliest_Time'], inplace=True)

In [None]:
df_event_log_visits.info()

In [None]:
# The dataset holds a single timestamp per activity, assumed to be the moment the
# activity finished, so 'Start_Time' is relabelled as 'Activity_End_Time'.
df_event_log_visits.rename({'Start_Time': 'Activity_End_Time'}, axis='columns', inplace=True)

# 'End_Time' is not used any further
df_event_log_visits.drop('End_Time', axis=1, inplace=True)


In [None]:

# Order the events chronologically inside each visit
df_event_log_visits.sort_values(by=['VISIT_ID', 'Activity_End_Time'], inplace=True)

# An activity is assumed to start when the previous activity (previous row) ended
previous_end = df_event_log_visits['Activity_End_Time'].shift(1)

# True only when the previous row belongs to the same visit, so an end time is never
# carried over from the last activity of one visit to the first activity of the next
same_visit = df_event_log_visits['VISIT_ID'].eq(df_event_log_visits['VISIT_ID'].shift(1))

# Keep the previous end time within a visit; everywhere else (first activity of a visit,
# or no previous end time) fall back to the activity's own end time, i.e. zero duration
df_event_log_visits['Start_Time'] = previous_end.where(same_visit).fillna(
    df_event_log_visits['Activity_End_Time']
)


In [None]:
# these are the remaining columns in the dataset as of now
df_event_log_visits.info()

In [None]:

# Parse the 'Duration' values as timedeltas
duration_td = pd.to_timedelta(df_event_log_visits['Duration'])

# Express the duration in hours under a new column name
df_event_log_visits['Duration_hours'] = duration_td.dt.total_seconds() / 3600

# The original 'Duration' column is superseded by 'Duration_hours'
df_event_log_visits = df_event_log_visits.drop(columns=['Duration'])


In [None]:

# Every column except the two that are moved to the end, in their current order
columns = [
    col for col in df_event_log_visits.columns
    if col not in ('Activity_End_Time', 'Duration_hours')
]

# Appending the two trailing columns
columns += ['Activity_End_Time', 'Duration_hours']

# Reordering based on the new column order
df_event_log_visits = df_event_log_visits[columns]

df_event_log_visits.head()

In [None]:
df_event_log_visits.info()


# 2) Dealing with missing values

## DOB

In [None]:
# DOB is the first column with missing values - there are 4 missing values in total, so these rows can be dropped

# Removing records with missing DOB
df_event_log_visits = df_event_log_visits.dropna(subset=['DOB'])

df_event_log_visits.info()



# Next is DOD, however the missing value in this case indicates that the patient is still alive, hence getting rid of these values 
# would lead to losing important information - this column will be dealt with in later step.


## POSTAL_CODE

In [None]:
# The postal code is another column with missing values; they are filled in later
# based on the hospital that the patient visited.

# Flag the rows where ED_POSTAL_CODE is missing and keep only those rows
postal_code_is_missing = df_event_log_visits['ED_POSTAL_CODE'].isna()
missing_postal_code = df_event_log_visits[postal_code_is_missing]

# Printing the rows with missing ED_POSTAL_CODE
print(missing_postal_code)

In [None]:
# Counting occurrences of each FACILITY_ID in the DataFrame with missing postal codes
facility_counts = missing_postal_code['FACILITY_ID'].value_counts()

# Printing the counts
print(facility_counts)


In [None]:
# Postal code used as a stand-in for each facility when the patient's own
# postal code is missing (keys are FACILITY_ID values).
facility_postal_map = {
    'HSC': 'A1B 3V6',
    'GBC': 'A5A 1K3',
    'SCM': 'A1C 5B8',
    'BUR': 'A0E 1E0',
    'CGH': 'A1Y 1A4'
}

# Postal code implied by the visited facility (NaN when the facility is not in the map)
imputed_postal_code = df_event_log_visits['FACILITY_ID'].map(facility_postal_map)

# Fill only the missing 'ED_POSTAL_CODE' values. The result is assigned back to the
# column: calling fillna(inplace=True) on df[col] operates on an intermediate object,
# which raises a FutureWarning and silently does nothing under pandas Copy-on-Write.
df_event_log_visits['ED_POSTAL_CODE'] = df_event_log_visits['ED_POSTAL_CODE'].fillna(imputed_postal_code)


In [None]:
# Checking if there are any remaining missing values in 'ED_POSTAL_CODE'
print(df_event_log_visits['ED_POSTAL_CODE'].isnull().sum())


In [None]:
df_event_log_visits.info()

## CTAS

In [None]:
# The next column with missing values is CTAS

# Histogram of the CTAS scores, labelled through the returned axes
ctas_ax = df_event_log_visits['CTAS'].hist(bins=5)
ctas_ax.set_title('Distribution of CTAS Scores')
ctas_ax.set_xlabel('CTAS Score')
ctas_ax.set_ylabel('Frequency')
plt.show()

# Descriptive statistics
print(df_event_log_visits['CTAS'].describe())


In [None]:
# Rows where CTAS is missing
ctas_is_missing = df_event_log_visits['CTAS'].isna()
missing_ctas_df = df_event_log_visits[ctas_is_missing]

# How often each depart disposition (e.g. 'TLWBS' or 'RLWBS') occurs among those rows
missing_ctas_dispositions = missing_ctas_df['DEPART_DISPOSITION_ID'].value_counts()
print(missing_ctas_dispositions)


In [None]:

# Counts of each DEPART_DISPOSITION_ID over the whole dataset
total_dispositions = df_event_log_visits['DEPART_DISPOSITION_ID'].value_counts()

# Share of each disposition within the missing-CTAS rows and within the whole dataset
missing_ctas_proportions = missing_ctas_dispositions.div(missing_ctas_dispositions.sum())
total_proportions = total_dispositions.div(total_dispositions.sum())

# Side-by-side comparison; the two series are aligned on the disposition code
comparison_df = pd.DataFrame({
    'Missing_CTAS_Proportion': missing_ctas_proportions,
    'Total_Proportion': total_proportions,
})

# Display the DataFrame
print(comparison_df)


In [None]:
# Reported proportions: 'RLWBS' makes up 25.02% of the missing-CTAS rows versus 0.26% of the
# entire dataset, i.e. it is strongly over-represented; 'TLWBS' makes up 6.41% of the
# missing-CTAS rows versus 10.80% overall. Missing CTAS is therefore kept as its own
# category and coded "9" instead of being imputed with a real score.

# Coding missing CTAS values as 9. The result is assigned back to the column: calling
# fillna(inplace=True) on df[col] operates on an intermediate object, which raises a
# FutureWarning and silently does nothing under pandas Copy-on-Write.
df_event_log_visits['CTAS'] = df_event_log_visits['CTAS'].fillna(9)


# Verifying no missing values
print(f"After imputation, remaining missing values in 'CTAS': {df_event_log_visits['CTAS'].isnull().sum()}")

# The distribution after imputation. One bin per integer code (edges 0.5, 1.5, ..., 9.5):
# with bins=5 over the range 1..9 each bin is 1.6 wide and merges neighbouring scores.
ctas_bin_edges = [edge + 0.5 for edge in range(0, 10)]
df_event_log_visits['CTAS'].hist(bins=ctas_bin_edges)
plt.title('Distribution of CTAS Scores After Imputation')
plt.xlabel('CTAS Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
df_event_log_visits.info()

## REASON_FOR_VISIT 

In [None]:
df_event_log_visits['REASON_FOR_VISIT'].nunique()

In [None]:
# REASON_FOR_VISIT has too many distinct values - the column is dropped
df_event_log_visits.drop('REASON_FOR_VISIT', axis=1, inplace=True)


## PRESENTING_COMPLAINT 

In [None]:
df_event_log_visits['PRESENTING_COMPLAINT'].nunique()

In [None]:
# Drop rows with missing values in the PRESENTING_COMPLAINT column
# (the frame is modified in place, so re-running earlier cells is needed to get the rows back)
df_event_log_visits.dropna(subset=['PRESENTING_COMPLAINT'], inplace=True)


In [None]:
df_event_log_visits.info()

## DEPART_DISPOSITION_ID

In [None]:
# Departing disposition is one of the values I will be predicting with the model 
# (TRIAGED LEFT W/O BEING SEEN and REGISTERED LEFT W/O BEING SEEN)
# hence I decided to not deal with the missing values at this moment - however, we only need one of the columns
df_event_log_visits['DEPART_DISPOSITION_DESC'].unique()

In [None]:
# Only DEPART_DISPOSITION_ID is kept; the description column is dropped
df_event_log_visits.drop(columns=['DEPART_DISPOSITION_DESC'], inplace=True)

In [None]:
dispositions_counts =  df_event_log_visits['DEPART_DISPOSITION_ID'].value_counts()

# Print the counts
print(dispositions_counts)

# 3) Preprocessing columns
In the next steps I am preprocessing and cleaning data in all columns that will be used by the deep learning model.

## SEX

In [None]:
# preprocessing the SEX column

df_event_log_visits['SEX'].value_counts()

In [None]:
# Numeric codes for SEX: 'M' -> 0, 'F' -> 1, 'U' -> 2
# (any value that is not a key of this mapping becomes NaN)
sex_codes = {'M': 0, 'F': 1, 'U': 2}
df_event_log_visits['SEX'] = df_event_log_visits['SEX'].map(sex_codes)
df_event_log_visits

## DATE OF BIRTH

In [None]:
# Preprocessing the DOB column

# Converting the 'DOB' column to datetime (no `errors=` argument is passed, so an unparseable value raises)
df_event_log_visits['DOB'] = pd.to_datetime(df_event_log_visits['DOB'])


## PRESENTING_COMPLAINT

In [None]:

from sklearn.preprocessing import LabelEncoder


# Apply label encoding: each distinct PRESENTING_COMPLAINT value is replaced by an integer code.
# The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
df_event_log_visits['PRESENTING_COMPLAINT_ENCODED'] = label_encoder.fit_transform(df_event_log_visits['PRESENTING_COMPLAINT'])

# Drop the original PRESENTING_COMPLAINT column; only the encoded version is kept
df_event_log_visits.drop(columns=['PRESENTING_COMPLAINT'], inplace=True)

df_event_log_visits.head(5)

## 3.1) Creating new features based on the old features
Here, I am creating new features based on the available features in the dataset.

## DATE OF DEATH and IS_DECEASED

In [None]:
# Binary indicator derived from DOD: 1 when a date of death is recorded, 0 otherwise
has_date_of_death = df_event_log_visits['DOD'].notna()
df_event_log_visits['Is_Deceased'] = has_date_of_death.astype(int)

# Parse DOD as datetime; values that cannot be parsed become NaT
df_event_log_visits['DOD'] = pd.to_datetime(df_event_log_visits['DOD'], errors='coerce')

df_event_log_visits.info()

In [None]:

# Departure dispositions among the rows flagged as deceased
deceased_rows = df_event_log_visits['Is_Deceased'] == 1
deceased_dispositions = df_event_log_visits.loc[deceased_rows, 'DEPART_DISPOSITION_ID'].value_counts()

print("Distribution of departure dispositions for deceased cases:")
print(deceased_dispositions)

# Cross-tabulation to see the relationship between the two columns more clearly
crosstab = pd.crosstab(df_event_log_visits['Is_Deceased'], df_event_log_visits['DEPART_DISPOSITION_ID'])

print("\nCross-tabulation of 'Is_Deceased' and 'DEPART_DISPOSITION_ID':")
print(crosstab)


In [None]:

# Re-parse DOD (unparseable values become NaT), strip any timezone information and then
# label the values as UTC so that DOD is a timezone-aware column.
df_event_log_visits['DOD'] = pd.to_datetime(df_event_log_visits['DOD'], errors='coerce').dt.tz_localize(None).dt.tz_localize('UTC')

# Comparisons of DOD against the time window of each row. Each flag is 1 when the comparison
# holds and 0 otherwise; rows without a DOD compare as False and therefore get 0 in all three.
# NOTE(review): 'Start_Time' and 'Activity_End_Time' are not explicitly converted to datetime
# before this cell in the visible notebook - confirm they compare correctly against a
# UTC-aware column.
# NOTE(review): the comparison uses the row's own Start_Time / Activity_End_Time, so the
# flags are evaluated per row rather than per visit - confirm this is intended.
df_event_log_visits['Died_Before_Visit'] = (df_event_log_visits['DOD'] < df_event_log_visits['Start_Time']).astype(int)
df_event_log_visits['Died_During_Visit'] = ((df_event_log_visits['DOD'] >= df_event_log_visits['Start_Time']) & 
                                            (df_event_log_visits['DOD'] <= df_event_log_visits['Activity_End_Time'])).astype(int)
df_event_log_visits['Died_After_Visit'] = (df_event_log_visits['DOD'] > df_event_log_visits['Activity_End_Time']).astype(int)


In [None]:
# Filtering for rows where 'Is_Deceased' is 1 and display the first few rows
deceased_visits = df_event_log_visits[df_event_log_visits['Is_Deceased'] == 1]
deceased_visits.head()


In [None]:
# Number of rows carrying each of the three death-timing flags
died_before_visit_count = len(df_event_log_visits[df_event_log_visits['Died_Before_Visit'] == 1])
print("Number of cases where death occurred before the visit:", died_before_visit_count)

died_during_visit_count = len(df_event_log_visits[df_event_log_visits['Died_During_Visit'] == 1])
print("Number of cases where death occurred during the visit:", died_during_visit_count)

died_after_visit_count = len(df_event_log_visits[df_event_log_visits['Died_After_Visit'] == 1])
print("Number of cases where death occurred after the visit:", died_after_visit_count)


## AREA_TYPE

In [None]:
# identifying rural or urban area based on the postal code according to
# https://www.canadapost-postescanada.ca/cpc/en/support/articles/addressing-guidelines/postal-codes.page

# Function to determine if a postal code is urban or rural
def urban_rural_classifier(postal_code):
    """Classify a postal code as rural (0) or urban (1).

    The decision is based on the second character of the postal code:
    '0' means rural, anything else means urban.

    Parameters
    ----------
    postal_code : str
        Postal code such as 'A1B 3V6'. Surrounding whitespace is ignored.

    Returns
    -------
    int or float
        0 for rural, 1 for urban, NaN when the value is missing, not a string,
        or too short to have a second character.
    """
    # Missing values (None / NaN) are not subscriptable and used to raise a TypeError
    if not isinstance(postal_code, str):
        return float('nan')

    # Leading whitespace would otherwise shift the character that is inspected
    code = postal_code.strip()
    if len(code) < 2:
        return float('nan')

    return 0 if code[1] == '0' else 1

# Classify every postal code element-wise
df_event_log_visits['Area_Type'] = df_event_log_visits['ED_POSTAL_CODE'].map(urban_rural_classifier)

# Verify by displaying the first rows of the two columns
print(df_event_log_visits[['ED_POSTAL_CODE', 'Area_Type']].head())


In [None]:
df_event_log_visits.info()

## COORDINATES

In [None]:
# Adding a latitude and longitude table from GeoNames (tab-delimited file, column names supplied here)
file_path = 'CA_full.txt'
postal_codes = pd.read_csv(file_path, delimiter='\t', names=[
    'Country', 'Postal_Code', 'Place_Name', 'Province', 'Province_Code',
    'Unnamed5', 'Unnamed6', 'Unnamed7', 'Unnamed8', 'Latitude', 'Longitude', 'Extra'
], skiprows=1)  # NOTE(review): skiprows=1 discards the first line of the file - confirm that line is a header and not a data record

# Dropping the unnecessary columns
postal_codes.drop(columns=['Unnamed5', 'Unnamed6', 'Unnamed7', 'Unnamed8', 'Extra'], inplace=True)

# Displaying the dataframe to ensure it loaded correctly
print(postal_codes.head())


In [None]:
# Number of postal codes that appear more than once in the lookup table
duplicate_postal_codes = postal_codes['Postal_Code'].duplicated().sum()
print(duplicate_postal_codes)

# Keep the first row of every postal code so the lookup has one row per code
postal_codes = postal_codes.drop_duplicates(subset='Postal_Code', keep='first')


In [None]:
# Attach the lookup columns (including Latitude / Longitude) to every row through its postal code
df_event_log_visits = pd.merge(
    df_event_log_visits,
    postal_codes,
    how='left',
    left_on='ED_POSTAL_CODE',
    right_on='Postal_Code',
)

# Number of rows that did not receive a latitude / longitude from the merge
print(df_event_log_visits[['Latitude', 'Longitude']].isnull().sum())


In [None]:
df_event_log_visits.info()

In [None]:
# Some of the postal codes cannot be found on GeoNames as they are - the closest coordinates
# can be found by using only the first three characters of the postal code
postal_codes['First_3_Digits'] = postal_codes['Postal_Code'].str[:3]


In [None]:
# Rows where 'Latitude' or 'Longitude' is missing. An explicit copy is taken so that adding
# a column below modifies an independent frame instead of a slice of the main dataset
# (assigning to a filtered slice raises SettingWithCopyWarning).
coordinates_missing = df_event_log_visits['Latitude'].isnull() | df_event_log_visits['Longitude'].isnull()
df_event_log_visits_missing_lat = df_event_log_visits[coordinates_missing].copy()

# Creating a new column with the first three characters of the postal code
df_event_log_visits_missing_lat['First_3_Digits'] = df_event_log_visits_missing_lat['ED_POSTAL_CODE'].str[:3]
df_event_log_visits_missing_lat

In [None]:

# Optimize data types: store the three-character prefix as a categorical in both frames
df_event_log_visits_missing_lat['First_3_Digits'] = df_event_log_visits_missing_lat['First_3_Digits'].astype('category')
postal_codes['First_3_Digits'] = postal_codes['First_3_Digits'].astype('category')

# Select only relevant columns for the merge.
# From here on `postal_codes` no longer contains 'Postal_Code' or the place columns.
df_event_log_visits_missing_lat = df_event_log_visits_missing_lat[['First_3_Digits']]
postal_codes = postal_codes[['First_3_Digits', 'Latitude', 'Longitude']]




In [None]:
# Perform the merge on the three-character prefix.
# NOTE(review): merge() returns a frame with a fresh 0..n-1 index, so the row labels of
# `df_event_log_visits_missing_lat` are not preserved in `df_merged`.
# NOTE(review): `postal_codes` was de-duplicated on the full postal code only, so it presumably
# holds several rows per prefix and each left row can be repeated in the result - verify.
df_merged = df_event_log_visits_missing_lat.merge(postal_codes, on='First_3_Digits', how='left', suffixes=('', '_pc'))

# Display the merged DataFrame
print(df_merged.head())


In [None]:
# adding new column to the original dataset to merge on
df_event_log_visits['First_3_Digits'] = df_event_log_visits['ED_POSTAL_CODE'].str[:3]
df_event_log_visits.head()

In [None]:
# Find the rows in the main dataset where latitude and longitude are missing
missing_indices = df_event_log_visits[df_event_log_visits['Latitude'].isna() & df_event_log_visits['Longitude'].isna()].index

# One coordinate pair per three-character prefix: the mean position of all lookup rows
# sharing that prefix. The prefix is cast to str so the lookup index is a plain string index.
fsa_centroids = (
    postal_codes
    .assign(First_3_Digits=postal_codes['First_3_Digits'].astype(str))
    .groupby('First_3_Digits')[['Latitude', 'Longitude']]
    .mean()
)

# Fill only the missing coordinates, looking each row up by its own prefix.
# Assigning columns of a separately merged frame through .loc aligns on index labels; a merge
# result carries a fresh 0..n-1 index, so that approach picks up values from unrelated rows.
row_prefix = df_event_log_visits['First_3_Digits']
df_event_log_visits['Latitude'] = df_event_log_visits['Latitude'].fillna(row_prefix.map(fsa_centroids['Latitude']))
df_event_log_visits['Longitude'] = df_event_log_visits['Longitude'].fillna(row_prefix.map(fsa_centroids['Longitude']))

# Check if the missing values are updated
print(df_event_log_visits.loc[missing_indices, ['First_3_Digits', 'Latitude', 'Longitude']])


In [None]:
# an overview of all missing values in the DataFrame
print(df_event_log_visits.info())


In [None]:
# Rows that still lack a latitude or a longitude are removed (3 rows)
coordinate_columns = ['Latitude', 'Longitude']
df_event_log_visits = df_event_log_visits.dropna(subset=coordinate_columns, how='any')

# Verifying the result
print(df_event_log_visits[coordinate_columns].isnull().sum())


In [None]:

# 'First_3_Digits' holds the first three characters of the postal code

# Prefixes corresponding to Conception Bay ('A1X', 'A1W' - CBS , 'A0A', 'A1Y' - CBN)
conception_bay_postal_codes = ['A1X', 'A1W', 'A0A', 'A1Y']

# Keep only the rows whose prefix is one of the Conception Bay prefixes
in_conception_bay = df_event_log_visits['First_3_Digits'].isin(conception_bay_postal_codes)
cbn_data = df_event_log_visits[in_conception_bay]

# Prefix (rows) against hospital site (columns)
cross_tab = pd.crosstab(index=cbn_data['First_3_Digits'], columns=cbn_data['FACILITY_NAME'])

# Output the cross-tabulation
cross_tab


## DISTANCE_TO_HOSPITAL

In [None]:
from geopy.distance import geodesic

# (latitude, longitude) of every hospital, keyed by FACILITY_NAME
hospital_coords = {
    'Health Sciences Centre - St. Johns': (47.57173844188426, -52.7428752808959),
    'St. Clares Mercy Hospital - St. Johns': (47.55783035337661, -52.72164112844875),
    'Dr. G.B. Cross Memorial Hospital - Clarenville': (48.165186482963676, -53.98449946306605),
    'Carbonear General Hospital - Carbonear': (47.725414948091, -53.226560474732224),
    'Burin Peninsula Health care Centre - Burin': (47.10500591911938, -55.19465935942642),
}


def calculate_distance(row, hospital_name):
    """Return the geodesic distance in kilometres between a row's coordinates and a hospital.

    `row` must provide 'Latitude' and 'Longitude'; `hospital_name` is looked up in
    `hospital_coords`. NaN is returned when the hospital is not in the dictionary.
    """
    origin = (row['Latitude'], row['Longitude'])
    destination = hospital_coords.get(hospital_name)
    if destination is not None:
        return geodesic(origin, destination).km
    return float('nan')

def _distance_to_visited_hospital(row):
    """Distance between the row's coordinates and the hospital named in its FACILITY_NAME."""
    return calculate_distance(row, row['FACILITY_NAME'])


# Row-wise distance to the hospital of the visit
df_event_log_visits['Distance_to_Hospital'] = df_event_log_visits.apply(_distance_to_visited_hospital, axis=1)

# Displaying DataFrame with the calculated distances
print(df_event_log_visits[['VISIT_ID', 'FACILITY_NAME', 'Distance_to_Hospital']].head())


## IS_NL_HOLIDAY

In [None]:
# Fixed-date Newfoundland and Labrador holidays in "MM-DD" format.
# Labour Day and Thanksgiving are deliberately not listed here: they fall on a different
# calendar date every year and are detected by rule inside is_nl_holiday().
nl_holidays = {
    "01-01",  # New Year's Day
    "03-17",  # St. Patrick's Day
    "04-23",  # St. George's Day
    "06-24",  # Discovery Day
    "07-01",  # Canada Day
    "11-11",  # Remembrance Day
    "12-25",  # Christmas Day
    "12-26"   # Boxing Day
}


def is_nl_holiday(date):
    """Return True when `date` falls on one of the holidays covered by this notebook.

    Parameters
    ----------
    date : datetime.datetime or pandas.Timestamp
        The moment to check; only its calendar date matters.

    Returns
    -------
    bool
        True for a fixed-date holiday listed in `nl_holidays`, for Labour Day
        (first Monday of September) and for Thanksgiving (second Monday of October).
        False otherwise, including for missing values (NaT / NaN / None).
    """
    # A missing timestamp cannot be a holiday (and NaT does not support strftime)
    if pd.isna(date):
        return False

    # Fixed-date holidays
    if date.strftime("%m-%d") in nl_holidays:
        return True

    # Moving holidays, both on a Monday (weekday() == 0)
    if date.weekday() == 0:
        # First Monday of September is always within days 1..7
        if date.month == 9 and date.day <= 7:
            return True
        # Second Monday of October is always within days 8..14
        if date.month == 10 and 8 <= date.day <= 14:
            return True

    return False


In [None]:
# 'Start_Time' must be a datetime before the holiday check can be applied
df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])

# Element-wise holiday check stored in a new column
df_event_log_visits['Is_NL_Holiday'] = df_event_log_visits['Start_Time'].map(is_nl_holiday)

# Displaying results
print(df_event_log_visits[['Start_Time', 'Is_NL_Holiday']].head())



## DAY_OF_WEEK and IS_WEEKEND

In [None]:
# Day of the week of the start time (0 = Monday, 6 = Sunday)
start_times = pd.to_datetime(df_event_log_visits['Start_Time'])
df_event_log_visits['Day_of_Week'] = start_times.dt.weekday

# Saturday (5) and Sunday (6) count as weekend
df_event_log_visits['Is_Weekend'] = df_event_log_visits['Day_of_Week'].ge(5)


In [None]:
df_event_log_visits.head()

## VISIT_SEASON

In [None]:
# Month numbers belonging to each season
seasons = {
    "Winter": (1, 2, 12),
    "Spring": (3, 4, 5),
    "Summer": (6, 7, 8),
    "Fall": (9, 10, 11)
}


def get_season(month):
    """Return the name of the season that contains `month`, or None when no season does."""
    matching_seasons = (name for name, months in seasons.items() if month in months)
    return next(matching_seasons, None)

# Season of the month in which each row's Start_Time falls
start_month = pd.to_datetime(df_event_log_visits['Start_Time']).dt.month
df_event_log_visits['Visit_Season'] = start_month.map(get_season)


In [None]:
df_event_log_visits.head()

## VISIT_FREQUENCY

In [None]:
# 'Start_Time' must be a datetime for the sorting and the per-patient maximum below
df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])

# Chronological order per patient
df_event_log_visits.sort_values(by=['SID', 'Start_Time'], inplace=True)

# Latest Start_Time of each SID, broadcast to every row of that SID
latest_start = df_event_log_visits.groupby('SID')['Start_Time'].transform('max')
df_event_log_visits['Latest_Visit'] = latest_start

# Rows whose Start_Time equals that latest Start_Time
df_event_log_visits['Is_Latest_Visit'] = df_event_log_visits['Start_Time'].eq(latest_start)

# Number of distinct visits per SID, minus one, never below zero.
# NOTE(review): every row of a patient receives the same value, which is based on all of
# the patient's visits in the data, not only those before the row's own visit - confirm
# this is the intended meaning of 'Visit_Frequency'.
visit_counts = (
    df_event_log_visits.groupby('SID')['VISIT_ID']
    .nunique()
    .sub(1)
    .clip(lower=0)
)

# Mapping the adjusted visit counts back to the rows through SID
df_event_log_visits['Visit_Frequency'] = df_event_log_visits['SID'].map(visit_counts)

# Display the result
print(df_event_log_visits[['SID', 'VISIT_ID', 'Visit_Frequency']].head())


In [None]:
# The two helper columns are no longer needed
df_event_log_visits.drop(['Latest_Visit', 'Is_Latest_Visit'], axis=1, inplace=True)


In [None]:
df_event_log_visits.describe()

In [None]:
# Disposition codes that indicate the patient left without being seen
lwbs_codes = ['TLWBS', 'RLWBS']

# 1 when the row's disposition is one of those codes, 0 otherwise
left_without_being_seen = df_event_log_visits['DEPART_DISPOSITION_ID'].isin(lwbs_codes)
df_event_log_visits['Is_LWBS'] = left_without_being_seen.astype(int)



In [None]:

# Prior_LWBS: for every visit, the number of EARLIER visits of the same patient (SID)
# that ended as "left without being seen".
#
# The value is computed on one row per visit, in chronological order, and excludes the
# visit's own outcome. Counting the current visit would leak Is_LWBS, which is derived
# from the disposition being predicted, into the feature. The latest visit of a patient
# also receives the count of that patient's earlier LWBS visits.

df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])

# Sorting the DataFrame by 'SID' and 'Start_Time'
df_event_log_visits.sort_values(by=['SID', 'Start_Time'], inplace=True)

# One row per visit: when it started and whether any of its rows is flagged as LWBS
visits_grouped = df_event_log_visits.groupby(['SID', 'VISIT_ID'])
visit_summary = pd.DataFrame({
    'Visit_Start': visits_grouped['Start_Time'].min(),
    'Visit_LWBS': visits_grouped['Is_LWBS'].max(),
}).reset_index()

# Chronological order of the visits of each patient (VISIT_ID only breaks ties)
visit_summary.sort_values(by=['SID', 'Visit_Start', 'VISIT_ID'], inplace=True)

# Running total of LWBS visits per patient minus the visit's own flag = strictly prior count
visit_summary['Prior_LWBS'] = (
    visit_summary.groupby('SID')['Visit_LWBS'].cumsum() - visit_summary['Visit_LWBS']
)
lwbs_per_visit = visit_summary[['SID', 'VISIT_ID', 'Prior_LWBS']]

# Merging the per-visit count back to the rows on both SID and VISIT_ID. A 'Prior_LWBS'
# column left over from an earlier run is removed first so the cell can be re-run safely.
df_event_log_visits = (
    df_event_log_visits
    .drop(columns=['Prior_LWBS'], errors='ignore')
    .merge(lwbs_per_visit, on=['SID', 'VISIT_ID'], how='left')
)

# Rows that could not be matched to a visit (missing SID / VISIT_ID) get 0
df_event_log_visits['Prior_LWBS'] = df_event_log_visits['Prior_LWBS'].fillna(0)

# Display the DataFrame to verify results
print(df_event_log_visits[['SID', 'VISIT_ID', 'Prior_LWBS']].head())


In [None]:
# Filtering rows for SID 'STDY111131' and displaying specific columns
filtered_rows = df_event_log_visits[df_event_log_visits['SID'] == 'STDY111131']
print(filtered_rows[['SID', 'VISIT_ID', 'Prior_LWBS', 'DEPART_DISPOSITION_ID']])


In [None]:
df_event_log_visits.columns

In [None]:
df_event_log_visits.describe()

## ED_BUSINESS_HOURLY

In [None]:

# Hour bucket of every row (Start_Time rounded down to the hour)
start_hour = df_event_log_visits['Start_Time'].dt.floor('H')

# Number of rows whose Start_Time falls into each hour bucket.
# NOTE(review): this counts rows across all facilities; if several rows exist per visit the
# value is not a count of distinct patient arrivals - confirm this is intended.
arrival_counts = df_event_log_visits.groupby(start_hour).size()

# Mapping the hourly counts back to the rows
df_event_log_visits['ED_Business_Hourly'] = start_hour.map(arrival_counts)


print(df_event_log_visits[['Start_Time', 'ED_Business_Hourly']].head())


In [None]:

df_event_log_visits.head()

In [None]:
df_event_log_visits.info()

##  The acuity score for each hour for each site 

In [None]:
# Convert Start_Time and Activity_End_Time to datetime so they can be compared with the
# hourly timestamps used in the acuity calculation below
df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])
df_event_log_visits['Activity_End_Time'] = pd.to_datetime(df_event_log_visits['Activity_End_Time'])


In [None]:
# Calculate Inverse_CTAS as the reciprocal of the CTAS value.
# NOTE(review): missing CTAS values were coded as 9 earlier in this notebook, so those rows
# get 1/9 here - confirm that treating them like a real score is intended.
df_event_log_visits['Inverse_CTAS'] = 1 / df_event_log_visits['CTAS']


In [None]:

# Initialize an empty DataFrame to store acuity scores
acuity_scores = pd.DataFrame()

In [None]:


# Hourly acuity per facility.
#
# The table is rebuilt from scratch on every run, so re-running this cell no longer appends
# a second copy of every facility's rows to `acuity_scores`. The per-facility frames are
# collected in a list and concatenated once at the end.
facility_frames = []

# Iterate through each facility (rows without a FACILITY_ID cannot be attributed to a site)
for facility in df_event_log_visits['FACILITY_ID'].dropna().unique():
    # Filter data for the current facility
    facility_data = df_event_log_visits[df_event_log_visits['FACILITY_ID'] == facility]

    # Hourly index for the time span covered by the facility's data. The start is rounded
    # down to the hour so every timestamp in the index lies exactly on an hour; 'Hour' is
    # later joined against Start_Time rounded down to the hour, so the two must line up.
    start_time = facility_data['Start_Time'].min().floor('H')
    end_time = facility_data['Activity_End_Time'].max()
    hourly_index = pd.date_range(start=start_time, end=end_time, freq='H')

    # One record per hour for this facility
    hourly_acuity = []

    # Determine active cases for each hour
    for hour in hourly_index:
        # A row is considered active if its Start_Time is before or at the hour
        # and its Activity_End_Time is after or at the hour.
        active_cases = facility_data[(facility_data['Start_Time'] <= hour) &
                                     (facility_data['Activity_End_Time'] >= hour)]

        if not active_cases.empty:
            mean_inverse_ctas = active_cases['Inverse_CTAS'].mean()
            # The sample standard deviation of a single row is NaN
            std_inverse_ctas = active_cases['Inverse_CTAS'].std()
            mean_age = active_cases['VISIT_AGE'].mean()
            unique_presenting_complaints = active_cases['PRESENTING_COMPLAINT_ENCODED'].nunique()
        else:
            # Nothing active at this hour: all statistics are reported as 0
            mean_inverse_ctas = 0
            std_inverse_ctas = 0
            mean_age = 0
            unique_presenting_complaints = 0

        hourly_acuity.append({
            'FACILITY_ID': facility,
            'Hour': hour,
            'Mean_Inverse_CTAS': mean_inverse_ctas,
            'Std_Inverse_CTAS': std_inverse_ctas,
            'Mean_Age': mean_age,
            'Unique_Presenting_Complaints': unique_presenting_complaints
        })

    facility_frames.append(pd.DataFrame(hourly_acuity))

# Single concatenation of all facilities (an empty, correctly labelled table if there are none)
if facility_frames:
    acuity_scores = pd.concat(facility_frames, ignore_index=True)
else:
    acuity_scores = pd.DataFrame(columns=[
        'FACILITY_ID', 'Hour', 'Mean_Inverse_CTAS', 'Std_Inverse_CTAS',
        'Mean_Age', 'Unique_Presenting_Complaints'
    ])



In [None]:

# Display the first few rows of the acuity_scores DataFrame to verify the calculations
print(acuity_scores.head(20))

In [None]:
df_event_log_visits['Hour'] = df_event_log_visits['Start_Time'].dt.floor('H')


In [None]:
# Round both 'Hour' columns down to the hour so they can be used as a join key.
# (For df_event_log_visits this repeats the rounding applied when 'Hour' was created.)
df_event_log_visits['Hour'] = df_event_log_visits['Hour'].dt.floor('H')
acuity_scores['Hour'] = acuity_scores['Hour'].dt.floor('H')


In [None]:


# Attach the hourly acuity statistics to every row through its facility and hour bucket
merged_df = df_event_log_visits.merge(
    acuity_scores,
    on=['FACILITY_ID', 'Hour'],
    how='left',
)

merged_df.head()

In [None]:
merged_df.info()

In [None]:
# Dropping the column that is not needed based on my analysis
merged_df.drop(columns=['Hour'], inplace=True)


In [None]:

merged_df.info()

In [None]:
# Remove every row whose Std_Inverse_CTAS is missing.
# NOTE(review): the value is missing when the row found no matching facility/hour in the
# acuity table, and also when the standard deviation was computed from a single active row
# (the sample standard deviation of one value is NaN) - confirm that dropping those rows is intended.
merged_df = merged_df.dropna(subset=['Std_Inverse_CTAS'])
merged_df.info()

In [None]:
df_event_log_visits = merged_df
df_event_log_visits.info()

## IMAGING_DONE and LABS_DONE

In [None]:
# Paths use forward slashes: they work on Windows as well as on Linux/macOS, and avoid the
# invalid "\I" escape sequence that the backslash form puts inside a normal string literal.

# Loading the Excel file for imaging data
imaging_data = pd.read_excel("October2023_Data/INC10447_ED_VISIT_IMAGES_20230929.xlsx")

# Loading the CSV file for lab tests data
labs_data = pd.read_csv("October2023_Data/INC10447_ED_VISIT_LABS_20230929.csv")

# Displaying the first few rows to ensure correct data loading
print("Imaging Data:")
print(imaging_data.head())
print("Labs Data:")
print(labs_data.head())

In [None]:
# Parse the date columns of both extracts
for date_column in ('ED_SERVICE_DATE', 'IMAGE_REPORTED_DATE'):
    imaging_data[date_column] = pd.to_datetime(imaging_data[date_column])
for date_column in ('ED_SERVICE_DATE', 'LAB_REPORTED_DATE'):
    labs_data[date_column] = pd.to_datetime(labs_data[date_column])

# Date range of interest (both bounds inclusive).
# NOTE(review): the upper bound is compared as 2023-04-30 00:00:00, so if ED_SERVICE_DATE
# carries a time of day, records later on 30 April are excluded - confirm.
start_date = '2022-04-01'
end_date = '2023-04-30'

# Imaging records whose ED service date lies in the range
imaging_in_range = imaging_data['ED_SERVICE_DATE'].between(start_date, end_date)
filtered_imaging_data = imaging_data[imaging_in_range]

# Lab records whose ED service date lies in the range
labs_in_range = labs_data['ED_SERVICE_DATE'].between(start_date, end_date)
filtered_labs_data = labs_data[labs_in_range]

# Check the filtered data
print("Filtered Imaging Data:", filtered_imaging_data.head())
print("Filtered Labs Data:", filtered_labs_data.head())


In [None]:
filtered_imaging_data.info()

In [None]:
filtered_labs_data.info()

In [None]:
import pandas as pd

# Distinct test names that occur inside the study window, in order of first appearance
unique_images = pd.unique(filtered_imaging_data['IMAGE_TEST_NAME'])
unique_labs = pd.unique(filtered_labs_data['LAB_TEST_NAME'])

# Report how many distinct names each source contains
print(f"Unique image test names: {len(unique_images)}")
print(f"Unique lab test names: {len(unique_labs)}")


In [None]:
# Number of imaging / lab records per visit (one entry per ED_VISIT_ID)
image_counts = filtered_imaging_data.groupby('ED_VISIT_ID').size()
lab_counts = filtered_labs_data.groupby('ED_VISIT_ID').size()

# Smallest, largest and mean number of records per visit, for each source
image_min, image_max, image_avg = image_counts.min(), image_counts.max(), image_counts.mean()
lab_min, lab_max, lab_avg = lab_counts.min(), lab_counts.max(), lab_counts.mean()

print(f"Image tests - Min: {image_min}, Max: {image_max}, Avg: {image_avg:.2f}")
print(f"Lab tests - Min: {lab_min}, Max: {lab_max}, Avg: {lab_avg:.2f}")


In [None]:
# Aggregate Imaging Data by visit: one row per (SID, ED_VISIT_ID) holding the distinct test
# names as a comma-separated string and the number of distinct tests
imaging_summary = filtered_imaging_data.groupby(['SID', 'ED_VISIT_ID'])['IMAGE_TEST_NAME'].agg([
    ('Imaging_Tests', lambda x: ', '.join(x.unique())),  # Concatenate unique test names
    ('Num_Imaging_Tests', 'nunique')                     # Count unique tests
]).reset_index()

# Aggregate Labs Data by visit in the same way
labs_summary = filtered_labs_data.groupby(['SID', 'ED_VISIT_ID'])['LAB_TEST_NAME'].agg([
    ('Lab_Tests', lambda x: ', '.join(x.unique())),       # Concatenate unique lab names
    ('Num_Lab_Tests', 'nunique')                          # Count unique labs
]).reset_index()

# Left-merge both summaries onto the event log; visits without any test get NaN
df_event_log_visits = df_event_log_visits.merge(imaging_summary, how='left', left_on=['SID', 'VISIT_ID'], right_on=['SID', 'ED_VISIT_ID'])
df_event_log_visits = df_event_log_visits.merge(labs_summary, how='left', left_on=['SID', 'VISIT_ID'], right_on=['SID', 'ED_VISIT_ID'])

# Each merge brings its own ED_VISIT_ID key column; the name clash in the second merge makes
# pandas suffix them with _x / _y. Both duplicate VISIT_ID, so drop them.
df_event_log_visits.drop(columns=['ED_VISIT_ID_x', 'ED_VISIT_ID_y'], inplace=True)

# Replace NaNs in the new columns with defaults. The result is assigned back on purpose:
# `df[[...]].fillna(..., inplace=True)` only fills a temporary copy of the selected columns
# and leaves the DataFrame itself unchanged.
text_columns = ['Imaging_Tests', 'Lab_Tests']
count_columns = ['Num_Imaging_Tests', 'Num_Lab_Tests']
df_event_log_visits[text_columns] = df_event_log_visits[text_columns].fillna('None')
df_event_log_visits[count_columns] = df_event_log_visits[count_columns].fillna(0)

# Display to verify
print(df_event_log_visits.head())


In [None]:
# Inspect the event log after the imaging / lab summary columns were added
df_event_log_visits.info()

In [None]:
# Spot check: pull every imaging and lab record belonging to one specific visit
imaging_presence = filtered_imaging_data.loc[filtered_imaging_data['ED_VISIT_ID'].eq('VISIT2354278')]
labs_presence = filtered_labs_data.loc[filtered_labs_data['ED_VISIT_ID'].eq('VISIT2354278')]

# Show both result sets, each under its own heading
print("Imaging Data:", imaging_presence, "\nLabs Data:", labs_presence, sep="\n")




In [None]:
# Spot check: pull every imaging and lab record belonging to a second specific visit
imaging_presence = filtered_imaging_data.loc[filtered_imaging_data['ED_VISIT_ID'].eq('VISIT2234433')]
labs_presence = filtered_labs_data.loc[filtered_labs_data['ED_VISIT_ID'].eq('VISIT2234433')]

# Show both result sets, each under its own heading
print("Imaging Data:", imaging_presence, "\nLabs Data:", labs_presence, sep="\n")




In [None]:
# All event-log rows of the visit checked above, to compare against the raw test records
is_checked_visit = df_event_log_visits['VISIT_ID'].eq('VISIT2234433')
filtered_data = df_event_log_visits.loc[is_checked_visit]

# Last expression of the cell, so the notebook renders it
filtered_data


In [None]:
# Visits with no imaging / lab records have NaN counts after the left merges; treat them as 0.
# The filled frame is assigned back instead of calling `df[col].fillna(0, inplace=True)`:
# that form is chained in-place assignment, which pandas 2.x warns about and which does
# nothing once Copy-on-Write is active.
df_event_log_visits = df_event_log_visits.fillna({'Num_Imaging_Tests': 0, 'Num_Lab_Tests': 0})


In [None]:
# Preview the first rows after filling the missing test counts
df_event_log_visits.head()

In [None]:
# Check dtypes and non-null counts after filling the missing test counts
df_event_log_visits.info()

In [None]:
# The comma-separated test-name strings are not kept; only the numeric counts stay
df_event_log_visits = df_event_log_visits.drop(['Imaging_Tests', 'Lab_Tests'], axis=1)


In [None]:
# Confirm that 'Imaging_Tests' and 'Lab_Tests' were removed
df_event_log_visits.info()

## Number of lab tests and imaging tests on a daily basis

In [None]:
# Columns available before the daily test totals are computed
df_event_log_visits.info()

In [None]:
# Make sure Start_Time is a datetime, then derive the calendar day (as a midnight timestamp)
# that serves as the grouping key
df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])
df_event_log_visits['Date'] = pd.to_datetime(df_event_log_visits['Start_Time'].dt.date)

# Total imaging / lab test counts per facility and calendar day.
# NOTE(review): this sums the Num_* values of every event-log row. Those counts were merged
# in per visit, so a visit with several activity rows appears to contribute its count once
# per row — confirm that this is the intended definition of a daily total.
daily_tests = (
    df_event_log_visits
    .groupby(['FACILITY_ID', 'Date'])[['Num_Imaging_Tests', 'Num_Lab_Tests']]
    .sum()
    .rename(columns={
        'Num_Imaging_Tests': 'Daily_Imaging_Tests',
        'Num_Lab_Tests': 'Daily_Lab_Tests',
    })
    .reset_index()
)

# Verify the calculated statistics
print(daily_tests.head())

# Keep the merge key as datetime64 so it matches the event log's 'Date' column
daily_tests['Date'] = pd.to_datetime(daily_tests['Date'])




In [None]:
# Rebuild the calendar-day merge key on the event log (same derivation as in the cell that
# created daily_tests) and make sure both sides hold it as datetime64
df_event_log_visits['Start_Time'] = pd.to_datetime(df_event_log_visits['Start_Time'])
df_event_log_visits['Date'] = pd.to_datetime(df_event_log_visits['Start_Time'].dt.date)
daily_tests['Date'] = pd.to_datetime(daily_tests['Date'])

# Attach the per-facility daily totals to every event-log row, then discard the helper key
df_event_log_visits = (
    df_event_log_visits
    .merge(daily_tests, how='left', on=['FACILITY_ID', 'Date'])
    .drop(columns=['Date'])
)

df_event_log_visits.info()

## Deleting columns that were created during the creation of new features and are not needed anymore

In [None]:
# Helper columns left over from feature construction that are no longer needed
columns_to_drop = [
    'FACILITY_NAME', 'Country', 'Postal_Code', 'Place_Name',
    'Province', 'Province_Code', 'First_3_Digits',
    'DOB', 'DOD', 'Latitude', 'Longitude', 'ED_POSTAL_CODE',
]

# Remove them in one go (raises KeyError if any of them is already missing)
df_event_log_visits = df_event_log_visits.drop(columns=columns_to_drop)

# Preview the result
print(df_event_log_visits.head())


In [None]:
# Confirm that the helper columns were removed
df_event_log_visits.info()

In [None]:
# One-hot encode 'FACILITY_ID': the original column is replaced by one indicator column per
# facility, named 'FACILITY_ID_<value>' and appended after the remaining columns.
df_event_log_visits = pd.get_dummies(df_event_log_visits, columns=['FACILITY_ID'])


In [None]:

# Indicator columns 'Activity_<value>' for every distinct activity
dummies = pd.get_dummies(df_event_log_visits['Activity'], prefix='Activity')

# Append them to the event log; the original 'Activity' column stays for now
df_event_log_visits = pd.concat((df_event_log_visits, dummies), axis='columns')

# Preview the result
print(df_event_log_visits.head())


In [None]:
# Binary flag: 1 when the departure disposition is one of the two LWBS codes, otherwise 0
# (rows with a missing disposition also get 0)
is_lwbs = df_event_log_visits['DEPART_DISPOSITION_ID'].isin(['TLWBS', 'RLWBS'])
df_event_log_visits['Is_LWBS'] = is_lwbs.astype(int)


In [None]:
# Inspect the columns after one-hot encoding and the Is_LWBS flag were added
df_event_log_visits.info()

In [None]:
# 'Activity' is now represented by its indicator columns, so the original can go
columns_to_drop = ['Activity']
df_event_log_visits = df_event_log_visits.drop(columns=columns_to_drop)


In [None]:
# Converting 'Activity_End_Time' to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising
df_event_log_visits['Activity_End_Time'] = pd.to_datetime(df_event_log_visits['Activity_End_Time'], errors='coerce')


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Category order for 'TIME_OF_THE_DAY'; the position in this list becomes the encoded value
time_order = [[
    'Early Morning to Morning',
    'Late Morning to Late Afternoon',
    'Evening to Night',
]]

# Fit the encoder on the column, then write the encoded values to a new column
time_of_day_values = df_event_log_visits[['TIME_OF_THE_DAY']]
ordinal_encoder = OrdinalEncoder(categories=time_order).fit(time_of_day_values)
df_event_log_visits['TIME_OF_THE_DAY_Ordinal'] = ordinal_encoder.transform(time_of_day_values)


In [None]:
# The text column is superseded by 'TIME_OF_THE_DAY_Ordinal'
columns_to_drop = ['TIME_OF_THE_DAY']
df_event_log_visits = df_event_log_visits.drop(columns=columns_to_drop)

In [None]:
# Category order for 'Visit_Season'; the position in this list becomes the encoded value
season_order = [['Winter', 'Spring', 'Summer', 'Fall']]

# Fit the encoder on the column, then write the encoded values to a new column
season_values = df_event_log_visits[['Visit_Season']]
ordinal_encoder = OrdinalEncoder(categories=season_order).fit(season_values)
df_event_log_visits['Visit_Season_Ordinal'] = ordinal_encoder.transform(season_values)


In [None]:
# The text column is superseded by 'Visit_Season_Ordinal'
columns_to_drop = ['Visit_Season']
df_event_log_visits = df_event_log_visits.drop(columns=columns_to_drop)

In [None]:
# Inspect the columns after the ordinal encodings replaced the text columns
df_event_log_visits.info()

## Changes discussed at the meeting on 2024/04/29

In [None]:
# Changes discussed at the meeting on 2024/04/29

In [None]:
# Duration of each activity in hours: (Activity_End_Time - Start_Time), after making sure
# both columns are datetimes. The keyword arguments are evaluated in order, so the
# duration is computed from the already-converted columns.
df_event_log_visits = df_event_log_visits.assign(
    Start_Time=lambda frame: pd.to_datetime(frame['Start_Time']),
    Activity_End_Time=lambda frame: pd.to_datetime(frame['Activity_End_Time']),
    Activity_Duration=lambda frame: (
        (frame['Activity_End_Time'] - frame['Start_Time']).dt.total_seconds() / 3600
    ),
)
print(df_event_log_visits.head())

In [None]:
# Show the first 10 rows of the visit id, both timestamps and the derived duration, to
# check the duration values against the timestamps by eye
df_event_log_visits[['VISIT_ID', 'Start_Time', 'Activity_End_Time', 'Activity_Duration']].head(10)


In [None]:
# Inspect the columns after 'Activity_Duration' was added
df_event_log_visits.info()

In [None]:
# The raw timestamps are replaced by 'Activity_Duration', so drop them; also rename
# 'Duration_hours' to 'Case_Duration_Hours'
columns_to_drop = ['Start_Time', 'Activity_End_Time']

df_event_log_visits = (
    df_event_log_visits
    .drop(columns=columns_to_drop)
    .rename(columns={'Duration_hours': 'Case_Duration_Hours'})
)




In [None]:

# Confirm the timestamp columns are gone and the duration column was renamed
df_event_log_visits.info()


In [None]:
# Same schema check as the previous cell (repeated inspection)
df_event_log_visits.info()

In [None]:
# Preview the first rows before the disposition-based feature is built
df_event_log_visits.head()

In [None]:
# Discard every row that has no departure disposition recorded
df_event_log_visits = df_event_log_visits.dropna(subset=['DEPART_DISPOSITION_ID'])

# Show the identifying columns to verify the result
print(df_event_log_visits[['SID', 'VISIT_ID', 'DEPART_DISPOSITION_ID']].head())


In [None]:
# Number of event-log rows per (SID, DEPART_DISPOSITION_ID) pair, as a flat table.
# NOTE(review): this counts rows, not visits; a visit with several activity rows weighs
# more — confirm that this is intended.
disposition_counts = (
    df_event_log_visits
    .groupby(['SID', 'DEPART_DISPOSITION_ID'])
    .size()
    .reset_index(name='Counts')
)

# Share of each disposition within its SID: pair count divided by the SID's total count
total_dispositions_per_sid = disposition_counts.groupby('SID')['Counts'].transform('sum')
disposition_counts['Disposition_Frequency'] = disposition_counts['Counts'].div(total_dispositions_per_sid)

# Attach the share to every event-log row with the same SID and disposition.
# NOTE(review): Is_LWBS is derived from the same DEPART_DISPOSITION_ID column; if Is_LWBS is
# later used as a prediction target, this feature may leak the outcome — confirm.
df_event_log_visits = pd.merge(
    df_event_log_visits,
    disposition_counts[['SID', 'DEPART_DISPOSITION_ID', 'Disposition_Frequency']],
    how='left',
    on=['SID', 'DEPART_DISPOSITION_ID'],
)

# Show the key columns and the new feature to verify the merge
df_event_log_visits[['SID', 'DEPART_DISPOSITION_ID', 'Disposition_Frequency']].head()


In [None]:
# The raw disposition code is no longer needed once Is_LWBS and Disposition_Frequency exist
columns_to_drop = ['DEPART_DISPOSITION_ID']
df_event_log_visits = df_event_log_visits.drop(columns=columns_to_drop)

In [None]:
# Final schema check before the dataset is written to disk
df_event_log_visits.info()

In [None]:
# Save the prepared dataset to CSV in the current working directory; index=False leaves the
# DataFrame index out of the file
df_event_log_visits.to_csv('df_event_log_visits_for_ML_randomorder_areafix.csv', index=False)
