In [14]:
import pandas as pd

# Read positive and negative datasets
positive_data = pd.read_csv('/datasets/eteplygina/pos_301.csv')
negative_data = pd.read_csv('/datasets/eteplygina/neg_301.csv')

# Function to filter data with relative time > -4 (or adjust as needed)
def filter_positive(data):
    return data[data['relative_time'] >= -4]

# Apply filter to positive dataset
positive_data_filtered = positive_data.groupby('id', group_keys=False).apply(filter_positive).reset_index(drop=True)

# Uncomment and adjust the function for negative data filtering if needed
# def filter_negative(data):
#     return data[data['relative_time'] <= 12]

# Apply filter to negative dataset
# negative_data_filtered = negative_data.groupby('id', group_keys=False).apply(filter_negative).reset_index(drop=True)

all_positive_ids = positive_data['id'].unique()
filtered_positive_ids = positive_data_filtered['id'].unique()

# Find IDs not included after filtering
not_included_ids = set(all_positive_ids) - set(filtered_positive_ids)
print("Not included positive IDs:", not_included_ids)

# Print filtered positive data
print("Filtered Positive Data:")
print(positive_data_filtered.head())
print("\n")

# Print filtered negative data if it was filtered
# print("Filtered Negative Data:")
# print(negative_data_filtered.head())
print("\n")

print("Filtered Positive IDs:", positive_data_filtered['id'].nunique())
# Uncomment the line below if filtering for negative data
# print("Filtered Negative IDs:", negative_data_filtered['id'].nunique())


Not included positive IDs: set()
Filtered Positive Data:
         id  weight  height   age         time_rounded  relative_time  \
0  30004144    80.0    -1.0  82.0  2126-04-04 13:00:00           11.5   
1  30004144    80.0    -1.0  82.0  2126-04-04 13:30:00           11.0   
2  30004144    80.0    -1.0  82.0  2126-04-04 14:00:00           10.5   
3  30004144    80.0    -1.0  82.0  2126-04-04 14:30:00           10.0   
4  30004144    80.0    -1.0  82.0  2126-04-04 15:00:00            9.5   

  ethnicity gender           shock_time  label  heartrate         sbp  \
0     WHITE      M  2126-04-05 00:30:00      1  75.000000  146.000000   
1     WHITE      M  2126-04-05 00:30:00      1  62.666667  140.333333   
2     WHITE      M  2126-04-05 00:30:00      1  74.666667  143.666667   
3     WHITE      M  2126-04-05 00:30:00      1  66.500000  138.500000   
4     WHITE      M  2126-04-05 00:30:00      1  66.000000  145.000000   

         dbp        mbp  respiration  temperature       spo2  
0 

In [15]:
import pandas as pd
import numpy as np

combined_data = []

# Get unique positive IDs and their max relative times
positive_ids = positive_data_filtered['id'].unique()
total_positive_patients = len(positive_ids)
positive_max_times = positive_data_filtered.groupby('id')['relative_time'].max().to_dict()

# Get unique negative IDs and their max relative times
neg_pool = negative_data['id'].unique()
negative_max_times = negative_data.groupby('id')['relative_time'].max().to_dict()

print("Filtered Negative IDs:", len(neg_pool))

for pos_index, pos_id in enumerate(positive_ids, 1):
    print(f"Processing positive patient {pos_index}/{total_positive_patients} (ID: {pos_id})")
    # Get all entries for the current positive ID
    pos_patient_data = positive_data_filtered[positive_data_filtered['id'] == pos_id]
    pos_max_time = positive_max_times[pos_id] + 4
    combined_data.append(pos_patient_data)

    # Find suitable negative samples
    suitable_neg_ids = [neg_id for neg_id in neg_pool if negative_max_times[neg_id] >= pos_max_time]
    
    if len(suitable_neg_ids) >= 4:
        selected_neg_ids = np.random.choice(suitable_neg_ids, 4, replace=False)
    else:
        # In case there are fewer than 4 suitable negatives, take all available
        selected_neg_ids = suitable_neg_ids

    for neg_id in selected_neg_ids:
        print(f"Processing negative patient {neg_id}")
        # Get all entries for the current negative ID
        neg_patient_data = negative_data[negative_data['id'] == neg_id]
        # Cut the data to match the max value of the positive patient
        neg_patient_data = neg_patient_data[neg_patient_data['relative_time'] <= pos_max_time]
        combined_data.append(neg_patient_data)

    # Remove selected negative IDs from the pool
    neg_pool = np.setdiff1d(neg_pool, selected_neg_ids)
    print("Filtered Negative IDs:", len(neg_pool))

# Concatenate the dataframes in combined_data list into a single dataframe
combined_data = pd.concat(combined_data)

# Save the combined data
combined_data.to_csv('/datasets/eteplygina/combined_data.csv', index=False)


Filtered Negative IDs: 13859
Processing positive patient 1/3187 (ID: 30004144)
Processing negative patient 31735884
Processing negative patient 33881484
Processing negative patient 37288233
Processing negative patient 39812212
Filtered Negative IDs: 13855
Processing positive patient 2/3187 (ID: 30005366)
Processing negative patient 39503239
Processing negative patient 34833293
Processing negative patient 32308172
Processing negative patient 35502578
Filtered Negative IDs: 13851
Processing positive patient 3/3187 (ID: 30009505)
Processing negative patient 38288748
Processing negative patient 33811420
Processing negative patient 38159053
Processing negative patient 37722190
Filtered Negative IDs: 13847
Processing positive patient 4/3187 (ID: 30015010)
Processing negative patient 32858566
Processing negative patient 33882618
Processing negative patient 34134549
Processing negative patient 31711822
Filtered Negative IDs: 13843
Processing positive patient 5/3187 (ID: 30016432)
Processing ne

In [17]:
import pandas as pd

# Assuming combined_data is already defined as a pandas DataFrame from the previous code
# Group by 'id' and count the number of rows for each 'id'
id_counts = combined_data.groupby('id').size()

# Find the 'id' with the maximum count
max_id = id_counts.idxmin()
max_count = id_counts.min()

print(f"The ID with the longest length is {max_id} with {max_count} entries.")

The ID with the longest length is 30262537 with 16 entries.


In [18]:
import pandas as pd

# Read the data
data = pd.read_csv('/datasets/eteplygina/combined_data.csv')

# Define ranges
ranges = {
    'heartrate': (0.0, 300.0),
    'sbp': (10.0, 300.0),
    'dbp': (10.0, 175.0),
    'mbp': (10.0, 200.0),
    'respiration': (0.0, 45.0),
    'temperature': (25.0, 45.0),
    'spo2': (10.0, 100.0)
}

# Filter data based on ranges
filtered_data = data[(data['heartrate'].between(*ranges['heartrate'])) &
                     (data['sbp'].between(*ranges['sbp'])) &
                     (data['dbp'].between(*ranges['dbp'])) &
                     (data['mbp'].between(*ranges['mbp'])) &
                     (data['respiration'].between(*ranges['respiration'])) &
                     (data['temperature'].between(*ranges['temperature'])) &
                     (data['spo2'].between(*ranges['spo2']))]

# Find row numbers of the first two entries that are out of range
out_of_range_indices = data.index.difference(filtered_data.index)[:2]

# Print the row numbers and corresponding rows
for index in out_of_range_indices:
    print("Row number:", index)
    print(data.loc[index])

print("Original rows count:", len(data))
print("Filtered rows count:", len(filtered_data))

# Save the filtered DataFrame to a new CSV file
filtered_data.to_csv('/datasets/eteplygina/combined_data.csv', index=False)

Row number: 7032
id                          38234763
weight                          66.8
height                         155.0
age                             80.0
time_rounded     2170-07-10 12:00:00
relative_time                   45.0
ethnicity            ASIAN - CHINESE
gender                             M
shock_time                       NaN
label                              0
heartrate                       75.0
sbp                            219.0
dbp                            187.0
mbp                            199.0
respiration                     19.0
temperature                    37.56
spo2                            98.0
Name: 7032, dtype: object
Row number: 7033
id                          38234763
weight                          66.8
height                         155.0
age                             80.0
time_rounded     2170-07-10 12:30:00
relative_time                   45.5
ethnicity            ASIAN - CHINESE
gender                             M
shock_time     