In [37]:
import pandas as pd
import os

# 1 - Load data

In [2]:
# Drug-category table; per the original author's note it covers 1600 patients.
Drug_categories = pd.read_csv(filepath_or_buffer='minimal_DF_03102023.csv')
# Metadata table that the drug categories are joined onto (via PATNO) in the merge step.
metadataIR3 = pd.read_csv(filepath_or_buffer='metaDataIR3.csv')

# 2 - Merge data

In [3]:
# Left join on PATNO: every row of metadataIR3 is kept and the matching
# Drug_categories columns are attached; unmatched rows get NaN in those columns.
# NOTE(review): if Drug_categories holds more than one row per PATNO, the join
# repeats the corresponding metadata rows — TODO confirm PATNO is unique there.
merged_df = pd.merge(metadataIR3, Drug_categories, how='left', on='PATNO')

# 3 - Descriptive information

## 3.1 - Number of patients with drug metadata

In [10]:
# Number of distinct patients (PATNO) that have a non-missing Drug_Category.
# Counting PATNO directly, instead of summing distinct (Drug_Category, PATNO)
# pairs, keeps this a true patient count even if a patient were listed under
# more than one category.
total_unique_drug_categories = (
    merged_df
    .loc[merged_df['Drug_Category'].notna(), 'PATNO']
    .nunique()
)
# Build and show the summary sentence
result_string = f"The total number of patients with drug metadata is: {total_unique_drug_categories}."
print(result_string)

The total number of patients with drug metadata is: 1592.


## 3.2 - Distribution of patients according to their drug category

In [11]:
# Keep one row per (Drug_Category, PATNO) pair, then tally the pairs per category.
(
    merged_df
    .drop_duplicates(subset=['Drug_Category', 'PATNO'])
    ['Drug_Category']
    .value_counts()
)

Drug_Category
Both                          787
No treatments                 449
Non-disease-modifying only    340
Disease-modifying only         16
Name: count, dtype: int64

## 3.3 - NAIVE PATIENTS

### 3.3.1 - Number of naive patients

In [30]:
# Distinct PATNO values among the rows labelled 'No treatments'.
naive_patients = (
    merged_df
    .loc[lambda frame: frame['Drug_Category'].isin(['No treatments']), 'PATNO']
    .drop_duplicates()
    .shape[0]
)
naive_string = f"The total number of naive patients: {naive_patients}."
print(naive_string)

The total number of naive patients: 449.


### 3.3.2 - Number of patient samples without drug treatments

In [18]:
# Row count of merged_df where Drug_Category is 'No treatments'.
# NOTE(review): this counts merged rows; it equals the number of samples only
# if each sample occupies exactly one row of merged_df — TODO confirm.
number_of_samples = len(
    merged_df.loc[merged_df['Drug_Category'].eq('No treatments')]
)
number_string = f"The total number of samples with no drug treatments is: {number_of_samples}."
print(number_string)

The total number of samples with no drug treatments is: 1096.


### 3.3.3 - Distribution of naive patients by 'Case Control' and 'Month' labels 

In [21]:
# Tally the 'No treatments' rows by their ('Case Control', 'Month') combination.
(
    merged_df
    .loc[lambda frame: frame['Drug_Category'].eq('No treatments'), ['Case Control', 'Month']]
    .value_counts()
)

Case Control  Month
Control       M0       309
Case          M0       286
              M24      140
Control       M24      107
Case          M12       54
Control       M12       44
Case          M06       41
Control       M06       41
Case          M36       30
Control       M36       18
Other         M0        18
              M24        6
Name: count, dtype: int64

## 3.4 - NAIVE PATIENTS AND PATIENTS WITH NON-DISEASE-MODIFYING DRUG INTERVENTIONS

### 3.4.1 - Number of naive patients + patients with non-disease-modifying interventions only

In [31]:
# Distinct PATNO values among rows labelled 'No treatments' or
# 'Non-disease-modifying only'.
naiveANDnd_patients = (
    merged_df
    .loc[
        lambda frame: frame['Drug_Category'].isin(['No treatments', 'Non-disease-modifying only']),
        'PATNO',
    ]
    .drop_duplicates()
    .shape[0]
)
naiveANDnd_string = f"The total number of naive patients and with non-disease modifying interventions: {naiveANDnd_patients }."
print(naiveANDnd_string)

The total number of naive patients and with non-disease modifying interventions: 789.


### 3.4.2 - Number of samples from naive patients + patients with non-disease-modifying interventions only

In [23]:
# Row count of merged_df where Drug_Category is 'No treatments' or
# 'Non-disease-modifying only'.
# NOTE(review): this counts merged rows; it equals the number of samples only
# if each sample occupies exactly one row of merged_df — TODO confirm.
number_of_samples_relaxed = len(
    merged_df.loc[
        merged_df['Drug_Category'].isin(['No treatments', 'Non-disease-modifying only'])
    ]
)
number_string = f"The total number of samples with no drug treatments or with non disease-modifying drugs is: {number_of_samples_relaxed}."
print(number_string)

The total number of samples with no drug treatments or with non disease-modifying drugs is: 5744.


### 3.4.3 - Distribution of naive + non-disease-modifying intervention patients by 'Case Control' and 'Month' labels

In [24]:
# Tally rows labelled 'No treatments' or 'Non-disease-modifying only'
# by their ('Case Control', 'Month') combination.
(
    merged_df
    .loc[
        lambda frame: frame['Drug_Category'].isin(['No treatments', 'Non-disease-modifying only']),
        ['Case Control', 'Month'],
    ]
    .value_counts()
)

Case Control  Month
Case          M0       946
Control       M0       710
Case          M24      667
              M12      643
              M06      565
Control       M24      420
              M12      394
              M06      372
Case          M36      346
Control       M36      253
Other         M0       115
              M12       99
              M24       93
              M06       83
              M36       29
Name: count, dtype: int64

# 4 - Export HudAlphaID for data selection

In [39]:
# Directory (relative to the current working directory) that receives the export
folder_name = 'output-hud_alpha_ids'
# Absolute form of that directory, resolved against the current working directory
folder_path = os.path.abspath(folder_name)

In [40]:
# Create the output folder (and any missing parents). exist_ok=True makes this a
# no-op when the directory already exists and avoids the check-then-create race
# of a separate os.path.exists() test. If the path exists but is not a
# directory, this raises FileExistsError instead of silently continuing.
os.makedirs(folder_path, exist_ok=True)

In [41]:
# Collect each HudAlphaID once from the rows whose drug category is either
# 'No treatments' or 'Non-disease-modifying only'. Missing IDs are dropped so
# the exported list never contains a blank entry.
unique_hud_alpha_ids = (
    merged_df
    .loc[
        merged_df['Drug_Category'].isin(['No treatments', 'Non-disease-modifying only']),
        'HudAlphaID',
    ]
    .dropna()
    .unique()
)

# Wrap the IDs in a single-column DataFrame so the file gets a header row
unique_ids_df = pd.DataFrame(unique_hud_alpha_ids, columns=['HudAlphaID'])

# Destination of the tab-separated export inside the output folder
file_path = os.path.join(folder_path, 'unique_hud_alpha_ids.tsv')

# Write the IDs as TSV, omitting the DataFrame index
unique_ids_df.to_csv(file_path, sep='\t', index=False)

print(f"File saved to {file_path}")

File saved to /home/jovyan/work/output-hud_alpha_ids/unique_hud_alpha_ids.tsv
