In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import shapiro
from scipy.stats import ttest_ind
from scipy.stats import ranksums

In [3]:
# Load the patient methylation profiles
m_log1k_path = "/u/home/c/ctang04/HBV_Code/data/data.log1k.txt"
donors_path = "/u/home/c/ctang04/HBV_Code/data/donors.with.samples.txt"

In [9]:
# Read methylation profile
m_log1k_df = pd.read_csv(m_log1k_path, sep='\t', header=0, index_col=0)
#print(m_log1k_df.columns)

# Read donors file
donors_df = pd.read_csv(donors_path, sep='\t', header=0, quotechar='"')

# Remove duplicate samples by donor
unique_donors_df = donors_df.drop_duplicates(subset='donor')

# Get phase classes from the donors
phases = unique_donors_df['phase_HBV'].unique()
print(phases)

# Define mapping between original phases and desired classes
phase_mapping = {
    "Antiviral Rx": "Antiviral Rx",
    "IAH": "IAH",
    "IT": "IT",
    "RP": "RP",
    "RP and Cirrhosis": "Cirrhosis",
    "Antiviral Rx and Cirrh": "Cirrhosis",
    "SC": "SC",
    "ICP": "ICP",
    "IAH and Cirrhosis": "Cirrhosis",
    "SC and Cirrhosis": "Cirrhosis"
}

# Create a new column in your dataframe with the modified classes
unique_donors_df.loc[:, 'modified_phase'] = unique_donors_df['phase_HBV'].map(phase_mapping)

['Antiviral Rx' 'IAH' 'IT' 'RP' 'RP and Cirrhosis'
 'Antiviral Rx and Cirrh' 'SC' 'ICP' 'IAH and Cirrhosis'
 'SC and Cirrhosis']


In [10]:
subset_df = unique_donors_df[unique_donors_df['modified_phase'].isin(['IAH', 'RP', 'ICP'])]

# Display the subsetted DataFrame
print(subset_df)

    donor                     sample baseline_visit1 phase_HBV etiology  \
2    C010           plasma-641-P9-CH         7/11/19       IAH      HBV   
5    C011           plasma-642-P9-CH         7/18/19       IAH      HBV   
6    C013           plasma-644-P9-CH         7/25/19       IAH      HBV   
17   C029           plasma-660-P9-CH         9/19/19       IAH      HBV   
19   C032          plasma-2097-P9-CH         9/20/19        RP      HBV   
..    ...                        ...             ...       ...      ...   
242  C502          plasma-2738-P9-CH         8/25/22        RP      HBV   
243  C503  plasma-2573-t2-5day-P9-CH         8/25/22       IAH      HBV   
244  C505          plasma-2536-P9-CH          9/1/22       IAH      HBV   
250  C577     plasma-2577-5day-P9-CH         6/15/23        RP      HBV   
251  C582  plasma-2568-r1-4day-P9-CH         7/20/23        RP      HBV   

    modified_phase  
2              IAH  
5              IAH  
6              IAH  
17             

In [14]:
class_mapping = {
    "IAH": "Inactive",
    "ICP": "Active",
    "RP": "Active"
}
subset_df.loc[:,'modified_class'] = subset_df['modified_phase'].map(class_mapping)

In [16]:
# Subset unique_donors_df for the Active Classes (RP and ICP, 33 and 4 respectively)
Active_class_df = subset_df[subset_df['modified_class'] == "Active"]

# Extract sample names
Active_sample_names = Active_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
Active_class_data = m_log1k_df.loc[:, Active_sample_names]

#print(Active_class_data.head())  # Displaying the first few rows
print(Active_class_data.shape)   # Displaying the dimensions (rows, columns)

(144560, 37)


In [18]:
# Subset unique_donors_df for the Inactive Classes (IAH -- 67)
Inactive_class_df = subset_df[subset_df['modified_class'] == "Inactive"]

# Extract sample names
Inactive_sample_names = Inactive_class_df['sample'].tolist()

# Subset m_log1k_df based on Inactive_sample_names
Inactive_class_data = m_log1k_df.loc[:, Inactive_sample_names]

#print(Active_class_data.head())  # Displaying the first few rows
print(Inactive_class_data.shape)   # Displaying the dimensions (rows, columns) -- should have 134

(144560, 67)


In [27]:
# Subset unique_donors_df for the Active Classes (RP and ICP, 33 and 4 respectively)
Active_class_df = subset_df[subset_df['modified_class'] == "Active"]

# Extract sample names
Active_sample_names = Active_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
Active_class_data = m_log1k_df[Active_sample_names]

print(Active_class_data.shape)  # Displaying the dimensions

# Subset unique_donors_df for the Inactive Classes (IAH -- 67)
Inactive_class_df = subset_df[subset_df['modified_class'] == "Inactive"]

# Extract sample names
Inactive_sample_names = Inactive_class_df['sample'].tolist()

# Subset m_log1k_df based on Inactive_sample_names
Inactive_class_data = m_log1k_df[Inactive_sample_names]

print(Inactive_class_data.shape)  # Displaying the dimensions
methylation_sites = m_log1k_df.index

(144560, 37)
(144560, 67)


In [28]:
p_values = []
# Perform t-test for each methylation site
for site in methylation_sites:
    active_values = Active_class_data.loc[site, :].values
    inactive_values = Inactive_class_data.loc[site, :].values
    
    # Convert to numeric, coerce errors to NaN, and drop NaN values
    active_values = pd.to_numeric(active_values, errors='coerce')
    inactive_values = pd.to_numeric(inactive_values, errors='coerce')
    active_values = active_values[~pd.isnull(active_values)]
    inactive_values = inactive_values[~pd.isnull(inactive_values)]
    
    # Perform Welch's t-test
    t_stat, p_val = ttest_ind(active_values, inactive_values, equal_var=False)
    p_values.append(p_val)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values
})

In [30]:
# Extract significantly different methylation sites (adjusted p-value < 0.05)
print(results_df)
significant_sites = results_df[results_df['p_value'] < 0.05]

# Print the significant sites
print(significant_sites)
significant_sites.to_csv('/u/home/c/ctang04/HBV_Code/output/IAH_vs_ICPandRP_methyl_p0.05.csv', index=False)

                 methylation_site   p_value
0       chr10_100027865_100027984  0.177184
1       chr10_100028045_100028164  0.167805
2       chr10_100028161_100028280       NaN
3       chr10_100028371_100028490  0.046915
4       chr10_100069285_100069404  0.584461
...                           ...       ...
144555     chr9_88556059_88556178       NaN
144556     chr9_95087294_95087413       NaN
144557     chrX_24168447_24168566       NaN
144558     chrX_24168513_24168632       NaN
144559     chrX_24168649_24168768       NaN

[144560 rows x 2 columns]
                 methylation_site   p_value
3       chr10_100028371_100028490  0.046915
13      chr10_100227770_100227889  0.032394
16      chr10_100992139_100992258  0.047189
31      chr10_100995793_100995912  0.010123
37      chr10_101089441_101089560  0.017275
...                           ...       ...
122693   chr8_145911314_145911433  0.047373
122720     chr8_17658570_17658689  0.013392
123445   chr9_132199788_132199907  0.047705
12359

In [39]:
# Extract the sample names from the appropriate column
sample_names = subset_df['sample'].tolist()

# Filter the methylation data to include only the samples in subset_df
filtered_methylation_data = m_log1k_df[sample_names]

# Ensure the methylation data has the same methylation sites as the p-values results
filtered_methylation_data = filtered_methylation_data.loc[methylation_sites]

# Create a dictionary to map sample names to their corresponding modified_class
sample_to_class_mapping = subset_df.set_index('sample')['modified_class'].to_dict()

# Map the sample names to their corresponding classes
class_labels = [sample_to_class_mapping[sample] for sample in sample_names]

# Create a DataFrame for the heatmap
heatmap_data = filtered_methylation_data.T
heatmap_data['class'] = class_labels

# Sort the DataFrame by the class column
heatmap_data = heatmap_data.sort_values('class')

# Drop the class column for heatmap plotting
heatmap_data_matrix = heatmap_data.drop(columns=['class'])

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data_matrix, cmap='viridis', yticklabels=False)

# Add labels for the classes
class_positions = heatmap_data.index.to_series().groupby(heatmap_data['class']).first().values
for class_label, position in zip(heatmap_data['class'].unique(), class_positions):
    plt.text(-0.5, position, class_label, rotation=0, verticalalignment='center', horizontalalignment='right', fontsize=12)

plt.title('Heatmap of Methylation Markers for ICP, RP, and IAH Groups')
plt.xlabel('Methylation Sites')
plt.ylabel('')  #we do not want the sample names showing up
plt.show()

ConversionError: Failed to convert value(s) to axis units: 'plasma-1438-P9-CH'

<Figure size 720x576 with 2 Axes>