In [6]:
#Installing & Loading Packages
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from scipy.stats import shapiro
from scipy.stats import ttest_ind
from scipy.stats import ranksums

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [7]:
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

# Or, ignore specific categories of warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [8]:
# Load the patient methylation profiles
m_log1k_path = "/u/home/c/ctang04/HBV_Code/data/data.log1k.txt"
donors_path = "/u/home/c/ctang04/HBV_Code/data/donors.with.samples.txt"

In [9]:
# Read methylation profile
m_log1k_df = pd.read_csv(m_log1k_path, sep='\t', header=0, index_col=0)
#print(m_log1k_df.columns)

# Read donors file
donors_df = pd.read_csv(donors_path, sep='\t', header=0, quotechar='"')

# Remove duplicate samples by donor
unique_donors_df = donors_df.drop_duplicates(subset='donor')

# Get phase classes from the donors
phases = unique_donors_df['phase_HBV'].unique()
print(phases)

# Define mapping between original phases and desired classes
phase_mapping = {
    "Antiviral Rx": "Antiviral Rx",
    "IAH": "IAH",
    "IT": "IT",
    "RP": "RP",
    "RP and Cirrhosis": "Cirrhosis",
    "Antiviral Rx and Cirrh": "Cirrhosis",
    "SC": "SC",
    "ICP": "ICP",
    "IAH and Cirrhosis": "Cirrhosis",
    "SC and Cirrhosis": "Cirrhosis"
}

# Create a new column in the dataframe with the modified classes
unique_donors_df.loc[:, 'modified_phase'] = unique_donors_df['phase_HBV'].map(phase_mapping)

['Antiviral Rx' 'IAH' 'IT' 'RP' 'RP and Cirrhosis'
 'Antiviral Rx and Cirrh' 'SC' 'ICP' 'IAH and Cirrhosis'
 'SC and Cirrhosis']


In [10]:
class_mapping = {
    "IT": "IT",
    "IAH": "IAH",
    "Antiviral Rx": "Other",
    "ICP": "Other",
    "RP": "RP",
    "Cirrhosis": "Other"
}
unique_donors_df.loc[:, 'modified_class'] = unique_donors_df['modified_phase'].map(class_mapping)

In [11]:
# Subset unique_donors_df for the IT class (9)
IT_class_df = unique_donors_df[unique_donors_df['modified_class'] == "IT"]

# Extract sample names
IT_sample_names = IT_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
IT_class_data = m_log1k_df.loc[:, IT_sample_names]

print(IT_class_data.shape)   # Displaying the dimensions (rows, columns)

(144560, 9)


In [12]:
# Subset unique_donors_df for the IAH (67)
IAH_class_df = unique_donors_df[unique_donors_df['modified_class'] == "IAH"]

# Extract sample names
IAH_sample_names = IAH_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
IAH_class_data = m_log1k_df.loc[:, IAH_sample_names]

print(IAH_class_data.shape)   # Displaying the dimensions (rows, columns)

(144560, 67)


In [13]:
# Subset unique_donors_df for the RP (33)
RP_class_df = unique_donors_df[unique_donors_df['modified_class'] == "RP"]

# Extract sample names
RP_sample_names = RP_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
RP_class_data = m_log1k_df.loc[:, RP_sample_names]

print(RP_class_data.shape)   # Displaying the dimensions (rows, columns)

(144560, 33)


In [14]:
# Transpose the DataFrames so that sample names are rows and methylation sites are columns
IT_class_data_t = IT_class_data.T
IAH_class_data_t = IAH_class_data.T
RP_class_data_t = RP_class_data.T

# Add a label column to each transposed DataFrame
IT_class_data_t['label'] = 0  # Label for IT
IAH_class_data_t['label'] = 1  # Label for IAH
RP_class_data_t['label'] = 2  # Label for RP

# Concatenate the transposed DataFrames along the rows (axis=0)
combined_df = pd.concat([IT_class_data_t, IAH_class_data_t, RP_class_data_t])

# Separate features and target
X = combined_df.drop(columns=['label'])
y = combined_df['label']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Print shapes to verify dimensions
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print(y)

X_train shape: (87, 144560)
X_test shape: (22, 144560)
y_train shape: (87,)
y_test shape: (22,)
plasma-646-P9-CH             0
plasma-649-t8-6day-P9-CH     0
plasma-1626-P9-CH            0
plasma-3869-P9-CH            0
plasma-2457-P9-CH            0
                            ..
plasma-3409-P9-CH            2
plasma-2502-P9-CH            2
plasma-2738-P9-CH            2
plasma-2577-5day-P9-CH       2
plasma-2568-r1-4day-P9-CH    2
Name: label, Length: 109, dtype: int64


In [16]:
# Separate the active and inactive groups in the training data
X_train_IT = X_train[y_train == 0]
X_train_IAH = X_train[y_train == 1]
X_train_RP = X_train[y_train == 2]

print(X_train_IT.shape)
print(X_train_IAH.shape)
print(X_train_RP.shape)

(7, 144560)
(54, 144560)
(26, 144560)


In [17]:
#methylation_sites = X_train.columns
#print(methylation_sites.shape)
#print(X_train_active.shape)
#print(X_train_inactive.shape)

In [18]:
X_train_active_IT = X_train_IT
X_train_inactive_Rest = pd.concat([X_train_IAH, X_train_RP])

methylation_sites = X_train.columns

p_values_ITvRest = []
fold_changes_ITvRest = []

for site in methylation_sites:
    active_values = X_train_active_IT[site].values
    inactive_values = X_train_inactive_Rest[site].values

    # Perform Welch's t-test
    t_stat, p_val = ttest_ind(active_values, inactive_values, equal_var=False)
    p_values_ITvRest.append(p_val)

    # Calculate fold change with pseudocount
    pseudocount = 0.001
    mean_active = np.mean(active_values)
    mean_inactive = np.mean(inactive_values)
    
    if mean_inactive != 0:
        fold_change = np.log2((mean_active + pseudocount) / (mean_inactive + pseudocount))
    else:
        fold_change = float('NaN')  # Handle division by zero case

    fold_changes_ITvRest.append(fold_change)

# Create a DataFrame for the results
results_df_ITvRest = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values_ITvRest,
    'fold_change': fold_changes_ITvRest
})

# Filter significant sites based on p-value thresholds
significant_sites05_ITvRest = results_df_ITvRest[results_df_ITvRest['p_value'] < 0.05]
#significant_sites05_ITvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/ITvRest_p05_ttest_fold_change.csv', index=False)
significant_sites01_ITvRest = results_df_ITvRest[results_df_ITvRest['p_value'] < 0.01]
#significant_sites01_ITvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/ITvRest_p01_ttest_fold_change.csv', index=False)

# Print the number of significant sites
print(significant_sites05_ITvRest.shape)
print(significant_sites01_ITvRest.shape)

(16305, 3)
(8535, 3)


In [19]:
X_train_active_IAH = X_train_IAH
X_train_inactive_Rest = pd.concat([X_train_IT, X_train_RP])

methylation_sites = X_train.columns

p_values_IAHvRest = []
fold_changes_IAHvRest = []

for site in methylation_sites:
    active_values = X_train_active_IAH[site].values
    inactive_values = X_train_inactive_Rest[site].values

    # Perform Welch's t-test
    t_stat, p_val = ttest_ind(active_values, inactive_values, equal_var=False)
    p_values_IAHvRest.append(p_val)

    # Calculate fold change with pseudocount
    pseudocount = 0.001
    mean_active = np.mean(active_values)
    mean_inactive = np.mean(inactive_values)
    
    if mean_inactive != 0:
        fold_change = np.log2((mean_active + pseudocount) / (mean_inactive + pseudocount))
    else:
        fold_change = float('NaN')  # Handle division by zero case

    fold_changes_IAHvRest.append(fold_change)

# Create a DataFrame for the results
results_df_IAHvRest = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values_IAHvRest,
    'fold_change': fold_changes_IAHvRest
})

# Filter significant sites based on p-value thresholds
significant_sites05_IAHvRest = results_df_IAHvRest[results_df_IAHvRest['p_value'] < 0.05]
#significant_sites05_IAHvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/IAHvRest_p05_ttest_fold_change.csv', index=False)
significant_sites01_IAHvRest = results_df_IAHvRest[results_df_IAHvRest['p_value'] < 0.01]
#significant_sites01_IAHvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/IAHvRest_p01_ttest_fold_change.csv', index=False)

# Print the number of significant sites
print(significant_sites05_IAHvRest.shape)
print(significant_sites01_IAHvRest.shape)

(5841, 3)
(1001, 3)


In [20]:
X_train_active_RP = X_train_RP
X_train_inactive_Rest = pd.concat([X_train_IT, X_train_IAH])

methylation_sites = X_train.columns

p_values_RPvRest = []
fold_changes_RPvRest = []

for site in methylation_sites:
    active_values = X_train_active_RP[site].values
    inactive_values = X_train_inactive_Rest[site].values

    # Perform Welch's t-test
    t_stat, p_val = ttest_ind(active_values, inactive_values, equal_var=False)
    p_values_RPvRest.append(p_val)

    # Calculate fold change with pseudocount
    pseudocount = 0.001
    mean_active = np.mean(active_values)
    mean_inactive = np.mean(inactive_values)
    
    if mean_inactive != 0:
        fold_change = np.log2((mean_active + pseudocount) / (mean_inactive + pseudocount))
    else:
        fold_change = float('NaN')  # Handle division by zero case

    fold_changes_RPvRest.append(fold_change)

# Create a DataFrame for the results
results_df_RPvRest = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values_RPvRest,
    'fold_change': fold_changes_RPvRest
})

# Filter significant sites based on p-value thresholds
significant_sites05_RPvRest = results_df_RPvRest[results_df_RPvRest['p_value'] < 0.05]
#significant_sites05_RPvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/RPvRest_p05_ttest_fold_change.csv', index=False)
significant_sites01_RPvRest = results_df_RPvRest[results_df_RPvRest['p_value'] < 0.01]
#significant_sites01_RPvRest.to_csv('/u/home/c/ctang04/HBV_Code/output/RPvRest_p01_ttest_fold_change.csv', index=False)

# Print the number of significant sites
print(significant_sites05_RPvRest.shape)
print(significant_sites01_RPvRest.shape)

(4756, 3)
(805, 3)


# Union

In [21]:
union_significant_sites05_1vRest = pd.concat([significant_sites05_ITvRest,significant_sites05_IAHvRest,significant_sites05_RPvRest])
union_significant_sites05_1vRest.drop_duplicates().reset_index(drop=True)
print(union_significant_sites05_1vRest)

union_significant_sites01_1vRest = pd.concat([significant_sites01_ITvRest,significant_sites01_IAHvRest,significant_sites01_RPvRest])
union_significant_sites01_1vRest.drop_duplicates().reset_index(drop=True)
print(union_significant_sites01_1vRest)

                 methylation_site   p_value  fold_change
0       chr10_100027865_100027984  0.016989    -7.359707
13      chr10_100227770_100227889  0.013054    -7.381978
16      chr10_100992139_100992258  0.000111    -8.487823
17      chr10_100992275_100992394  0.000456    -8.335992
18      chr10_100992374_100992493  0.000488    -8.237551
...                           ...       ...          ...
121894     chr7_29846363_29846482  0.047709    -6.804783
122202     chr7_79764580_79764699  0.029163    -7.392649
122693   chr8_145911314_145911433  0.047533          NaN
122720     chr8_17658570_17658689  0.024195          NaN
123445   chr9_132199788_132199907  0.046934          NaN

[26902 rows x 3 columns]
                 methylation_site   p_value  fold_change
16      chr10_100992139_100992258  0.000111    -8.487823
17      chr10_100992275_100992394  0.000456    -8.335992
18      chr10_100992374_100992493  0.000488    -8.237551
23      chr10_100993566_100993685  0.002660    -7.949635
36   

In [22]:
fold_change_threshold = 2
significant_sites_05_fold_change = union_significant_sites05_1vRest[union_significant_sites05_1vRest['fold_change'] > fold_change_threshold]
print(significant_sites_05_fold_change)

significant_sites_05 = significant_sites_05_fold_change['methylation_site'].tolist()

# Select significant features from the training and testing data
X_train_significant = X_train[significant_sites_05]
X_test_significant = X_test[significant_sites_05]


print(X_train_significant.shape)
print(X_test_significant.shape)
print(y_train.shape)
print(y_test.shape)


                 methylation_site   p_value  fold_change
9689      chr11_69634629_69634748  0.016766     2.779214
10941   chr12_108238515_108238634  0.003203     2.437523
11385   chr12_114877345_114877464  0.003504     2.344901
15769   chr13_109148900_109149019  0.024788     2.195102
15983   chr13_112758417_112758536  0.023857     2.442189
...                           ...       ...          ...
116112   chr2_219867615_219867734  0.036380     3.857768
118249     chr3_98451598_98451717  0.017288     4.394382
118250     chr3_98451691_98451810  0.006955     3.805465
119636     chr5_31855140_31855259  0.002300     4.846784
121515   chr7_132260465_132260584  0.021858     2.702360

[923 rows x 3 columns]
(87, 923)
(22, 923)
(87,)
(22,)


In [23]:
from sklearn.preprocessing import label_binarize
rf_model = RandomForestClassifier(
    max_depth=None,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=200
)
# Fit the model on the training data
rf_model.fit(X_train_significant, y_train)

# Predict on the test data
y_pred = rf_model.predict(X_test_significant)

# Predict probabilities for AUC calculation
y_pred_prob = rf_model.predict_proba(X_test_significant)

# Check if the classification is binary or multiclass
if len(set(y_train)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])
else:  # Multiclass classification
    y_test_bin = label_binarize(y_test, classes=list(set(y_train)))
    auc = roc_auc_score(y_test_bin, y_pred_prob, multi_class='ovr')

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"AUC-ROC: {auc}")

Accuracy: 0.6363636363636364
Confusion Matrix:
[[ 0  2  0]
 [ 0 12  1]
 [ 0  5  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.63      0.92      0.75        13
           2       0.67      0.29      0.40         7

    accuracy                           0.64        22
   macro avg       0.43      0.40      0.38        22
weighted avg       0.59      0.64      0.57        22

AUC-ROC: 0.5769434269434269


In [24]:
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, confusion_matrix
from sklearn.preprocessing import label_binarize

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Define the scorer for AUC
auc_scorer = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')

# Set up Grid Search with cross-validation
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           scoring=auc_scorer,
                           cv=5,  # 5-fold cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=0)

# Fit the Grid Search to the training data
grid_search.fit(X_train_significant, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

print(f"Best parameters: {best_params}")

# Predict on the test data
y_pred = best_rf_model.predict(X_test_significant)
y_pred_prob = best_rf_model.predict_proba(X_test_significant)

# Calculate AUC
if len(set(y_train)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])
else:  # Multiclass classification
    y_test_bin = label_binarize(y_test, classes=list(set(y_train)))
    auc = roc_auc_score(y_test_bin, y_pred_prob, multi_class='ovr')

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculate sensitivity (recall) for each class
sensitivity = np.diag(cm) / np.sum(cm, axis=1)

# Calculate specificity for each class
specificity = []

for i in range(cm.shape[0]):
    # True negatives (TN) for class i
    TN = np.sum(cm) - np.sum(cm[i, :]) - np.sum(cm[:, i]) + cm[i, i]
    # False positives (FP) and true negatives (TN) for class i
    FP_plus_TN = np.sum(cm) - np.sum(cm[:, i])
    # Specificity for class i
    if FP_plus_TN == 0:
        specificity_i = 0
    else:
        specificity_i = TN / FP_plus_TN
    specificity.append(specificity_i)

# Print results for sensitivity and specificity
for i in range(cm.shape[0]):
    print(f'Class {i}: Sensitivity (Recall) = {sensitivity[i]:.2f}, Specificity = {specificity[i]:.2f}')

# Print evaluation results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"ROC AUC: {auc}")
print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")

Best parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
[[ 0  2  0]
 [ 0 12  1]
 [ 0  5  2]]
Class 0: Sensitivity (Recall) = 0.00, Specificity = 0.91
Class 1: Sensitivity (Recall) = 0.92, Specificity = 0.67
Class 2: Sensitivity (Recall) = 0.29, Specificity = 0.74
Accuracy: 0.6363636363636364
Confusion Matrix:
[[ 0  2  0]
 [ 0 12  1]
 [ 0  5  2]]
ROC AUC: 0.6068477818477819
Sensitivity: [0.         0.92307692 0.28571429]
Specificity: [0.9090909090909091, 0.6666666666666666, 0.7368421052631579]


In [26]:
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='threadpoolctl')
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_significant, y_train)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the SMOTE-resampled training data
rf_model.fit(X_train_smote, y_train_smote)

# Predict on the test data
y_pred = rf_model.predict(X_test_significant)

# Predict probabilities for AUC calculation
y_pred_prob = rf_model.predict_proba(X_test_significant)

# Check if the classification is binary or multiclass
if len(set(y_train)) == 2:  # Binary classification
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])
else:  # Multiclass classification
    y_test_bin = label_binarize(y_test, classes=list(set(y_train)))
    auc = roc_auc_score(y_test_bin, y_pred_prob, multi_class='ovr')

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"AUC-ROC: {auc}")

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x2ab5c15abf70>
Traceback (most recent call last):
  File "/u/local/apps/python/3.9.6/gcc-4.8.5/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/u/local/apps/python/3.9.6/gcc-4.8.5/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/u/local/apps/python/3.9.6/gcc-4.8.5/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/u/local/apps/python/3.9.6/gcc-4.8.5/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _Thread

Accuracy: 0.5909090909090909
Confusion Matrix:
[[ 0  2  0]
 [ 0 11  2]
 [ 0  5  2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.61      0.85      0.71        13
           2       0.50      0.29      0.36         7

    accuracy                           0.59        22
   macro avg       0.37      0.38      0.36        22
weighted avg       0.52      0.59      0.54        22

AUC-ROC: 0.556496743996744
