In [61]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from scipy.stats import shapiro
from scipy.stats import ttest_ind
from scipy.stats import ranksums

In [62]:
# Load the patient methylation profiles
m_log1k_path = "/u/home/c/ctang04/HBV_Code/data/data.log1k.txt"
donors_path = "/u/home/c/ctang04/HBV_Code/data/donors.with.samples.txt"
#output_csv_path = "/u/home/c/ctang04/HBV_Code/data/data/unique_donors.csv"

In [65]:
# Read methylation profile
m_log1k_df = pd.read_csv(m_log1k_path, sep='\t', header=0, index_col=0)
#print(m_log1k_df.columns)

# Read donors file
donors_df = pd.read_csv(donors_path, sep='\t', header=0, quotechar='"')

# Remove duplicate samples by donor
unique_donors_df = donors_df.drop_duplicates(subset='donor')

# Get phase classes from the donors
phases = unique_donors_df['phase_HBV'].unique()
print(phases)

# Define mapping between original phases and desired classes
phase_mapping = {
    "Antiviral Rx": "Antiviral Rx",
    "IAH": "IAH",
    "IT": "IT",
    "RP": "RP",
    "RP and Cirrhosis": "Cirrhosis",
    "Antiviral Rx and Cirrh": "Cirrhosis",
    "SC": "SC",
    "ICP": "ICP",
    "IAH and Cirrhosis": "Cirrhosis",
    "SC and Cirrhosis": "Cirrhosis"
}

# Create a new column in your dataframe with the modified classes
unique_donors_df.loc[:, 'modified_phase'] = unique_donors_df['phase_HBV'].map(phase_mapping)

['Antiviral Rx' 'IAH' 'IT' 'RP' 'RP and Cirrhosis'
 'Antiviral Rx and Cirrh' 'SC' 'ICP' 'IAH and Cirrhosis'
 'SC and Cirrhosis']


In [66]:
class_mapping = {
    "IT": "Inactive",
    "IAH": "Inactive",
    "Antiviral Rx": "Inactive",
    "ICP": "Active",
    "RP": "Active",
    "Cirrhosis": "Cirrhosis"
}
unique_donors_df.loc[:, 'modified_class'] = unique_donors_df['modified_phase'].map(class_mapping)

In [67]:
# Subset unique_donors_df for the Active Classes (RP and ICP, 33 and 4 respectively)
Active_class_df = unique_donors_df[unique_donors_df['modified_class'] == "Active"]

# Extract sample names
Active_sample_names = Active_class_df['sample'].tolist()

# Subset m_log1k_df based on Active_sample_names
Active_class_data = m_log1k_df.loc[:, Active_sample_names]

#print(Active_class_data.head())  # Displaying the first few rows
print(Active_class_data.shape)   # Displaying the dimensions (rows, columns)

(144560, 37)


In [68]:
# Subset unique_donors_df for the Inactive Classes (IT, IAH, Antiviral Rx || 9, 67, 58 respectively)
Inactive_class_df = unique_donors_df[unique_donors_df['modified_class'] == "Inactive"]

# Extract sample names
Inactive_sample_names = Inactive_class_df['sample'].tolist()

# Subset m_log1k_df based on Inactive_sample_names
Inactive_class_data = m_log1k_df.loc[:, Inactive_sample_names]

#print(Active_class_data.head())  # Displaying the first few rows
print(Inactive_class_data.shape)   # Displaying the dimensions (rows, columns) -- should have 134

(144560, 134)


In [69]:
# Subset unique_donors_df for the Cirrhosis & HCC 11 Cirrhosis patients (6 of these with HCC)
Cirr_class_df = unique_donors_df[unique_donors_df['modified_class'] == "Cirrhosis"]

# Extract sample names
Cirr_sample_names = Cirr_class_df['sample'].tolist()

# Subset m_log1k_df based on Inactive_sample_names
Cirr_class_data = m_log1k_df.loc[:, Cirr_sample_names]

#print(Active_class_data.head())  # Displaying the first few rows
print(Cirr_class_data.shape)   # Displaying the dimensions (rows, columns) -- should have 134

(144560, 11)


In [10]:
alpha = 0.05
stat, p = shapiro(Active_class_data)
if p > alpha:
    print(f'p-value = {p:.4f}. Sample looks Gaussian (fail to reject H0)')
else:
    print(f'p-value = {p:.4f}. Sample does not look Gaussian (reject H0)')
stat, p = shapiro(Inactive_class_data)
if p > alpha:
    print(f'p-value = {p:.4f}. Sample looks Gaussian (fail to reject H0)')
else:
    print(f'p-value = {p:.4f}. Sample does not look Gaussian (reject H0)')
stat, p = shapiro(Cirr_class_data)
if p > alpha:
    print(f'p-value = {p:.4f}. Sample looks Gaussian (fail to reject H0)')
else:
    print(f'p-value = {p:.4f}. Sample does not look Gaussian (reject H0)')



p-value = 0.0000. Sample does not look Gaussian (reject H0)
p-value = 0.0000. Sample does not look Gaussian (reject H0)
p-value = 0.0000. Sample does not look Gaussian (reject H0)


In [70]:
# Transpose the DataFrames so that sample names are rows and methylation sites are columns
Active_class_data_t = Active_class_data.T
Inactive_class_data_t = Inactive_class_data.T

# Add a label column to each transposed DataFrame
Active_class_data_t['label'] = 1  # Adding a label column for Active class
Inactive_class_data_t['label'] = 0  # Adding a label column for Inactive class

# Concatenate the transposed DataFrames along the rows (axis=0)
combined_df = pd.concat([Active_class_data_t, Inactive_class_data_t])

# Separate features and target
X = combined_df.drop(columns=['label'])
y = combined_df['label']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [72]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print shapes to verify dimensions
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print(y)

X_train shape: (136, 144560)
X_test shape: (35, 144560)
y_train shape: (136,)
y_test shape: (35,)
plasma-2097-P9-CH            1
plasma-2569-t7-5day-P9-CH    1
plasma-3967-P9-CH            1
plasma-1327-P9-CH            1
plasma-1423-P9-CH            1
                            ..
plasma-2536-P9-CH            0
plasma-2368-P9-CH            0
plasma-3964-P9-CH            0
plasma-2575-5day-P9-CH       0
plasma-2576-5day-P9-CH       0
Name: label, Length: 171, dtype: int64


In [73]:
# Separate the active and inactive groups in the training data
X_train_active = X_train[y_train == 1]
X_train_inactive = X_train[y_train == 0]

print(X_train_active.shape)
print(X_train_inactive.shape)

(27, 144560)
(109, 144560)


In [86]:
# Perform the t-test for each methylation site
p_values = []
methylation_sites = X_train.columns

for site in methylation_sites:
    active_values = X_train_active[site].values
    inactive_values = X_train_inactive[site].values
    t_stat, p_val = ttest_ind(active_values, inactive_values, equal_var=False)  # Welch's t-test
    p_values.append(p_val)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values
})

In [19]:
p_values_w = []
methylation_sites = X_train.columns

for site in methylation_sites:
    active_values = X_train_active[site].values
    inactive_values = X_train_inactive[site].values
    stat, p_val = ranksums(active_values, inactive_values)  # Wilcoxon rank-sum test
    p_values_w.append(p_val)

# Create a DataFrame for the results
results_df_w = pd.DataFrame({
    'methylation_site': methylation_sites,
    'p_value': p_values_w
})

significant_sites_w = results_df_w[results_df_w['p_value'] < 0.05]
significant_sites_w.to_csv('/u/home/c/ctang04/HBV_Code/output/inactive_vs_active_methyl_p0.05_wilcoxon.csv', index=False)
# Print the significant sites
print(significant_sites_w)

                 methylation_site   p_value
110     chr10_102107662_102107781  0.043820
129     chr10_102322098_102322217  0.046153
130     chr10_102322550_102322669  0.014744
186     chr10_102484200_102484319  0.030116
336     chr10_102821462_102821581  0.045560
...                           ...       ...
101353     chrX_40026871_40026990  0.022746
101359     chrX_40027869_40027988  0.005918
101453     chrX_45617999_45618118  0.014855
101720     chrX_90689903_90690022  0.033141
101747       chrX_9434296_9434415  0.030116

[2086 rows x 2 columns]


In [19]:
print(len(p_values))

144560


In [87]:
#Possibly apply multiple testing correction (e.g., Bonferroni or FDR)
#results_df['adjusted_p_value'] = results_df['p_value'] * len(methylation_sites)  # Bonferroni correction
#results_df['adjusted_p_value'] = np.minimum(results_df['adjusted_p_value'], 1)  # Adjust values to be at most 1

# Extract significantly different methylation sites (adjusted p-value < 0.05)
significant_sites = results_df[results_df['p_value'] < 0.05]

# Print the significant sites
print(significant_sites)

# Save the significant sites to a CSV file
#significant_sites.to_csv('/u/home/c/ctang04/HBV_Code/output/inactive_vs_active_methyl_p0.01.csv', index=False)

                 methylation_site   p_value
13      chr10_100227770_100227889  0.002906
24      chr10_100993746_100993865  0.026571
31      chr10_100995793_100995912  0.027544
42      chr10_101090100_101090219  0.041004
128     chr10_102321942_102322061  0.000522
...                           ...       ...
123362   chr9_130266355_130266474  0.025751
123938       chr9_6759253_6759372  0.026527
124049     chr9_91193464_91193583  0.048630
124159       chrX_2816279_2816398  0.046278
130101     chr1_85156679_85156798  0.027604

[5480 rows x 2 columns]


In [88]:
significant_sites_05 = significant_sites['methylation_site'].tolist()

# Select significant features from the training and testing data
X_train_significant = X_train[significant_sites_05]
X_test_significant = X_test[significant_sites_05]

"""significant_sites_05 = significant_sites_w['methylation_site'].tolist()

# Select significant features from the training and testing data
X_train_significant = X_train[significant_sites_05]
X_test_significant = X_test[significant_sites_05]"""

print(X_train_significant.shape)
print(X_test_significant.shape)
print(y_train.shape)
print(y_test.shape)


#significant_sites_01 = significant_sites['methylation_site'].tolist()

# Select significant features from the training and testing data
#X_train_significant = X_train[significant_sites_01]
#X_test_significant = X_test[significant_sites_01]

(136, 5480)
(35, 5480)
(137,)
(34,)


## this is the random forest classifier

In [89]:
#random forest classifier
#rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state=42)
#kf = KFold(n_splits = 10, shuffle=True, random_state = 42)
 
# Train the model on the training dat a
rf_classifier.fit(X_train_significant, y_train)

y_pred = rf_classifier.predict(X_test_significant)
y_pred_prob = rf_classifier.predict_proba(X_test_significant)[:, 1]

ValueError: Found input variables with inconsistent numbers of samples: [136, 137]

In [76]:
"""#random forest classifier
#rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators = 100, max_depth = 5, class_weight = 'balanced', random_state=42)
kf = KFold(n_splits = 5, shuffle=True, random_state = 42)
accuracies = []
conf_matrices = []
class_reports = []
auc_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train_significant, X_test_significant = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the classifier on the training data
    rf_classifier.fit(X_train_significant, y_train)
    
    # Predict on the test data
    y_pred = rf_classifier.predict(X_test_significant)
    y_pred_prob = rf_classifier.predict_proba(X_test_significant)[:, 1]
    # Evaluate the predictions
    accuracies.append(accuracy_score(y_test, y_pred))
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    class_reports.append(classification_report(y_test, y_pred, output_dict=True, zero_division=0))
    auc_scores.append(roc_auc_score(y_test, y_pred_prob))
    
# Compute average accuracy and AUC
average_accuracy = sum(accuracies) / len(accuracies)
average_auc = sum(auc_scores) / len(auc_scores)
print(f'Average Accuracy: {average_accuracy}')
print(f'Average AUC: {average_auc}')"""

KeyboardInterrupt: 

In [25]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of trees in the forest
    'max_depth': [5, 10, 20, None],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],      # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],        # Minimum samples required at each leaf node
    'max_features': ['auto', 'sqrt']     # Number of features to consider at each split
}

rf_classifier_test = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier_test, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_significant, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

best_rf_classifier = grid_search.best_estimator_
best_rf_classifier.fit(X_train_significant, y_train)

540 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/u/home/c/ctang04/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/u/home/c/ctang04/.local/lib/python3.9/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/u/home/c/ctang04/.local/lib/python3.9/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/u/home/c/ctang04/.local/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_para

Best Parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.8309523809523809


In [90]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8824
Precision: 0.5000
Recall: 0.2500
F1 Score: 0.3333
ROC AUC Score: 0.5583
Confusion Matrix:
[[29  1]
 [ 3  1]]


# SVM classifier

In [29]:
svm = SVC(probability=True, random_state=42)

# Train the SVM model
svm.fit(X_train, y_train)

# Make predictions
y_pred = svm.predict(X_test)
y_pred_prob = svm.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = svm.score(X_test, y_test)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_prob))

Accuracy: 0.7142857142857143
Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83        25
           1       0.00      0.00      0.00        10

    accuracy                           0.71        35
   macro avg       0.36      0.50      0.42        35
weighted avg       0.51      0.71      0.60        35

Confusion Matrix:
 [[25  0]
 [10  0]]
ROC AUC Score: 0.824


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Retrain the SVM model on reduced data
svm.fit(X_train, y_train)

# Create a mesh grid for plotting decision boundary
h = .02  # step size in the mesh
x_min, x_max = X_reduced[:, 0].min() - 1, X_reduced[:, 0].max() + 1
y_min, y_max = X_reduced[:, 1].min() - 1, X_reduced[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict classifications for each point in the mesh grid
Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision boundary and margins
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

# Plot the training points
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.coolwarm, edgecolors='k', s=20)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.coolwarm, edgecolors='k', s=20, marker='x')

# Add labels
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.title('SVM Decision Boundary')

# Show plot
plt.show()