In [None]:
#pip install magic-impute umap-learn scanpy

In [6]:
import pandas as pd
import numpy as np
import scanpy as sc
from magic import MAGIC
import umap
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# FOR COMMON DATA
homepath_common = './common'
cancerpath_common = './common/cancer'
normalpath_common = './common/normal'

smoothed_cancer_common = './common/smoothed_cancer_magic'
smoothed_normal_common = './common/smoothed_normal_magic'

In [11]:
# FOR RARE DATA
homepath_rare = './rare'
cancerpath_rare = './rare/cancer'
normalpath_rare = './rare/normal'

smoothed_cancer_rare = './rare/smoothed_cancer_magic'
smoothed_normal_rare = './rare/smoothed_normal_magic'

In [8]:
def process_files(input_dir, output_dir, data_type):
    # Loop over each file in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith('.gz'):  # or any other file extension you expect
            file_path = os.path.join(input_dir, filename)
            
            # Load the data
            data = pd.read_csv(file_path, compression='gzip', index_col=0)

            # Check if the data is read correctly and is numeric
            print(f"Processing {data_type} data: {filename}...")
            #print(f"{data_type} data types after loading:")
            #print(data.dtypes)
            
            # Convert the DataFrame to an AnnData object
            adata = sc.AnnData(data)
            
            # Step 3: MAGIC Smoothing
            magic_op = MAGIC()
            adata_magic = magic_op.fit_transform(adata.X)
            
            # Update the AnnData object with the smoothed data
            adata.obsm['X_magic'] = adata_magic
            
            # Create a DataFrame from the smoothed data
            smoothed_df = pd.DataFrame(adata_magic, index=data.index, columns=data.columns)
            
            # Verify final smoothed data shape and types
            print("Final smoothed data shape:", smoothed_df.shape)
            #print("Final smoothed data types:")
            #print(smoothed_df.dtypes)
            
            # Generate the output file path
            output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_smoothed.txt.bz2")
            
            # Step 7: Save the final file
            smoothed_df.to_csv(output_file_path, sep='\t', header=False, index=False, compression='bz2')

            # Generate the output file path
            output_file_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}_smoothed.h5ad")
            
            # Step 8: Save the AnnData object in .h5ad format with gzip compression
            adata.write_h5ad(output_file_path, compression='gzip')

            print(f"Saved smoothed data to {output_file_path}\n")

In [12]:
# Process cancer data files
process_files(cancerpath_rare, smoothed_cancer_rare, "cancer")

Processing cancer data: SARC.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 265 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 1.79 seconds.
    Calculating KNN search...
    Calculated KNN search in 89.75 seconds.
    Calculating affinities...




    Calculated affinities in 97.92 seconds.
  Calculated graph and diffusion operator in 189.51 seconds.
  Calculating imputation...




  Calculated imputation in 0.75 seconds.
Calculated MAGIC in 190.26 seconds.
Final smoothed data shape: (20501, 265)
Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_cancer_magic/SARC.csv_smoothed.h5ad

Processing cancer data: THYM.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 122 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 1.01 seconds.
    Calculating KNN search...
    Calculated KNN search in 79.34 seconds.
    Calculating affinities...




    Calculated affinities in 78.99 seconds.
  Calculated graph and diffusion operator in 159.39 seconds.
  Calculating imputation...




  Calculated imputation in 0.43 seconds.
Calculated MAGIC in 159.82 seconds.
Final smoothed data shape: (20501, 122)
Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_cancer_magic/THYM.csv_smoothed.h5ad

Processing cancer data: CHOL.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 45 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 38.22 seconds.
    Calculating affinities...




    Calculated affinities in 40.58 seconds.
  Calculated graph and diffusion operator in 78.84 seconds.
  Calculating imputation...
  Calculated imputation in 0.12 seconds.
Calculated MAGIC in 78.97 seconds.
Final smoothed data shape: (20501, 45)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_cancer_magic/CHOL.csv_smoothed.h5ad

Processing cancer data: PCPG.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 187 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 1.16 seconds.
    Calculating KNN search...
    Calculated KNN search in 78.30 seconds.
    Calculating affinities...




    Calculated affinities in 82.73 seconds.
  Calculated graph and diffusion operator in 162.22 seconds.
  Calculating imputation...




  Calculated imputation in 0.63 seconds.
Calculated MAGIC in 162.86 seconds.
Final smoothed data shape: (20501, 187)
Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_cancer_magic/PCPG.csv_smoothed.h5ad

Processing cancer data: KICH.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 91 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 91.74 seconds.
    Calculating affinities...




    Calculated affinities in 90.33 seconds.
  Calculated graph and diffusion operator in 182.11 seconds.
  Calculating imputation...




  Calculated imputation in 0.29 seconds.
Calculated MAGIC in 182.41 seconds.
Final smoothed data shape: (20501, 91)
Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_cancer_magic/KICH.csv_smoothed.h5ad



In [13]:
# Process normal data files
process_files(normalpath_rare, smoothed_normal_rare, "normal")

Processing normal data: PCPG.norm.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 3 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...


    Calculated KNN search in 0.69 seconds.
    Calculating affinities...




    Calculated affinities in 0.65 seconds.
  Calculated graph and diffusion operator in 1.38 seconds.
  Calculating imputation...
  Calculated imputation in 0.01 seconds.
Calculated MAGIC in 1.40 seconds.
Final smoothed data shape: (20501, 3)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_normal_magic/PCPG.norm.csv_smoothed.h5ad

Processing normal data: THYM.norm.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 2 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.33 seconds.
    Calculating affinities...




    Calculated affinities in 0.40 seconds.
  Calculated graph and diffusion operator in 0.76 seconds.
  Calculating imputation...
Calculated MAGIC in 0.77 seconds.
Final smoothed data shape: (20501, 2)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_normal_magic/THYM.norm.csv_smoothed.h5ad

Processing normal data: SARC.norm.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 2 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 0.34 seconds.
    Calculating affinities...




    Calculated affinities in 0.52 seconds.
  Calculated graph and diffusion operator in 0.91 seconds.
  Calculating imputation...
Calculated MAGIC in 0.93 seconds.
Final smoothed data shape: (20501, 2)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_normal_magic/SARC.norm.csv_smoothed.h5ad

Processing normal data: KICH.norm.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 25 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 10.38 seconds.
    Calculating affinities...




    Calculated affinities in 11.00 seconds.
  Calculated graph and diffusion operator in 21.42 seconds.
  Calculating imputation...
  Calculated imputation in 0.05 seconds.
Calculated MAGIC in 21.48 seconds.
Final smoothed data shape: (20501, 25)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_normal_magic/KICH.norm.csv_smoothed.h5ad

Processing normal data: CHOL.norm.csv.gz...
Calculating MAGIC...
  Running MAGIC on 20501 cells and 9 genes.
  Calculating graph and diffusion operator...
    Calculating KNN search...
    Calculated KNN search in 2.95 seconds.
    Calculating affinities...




    Calculated affinities in 2.84 seconds.
  Calculated graph and diffusion operator in 5.83 seconds.
  Calculating imputation...
  Calculated imputation in 0.02 seconds.
Calculated MAGIC in 5.85 seconds.
Final smoothed data shape: (20501, 9)




Saved smoothed data to /home/marintina/Desktop/Μεταπτυχιακό/2nd semester/ML in Computational Biology/MLCB_project/dataset/grouped data/rare/smoothed_normal_magic/CHOL.norm.csv_smoothed.h5ad



---------------

-----------

## UMAP dimensionality reduction

In [None]:
# Step 4: UMAP Dimensionality Reduction (use UMAP from umap-learn library)
reducer = umap.UMAP()
embedding = reducer.fit_transform(adata.obsm['X_magic'])
adata.obsm['X_umap'] = embedding

In [None]:
umap_coords = adata.obsm['X_umap']
print("UMAP coordinates shape:", umap_coords.shape)

In [None]:
# Step 7: Visualization
# Plot the UMAP results
plt.figure(figsize=(10, 8))
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], s=50)
plt.title('UMAP of MAGIC-smoothed data')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
plt.show()

In [None]:
# Step 5: Clustering
sc.pp.neighbors(adata, n_neighbors=20, use_rep='X_magic')
sc.tl.leiden(adata, resolution=0.5)

# Extract UMAP coordinates
umap_df = pd.DataFrame(umap_coords, index=adata.obs.index, columns=['UMAP1', 'UMAP2'])

# Add clustering information to the DataFrame
umap_df['leiden'] = adata.obs['leiden']

In [None]:
# Step 6: Visualization
# Plot the UMAP results with clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x=umap_df['UMAP1'], y=umap_df['UMAP2'], hue=umap_df['leiden'], palette='tab10', s=50)
plt.title('UMAP of MAGIC-smoothed data with Leiden Clusters')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
plt.legend(title='Leiden Cluster')
plt.show()