In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Input locations.
gene_counts_file = "./output/rna/norm_tmm.tsv"  # TMM-normalized gene counts
metadata_file = "/standard/vol185/cphg_Manichaikul/users/csm6hg/metadata/metadata_10_17_2024_CSM.txt"  # sample metadata
coloc_file = "./output/coloc/coloc_eqtl_candidates_full.txt"  # coloc table (eQTL candidates, going by the file name)

# All three inputs are tab-separated, which is the default delimiter of read_table.
gene_counts = pd.read_table(gene_counts_file, index_col=0)  # first column becomes the row index
metadata = pd.read_table(metadata_file)
coloc = pd.read_table(coloc_file)
# Disabled filter, kept for reference: reduce coloc to the 'gene' values with PP.H4 > 0.5.
# coloc = coloc[coloc['PP.H4'] > 0.5]['gene']

# Sanity check. Only the coloc table is previewed; the gene-count and metadata
# previews are switched off, so their section headers print with nothing under them.
print("Gene Counts Data:", "\nMetadata:", "\nColoc", sep="\n")
print(coloc.head())

Gene Counts Data:

Metadata:

Coloc
76      ENSG00000157933
116     ENSG00000097021
1753    ENSG00000066027
1923    ENSG00000077522
2202    ENSG00000204161
Name: gene, dtype: object


In [33]:
# Join per-sample expression values to the sample metadata.
# The shared key is the sample ID: it is the column header of gene_counts and
# the 'SAMPLE_ID_TOR' column of metadata.

# Transpose so that each row is one sample, then move the sample IDs out of the
# index directly into a 'SAMPLE_ID_TOR' column. Naming the column in one step
# avoids a temporary column name that could collide with a row label of
# gene_counts.
gene_counts_transposed = gene_counts.transpose().reset_index(names='SAMPLE_ID_TOR')

# NOTE(review): the recorded output of this cell shows integer column labels
# (0, 1, 2, ...) instead of gene IDs. That suggests the first column of the
# counts file may be a row number rather than the gene ID -- confirm, because
# lookups by gene ID on merged_data would then find nothing.

# Inner join: only samples present in both tables are kept.
merged_data = pd.merge(gene_counts_transposed, metadata, on='SAMPLE_ID_TOR', how='inner')

# An inner join drops unmatched samples silently, so say so when it happens.
has_metadata = gene_counts_transposed['SAMPLE_ID_TOR'].isin(metadata['SAMPLE_ID_TOR'])
if not has_metadata.all():
    print(
        f"Warning: {(~has_metadata).sum()} of {len(has_metadata)} samples "
        "have no metadata row and were dropped by the merge."
    )

# Display the first few rows of the merged dataframe
print("Merged Data:")
print(merged_data.head())

Merged Data:
  SAMPLE_ID_TOR         0         1         2         3         4         5  \
0     TOR803269  0.159183 -1.764953 -0.433821  0.320307 -0.329219  -1.80659   
1     TOR795199 -0.568076  0.857387 -2.291709 -0.133547 -0.112252 -1.851616   
2     TOR480443 -0.142082 -0.649492 -0.387826 -0.044398 -0.302559 -1.927134   
3     TOR487285 -0.986725 -0.746589  0.959577  0.875847  0.176331 -0.351616   
4     TOR565416 -0.593092 -0.074041 -0.276112 -0.443128   0.55817 -1.900744   

          6         7         8  ... Collection_year  \
0  0.588059 -1.119166 -0.675817  ...             NaN   
1  0.401534 -1.210806 -0.476014  ...             NaN   
2 -0.112252 -1.058011 -0.315861  ...             NaN   
3  0.210789  0.618484  1.851616  ...          2010.0   
4  0.086763 -1.592384 -0.176331  ...          2010.0   

  Biosample_affection_status Analyte_isolation_lab ANALYTE_TYPE  \
0                        NaN                   NaN          NaN   
1                        NaN             

In [None]:
# Genes to inspect (replace with your genes of interest).
gene_list = [
    'ENSG00000000003.14',
    'ENSG00000000005.5',
    'ENSG00000000419.12',
]

# Keep only the requested genes that exist as columns of the merged table.
available_genes = [gene_id for gene_id in gene_list if gene_id in merged_data.columns]
print(f"Available genes: {available_genes}")

if not available_genes:
    print("No specified genes available in the merged dataset.")
else:
    # Only the first available gene is plotted.
    gene_to_plot = available_genes[0]

    # Histogram with a KDE overlay of that gene's values across samples.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(merged_data[gene_to_plot], kde=True, ax=ax)
    ax.set_title(f'Distribution of {gene_to_plot}')
    ax.set_xlabel('Normalized Gene Counts')
    ax.set_ylabel('Frequency')
    plt.show()

    # Expression against age, drawn only when the merged table has an 'age' column.
    if 'age' not in merged_data.columns:
        print("Age column not found in metadata.")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x='age', y=gene_to_plot, data=merged_data, ax=ax)
        ax.set_title(f'{gene_to_plot} vs. Age')
        ax.set_xlabel('Age')
        ax.set_ylabel('Normalized Gene Counts')
        plt.show()

In [None]:
# Additional analyses can be added here
# For example, you can perform statistical tests, create more complex plots, etc.