In [None]:
import pandas as pd

def select_sample_columns(merged_df, candidate_columns):
    """Return the candidate columns that can be used as sample count columns.

    A candidate is kept when it is present in ``merged_df``, is not one of the
    bookkeeping columns ('gene_id', 'exon_length', 'length_kb') and holds
    numeric data. The order of ``candidate_columns`` is preserved, so the TPM
    output keeps the sample order of the count matrix instead of being sorted
    alphabetically.
    """
    reserved = {'gene_id', 'exon_length', 'length_kb'}
    return [
        column
        for column in candidate_columns
        if column not in reserved
        and column in merged_df.columns
        # Non-numeric columns cannot be divided by a length; skip them
        # rather than failing on the division.
        and pd.api.types.is_numeric_dtype(merged_df[column])
    ]


def compute_tpm(counts, length_kb):
    """Convert a table of raw counts to TPM (Transcripts Per Million).

    Args:
        counts: DataFrame with one row per gene and one numeric column per
            sample.
        length_kb: Series of gene lengths in kilobases, sharing the index of
            ``counts``.

    Returns:
        DataFrame with the same index and columns as ``counts`` holding TPM
        values. Genes whose length is missing or not positive get NaN and do
        not contribute to the per-sample total.
    """
    # A zero length would give RPK = inf, make the sample total inf and turn
    # every TPM of that sample into 0 or NaN. Masking the length to NaN keeps
    # the damage confined to the affected gene (NaN is skipped by sum()).
    usable_length_kb = length_kb.where(length_kb > 0)

    # RPK (Reads Per Kilobase): each gene's count divided by its length.
    rpk = counts.div(usable_length_kb, axis=0)

    # Per-sample total RPK, then scale so every sample sums to one million.
    total_rpk = rpk.sum(axis=0)
    return rpk.div(total_rpk, axis=1) * 1e6


# The guard keeps the file-reading/writing part from running when this code is
# imported; when run as a script or notebook cell the names below are still
# defined as globals.
if __name__ == "__main__":
    # Load exon lengths (CSV file)
    # Assuming your CSV file has columns: 'gene_id' and 'exon_length'
    exon_lengths_df = pd.read_csv('path_to_exon_lengths.csv')

    # Load the gene count matrix (txt file)
    # First column is 'ENSEMBL_ID' and the remaining columns are sample counts
    gene_counts_df = pd.read_csv('path_to_count_matrix.txt', sep='\t')

    # Merge exon lengths with the gene counts dataframe on 'ENSEMBL_ID'
    # Renaming 'ENSEMBL_ID' to 'gene_id' to match exon lengths dataframe
    gene_counts_df = gene_counts_df.rename(columns={'ENSEMBL_ID': 'gene_id'})
    merged_df = pd.merge(gene_counts_df, exon_lengths_df, on='gene_id')

    # Convert gene length to kilobases (kb)
    merged_df['length_kb'] = merged_df['exon_length'] / 1000

    # Sample columns come from the count matrix only, in their original
    # order, so extra columns of the exon-length table are never treated as
    # samples.
    sample_columns = select_sample_columns(merged_df, gene_counts_df.columns)

    # Build the TPM table: gene identifiers followed by one column per sample
    tpm_df = pd.concat(
        [
            pd.DataFrame({'gene_id': merged_df['gene_id']}),
            compute_tpm(merged_df[sample_columns], merged_df['length_kb']),
        ],
        axis=1,
    )

    # Save the TPM results to a new CSV file
    tpm_df.to_csv('tpm_results.csv', index=False)

    # Check the first few rows of the TPM result
    print(tpm_df.head())
