In [None]:
# Precompute lengths from GTF annot
# All paths are relative to the notebook's working directory.
annot_file = "./Mus_musculus.GRCm39.113.chr.gtf"  # GTF annotation read by precompute_lengths
lengths_file = "./gene_lengths.tsv"  # gene_id<TAB>length table written by precompute_lengths
# OE
count_matrix_OE = "./nstrand_deseq_0426_093139/deseq2_results_OE.csv"
# Leading "." restored: without it this pointed at the filesystem root
# instead of the run directory next to the OE input.
output_fpkm_OE = "./nstrand_deseq_0426_093139/FPKM_OE.tsv"
# VNO
count_matrix_VNO = "./nstrand_deseq_0426_093139/deseq2_results_VNO.csv"
output_fpkm_VNO = "./nstrand_deseq_0426_093139/FPKM_VNO.tsv"

In [None]:
import pandas as pd
import re

In [None]:
def process_line(line):
    """Turn one GTF row into a ``"<gene_id>\\t<length>"`` record.

    The row is split on whitespace; field 2 is the feature type and
    fields 3 and 4 are the start and end coordinates. Only rows whose
    feature type is ``gene`` produce a record, and the length is the
    inclusive span ``end - start + 1``.

    Returns:
        The tab-separated record, with ``NOT_FOUND`` in place of the gene
        id when the row has no ``gene_id "..."`` attribute; ``None`` for
        non-gene rows, rows with too few fields, or rows whose
        coordinates are not integers.
    """
    fields = line.strip().split()

    # Rows that are too short or describe another feature are skipped.
    if len(fields) < 3 or fields[2] != 'gene':
        return None

    try:
        span = int(fields[4]) - int(fields[3]) + 1
    except (ValueError, IndexError):
        # Missing or non-numeric coordinates: nothing to report.
        return None

    # The gene id lives in the free-text attribute part of the row.
    found = re.search(r'gene_id "([^"]+)"', line)
    if found:
        identifier = found.group(1)
    else:
        identifier = "NOT_FOUND"

    return f"{identifier}\t{span}"

    

def precompute_lengths(input_file, output_file):
    """Write one record per usable row of ``input_file`` to ``output_file``.

    Lines starting with ``#`` are ignored. Every other line is passed to
    ``process_line``; each non-empty result is written on its own line.
    Progress messages are printed before and after the pass.
    """
    print("Counting gene lengths in file", input_file, "...")
    with open(input_file, 'r') as source, open(output_file, 'w') as sink:
        # Lazily convert every non-comment row, then keep only real records.
        candidates = (
            process_line(raw) for raw in source if not raw.startswith('#')
        )
        sink.writelines(record + '\n' for record in candidates if record)
    print("Gene lengths stored into", output_file)


In [None]:
def FPKM_normalization(count_file, lenghts_file, output_file):
    """Length- and depth-normalise the ``baseMean`` column of a results CSV.

    Steps:
      1. Read ``count_file`` (CSV, first column used as the gene identifier)
         and keep only its ``baseMean`` column.
      2. RPM  = baseMean / (sum(baseMean) / 1e6).
      3. RPKM = RPM / (gene length / 1e3), using the lengths table.
      4. Write ``<gene id>\\t<RPKM>`` rows, without a header, to
         ``output_file``.

    Args:
        count_file: Path to a CSV containing a ``baseMean`` column. The
            first column may have any header (or none); it is always
            treated as the gene identifier.
        lenghts_file: Path to a headerless two-column TSV of
            ``gene id``, ``length``. (The misspelt parameter name is kept
            for backward compatibility.)
        output_file: Path of the TSV to write.

    Raises:
        KeyError: if ``count_file`` has no ``baseMean`` column.

    Notes:
        Genes that have no entry in the lengths table are left out of the
        output (inner join); a message reports how many were dropped.
    """
    print("Normalizing file", count_file, "...")
    counts = pd.read_csv(count_file, index_col=0)
    counts = counts[['baseMean']].sort_index()

    # Step 1: per-million scaling factor, computed as a plain scalar so the
    # division below is ordinary Series arithmetic.
    per_million = counts['baseMean'].sum() / 1e6
    counts['RPM'] = counts['baseMean'] / per_million

    # Force the identifier column to be called 'index' whatever the CSV's
    # first header was; otherwise the merge key below would not exist for
    # files whose first column is named.
    counts = counts.rename_axis('index').reset_index()

    lengths = pd.read_csv(lenghts_file, sep='\t', header=None, names=['index', 'length'])
    lengths['length_kb'] = lengths['length'] / 1e3

    # Step 2: attach lengths and divide RPM by length in kilobases.
    merged_df = pd.merge(counts, lengths, on="index")
    merged_df['RPKM'] = merged_df['RPM'] / merged_df['length_kb']

    # Make the inner join's data loss visible instead of silent.
    n_missing = int((~counts['index'].isin(lengths['index'])).sum())
    if n_missing:
        print(n_missing, "gene(s) without a length entry were left out of", output_file)

    merged_df.to_csv(output_file, sep='\t', index=False, header=False, columns=['index', 'RPKM'])
    print("Normalized counts stored into", output_file)

In [None]:

# Build the gene-length table once; both normalisation runs read it.
precompute_lengths(annot_file, lengths_file)

# Normalise each results table (OE first, then VNO) against the shared lengths.
for count_matrix, output_fpkm in (
    (count_matrix_OE, output_fpkm_OE),
    (count_matrix_VNO, output_fpkm_VNO),
):
    FPKM_normalization(count_matrix, lengths_file, output_fpkm)