In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.precision', 2)
import math

In [2]:
# Path of the PFM (position frequency matrix) text file that the next cell opens and parses.
# NOTE(review): absolute, machine-specific path — needs adjusting to run this notebook elsewhere.
file_path = '/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/Core_promoter/test.pfm'

In [3]:
# Parse the PFM text file into a nested dict: {position: {nucleotide: count}}.
# Expected layout after the single header line: one row per nucleotide, with
# the nucleotide letter first, followed by one count per motif position.
pfm = {}
nucleotides = ['A', 'C', 'G', 'T']

with open(file_path, 'r') as file:
    pfm_data = file.readlines()[1:]  # Skip the first line (header)

for line in pfm_data:
    # Treat brackets as whitespace so rows written as "A [ 1 2 3 ]" parse too.
    parts = line.replace('[', ' ').replace(']', ' ').split()

    # Skip blank lines and rows that do not start with a nucleotide letter
    # *before* converting any field, so such rows cannot raise.
    if not parts or parts[0] not in nucleotides:
        continue

    nt = parts[0]
    for i, count in enumerate(parts[1:]):
        # Create the per-position dict on first use, then record this count.
        pfm.setdefault(i, {})[nt] = float(count)

In [4]:
# Show the parsed matrix: position -> {nucleotide: count}.
pfm

{0: {'A': 18.0, 'C': 8.0, 'G': 13.0, 'T': 7.0},
 1: {'A': 8.0, 'C': 3.0, 'G': 31.0, 'T': 4.0},
 2: {'A': 5.0, 'C': 3.0, 'G': 34.0, 'T': 4.0},
 3: {'A': 4.0, 'C': 9.0, 'G': 9.0, 'T': 24.0},
 4: {'A': 1.0, 'C': 33.0, 'G': 8.0, 'T': 4.0},
 5: {'A': 29.0, 'C': 4.0, 'G': 10.0, 'T': 3.0},
 6: {'A': 7.0, 'C': 21.0, 'G': 11.0, 'T': 7.0},
 7: {'A': 7.0, 'C': 15.0, 'G': 15.0, 'T': 9.0},
 8: {'A': 7.0, 'C': 14.0, 'G': 19.0, 'T': 6.0},
 9: {'A': 0.0, 'C': 0.0, 'G': 4.0, 'T': 42.0},
 10: {'A': 1.0, 'C': 0.0, 'G': 44.0, 'T': 1.0},
 11: {'A': 39.0, 'C': 1.0, 'G': 3.0, 'T': 3.0},
 12: {'A': 1.0, 'C': 43.0, 'G': 0.0, 'T': 2.0},
 13: {'A': 1.0, 'C': 39.0, 'G': 1.0, 'T': 5.0}}

In [5]:
# Background nucleotide probabilities (uniform: every base equally likely).
background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

# Convert the PFM into a PWM of log2 odds ratios against the background.
# Every count is smoothed with a pseudocount of sqrt(N) * background[nt]
# (N = total count of the column), so a zero count never yields log2(0).
# Weighting the pseudocount by the background keeps the smoothed column
# summing to 1 even if `background` is later made non-uniform; with the
# uniform background above this equals sqrt(N) / 4.
pwm = {}
for pos, counts in pfm.items():
    total_counts = sum(counts.values())
    if total_counts <= 0:
        # An all-zero column would otherwise fail with a bare division by zero.
        raise ValueError(
            "PFM position {} has no counts; cannot derive probabilities".format(pos)
        )

    pseudocount = math.sqrt(total_counts)
    smoothed_total = total_counts + pseudocount

    pwm[pos] = {
        nt: np.log2(
            ((count + pseudocount * background[nt]) / smoothed_total)
            / background[nt]
        )
        for nt, count in counts.items()
    }

In [6]:
# Build the PWM table with one row per motif position and one column per
# nucleotide. from_dict() places the positions in the columns, so the
# nucleotide axis is labelled 'index' (the header shown in the rendered
# table) and the frame is then transposed.
pwm_df = pd.DataFrame.from_dict(pwm).rename_axis(index='index').T
pwm_df

index,A,C,G,T
0,0.58,-0.44,0.16,-0.6
1,-0.44,-1.49,1.31,-1.21
2,-0.98,-1.49,1.44,-1.21
3,-1.21,-0.3,-0.3,0.96
4,-2.29,1.39,-0.44,-1.21
5,1.22,-1.21,-0.17,-1.49
6,-0.6,0.78,-0.06,-0.6
7,-0.6,0.34,0.34,-0.3
8,-0.6,0.25,0.65,-0.78
9,-2.96,-2.96,-1.21,1.73


In [16]:
# Spot-check one cell: log-odds of 'A' at position 0 (single label-based lookup
# instead of chained column-then-row indexing).
pwm_df.loc[0, 'A']

0.5778170479052248

In [19]:
# DNA sequence to score against the PWM; its 14 bases line up with the 14 positions (0-13) of the parsed matrix.
sequence = "GGGTCAGCATGGCC" 

In [21]:
# Absolute score: sum of the PWM log-odds of the base observed at each position.
# The sequence must cover the motif exactly; a shorter one would silently give
# a partial sum that cannot be compared with the min/max bounds of the full
# matrix, and a longer one has no matching PWM row.
if len(sequence) != len(pwm_df):
    raise ValueError(
        "sequence length {} does not match PWM length {}".format(
            len(sequence), len(pwm_df)
        )
    )

# upper() lets lowercase (e.g. soft-masked) bases match the 'A'/'C'/'G'/'T' columns;
# .at does one label-based scalar lookup per (position, base) pair.
absolute_score = sum(pwm_df.at[pos, nt] for pos, nt in enumerate(sequence.upper()))
absolute_score

11.56982691167751

In [25]:
# Bounds of the achievable score: for every position (row) take the lowest and
# the highest log-odds among the nucleotide columns, then add them up over rows.
per_position_min = pwm_df.min(axis='columns')
per_position_max = pwm_df.max(axis='columns')

min_possible_score = per_position_min.sum()
max_possible_score = per_position_max.sum()

In [26]:
# Relative score: where absolute_score falls between the worst and the best
# achievable score, expressed as a fraction of that range.
score_span = max_possible_score - min_possible_score
score_above_min = absolute_score - min_possible_score
relative_score = score_above_min / score_span
relative_score

0.8634838824267685