In [149]:
import pysam
import pybedtools
import pandas
from sklearn.cluster import KMeans

# inputs
# motif occurrences: tab delimited, no header, one occurrence per line
# (motif-ID chromosome start end strand score p-value q-value matched-sequence)
motifs_file = './input-tests/motifs.random100lines.chr10.txt'
# ATAC-seq alignments in BAM format; the file should be sorted and indexed
bam_file = './input-tests/patho_48hrs_2_dedup_mitRM_mapq30p_sorted.bam'



In [130]:
class Motifs:

    """
    Per-motif Tn5 insertion profiles around transcription factor motif occurrences.

    Input here should be motifs in peaks already.

    -----------------
    Attributes
    -----------------

    bam_file: alignment file in BAM format containing reads from an ATAC-seq experiment;
              file should be sorted and indexed.

    motifs_file: transcription factor motifs in open regions of the genome formatted as:
                 motif-ID chromosome start end strand score p-value q-value matched-sequence
                 tab delimited, no header, should include only motifs in peaks

    motifs: hold a dictionary with motifs from motifs_file;
            ---
            {TF1:[(chrom, start, end, motif_id, p-value, strand),   # occurrence 1
                  (chrom, start, end, motif_id, p-value, strand)],  # occurrence 2
             TF2:[(chrom, start, end, motif_id, p-value, strand)]...}
            ---
            Keys: motif IDs -- will be used later to build the prior, thus I use transcription factors as IDs.
            Values: list of occurrences in the genome of the motif(s) associated with identifier;
                    start and end are the coordinates from motifs_file shifted by -1,
                    the p-value is kept as the string read from the file.

    insertion_counts: {motif_id: pandas.DataFrame}; one row per motif occurrence
                      (row name "chrom_start_end_name"), one column per position of the
                      window around the motif center, values are insertion counts.

    tn5_9mer_counts: same layout as insertion_counts, but every insertion is also
                     credited to the neighbouring positions (see get_tn5_9mer).

    motif_clusters_insertion_counts: {motif_id: pandas.DataFrame} with column 0 = row name
                                     and column 1 = KMeans cluster label, computed from
                                     insertion_counts.

    motif_clusters_9mer_counts: same as above, computed from tn5_9mer_counts.

    """

    def __init__(self, bam_file, motifs_file):

        """
        Store the input paths and initialize every result container as empty.

        No file is opened here; see parse_motifs_file and compute_scores_matrices.
        """

        self.bam_file = bam_file
        self.motifs_file = motifs_file
        self.motifs = {}
        self.insertion_counts = {}
        self.tn5_9mer_counts = {}
        self.motif_clusters_insertion_counts = {}
        self.motif_clusters_9mer_counts = {}

    def parse_motifs_file(self):

        """
        from motifs_file to self.motifs

        self.motifs is rebuilt from scratch on every call, so calling this method
        by hand and then calling compute_scores_matrices (which parses again)
        does not duplicate motif occurrences.
        """

        parsed = {}

        with open(self.motifs_file, "r") as motifs_handle:

            for line in motifs_handle:

                fields = line.split()

                # tolerate blank lines (e.g. a trailing newline) and '#' header lines
                if not fields or fields[0].startswith("#"):
                    continue

                motif_id, chrom, start, end, strand = fields[:5]
                score = fields[6] # p-value

                to_bed = (chrom, int(start) - 1, int(end) - 1, motif_id, score, strand)

                parsed.setdefault(motif_id, []).append(to_bed)

        self.motifs = parsed


    @staticmethod
    def get_insertions(bam_handle, chrom, start, end, upstream = 50, downstream = 50):

        """
        Count Tn5 insertions at every position of the closed interval [start, end].

        bam_handle: object with a pysam-like fetch(chrom, start, end) method
        chrom, start, end: region of interest; end is inclusive
        upstream, downstream: accepted for backward compatibility, not used here

        The insertion position of a read is reference_start + 4 for forward reads
        and reference_end - 5 for reverse reads.

        returns a tuple of length (end - start) + 1 holding the number of
        insertions per position; insertions falling outside the region are ignored.
        """

        # initialize counts vector at 0 for all positions
        region_length = (end - start) + 1
        insertion_counts = [0] * region_length

        # fetch() takes a half-open window and rejects negative coordinates, so
        # extend it by one to cover `end` and clamp it at 0; only the fetch window
        # is clamped, the returned vector keeps one slot per requested position
        fetch_start = max(start, 0)
        fetch_end = end + 1
        if fetch_end <= fetch_start:
            return tuple(insertion_counts)

        # each read represents a potential insertion within region
        for read in bam_handle.fetch(chrom, fetch_start, fetch_end):

            if read.is_reverse:
                # offset by 5 bp
                insertion_pos = read.reference_end - 5
            else:
                # offset by 4 bp
                insertion_pos = read.reference_start + 4

            pos = insertion_pos - start

            # make sure pos is within region
            if 0 <= pos < region_length:
                insertion_counts[pos] += 1

        return tuple(insertion_counts)

    @staticmethod
    def get_tn5_9mer(insertion_counts, up_offset = 4, down_offset = 5):

        """
        Spread every insertion over its neighbouring positions.

        Each count at position pos is added to all positions from pos - up_offset
        to pos + down_offset that fall inside the vector (pos itself keeps its
        original count once).

        returns a tuple with the same length as insertion_counts.
        """

        n_positions = len(insertion_counts)
        tn5_9mer_counts = list(insertion_counts)

        for pos, count in enumerate(insertion_counts):

            # positions without insertions contribute nothing
            if not count:
                continue

            first = max(pos - up_offset, 0)
            last = min(pos + down_offset, n_positions - 1)

            for idx in range(first, last + 1):
                if idx != pos:
                    tn5_9mer_counts[idx] += count

        return tuple(tn5_9mer_counts)


    def compute_scores_matrices(self, upstream = 50, downstream = 50):

        """
        Fill self.insertion_counts and self.tn5_9mer_counts for every motif ID.

        upstream, downstream: number of positions kept on each side of the motif
                              center; every row therefore has
                              upstream + downstream + 1 columns.

        The motifs file is (re)parsed first and the BAM file is opened for the
        duration of the call and always closed afterwards.
        """

        self.parse_motifs_file()

        bam_handle = pysam.AlignmentFile(self.bam_file, "rb")

        try:

            for motif_id in self.motifs: ## Paralelize

                insertion_counts_mat = []
                tn5_9mer_counts_mat = []
                rownames_motifs = []

                motifs = pybedtools.BedTool(self.motifs[motif_id]).sort()

                for motif in motifs:

                    # stored coordinates form a closed interval; take its middle position
                    # (for an even-length motif this is the left one of the two central positions)
                    chrom = motif.chrom
                    center = (motif.start + motif.end) // 2
                    start = center - upstream
                    end = center + downstream

                    insertion_counts = self.get_insertions(bam_handle, chrom, start, end)
                    insertion_counts_mat.append(insertion_counts)

                    tn5_9mer_counts = self.get_tn5_9mer(insertion_counts)
                    tn5_9mer_counts_mat.append(tn5_9mer_counts)

                    motif_holder = "_".join([motif.chrom, str(motif.start), str(motif.end), motif.name])
                    rownames_motifs.append(motif_holder)

                self.insertion_counts[motif_id] = pandas.DataFrame(insertion_counts_mat,
                                                                   index = rownames_motifs)

                self.tn5_9mer_counts[motif_id] = pandas.DataFrame(tn5_9mer_counts_mat,
                                                                  index = rownames_motifs)

        finally:
            bam_handle.close()


    def cluster_motifs(self, n_clusters = 2, random_state = None):

        """
        Cluster the occurrences of every motif ID with KMeans.

        n_clusters: number of clusters requested from KMeans
        random_state: forwarded to KMeans; set it to get reproducible labels

        Results go to self.motif_clusters_insertion_counts (from insertion_counts)
        and self.motif_clusters_9mer_counts (from tn5_9mer_counts), keyed by motif ID.
        """

        ## Paralelize, as of now I am going to do both ins and 9mer (choose one later?)

        # insertion counts
        for motif_id, insertion_counts_df in self.insertion_counts.items():
            self.motif_clusters_insertion_counts[motif_id] = self._cluster_counts(
                insertion_counts_df, n_clusters, random_state)

        # 9mers
        for motif_id, tn5_9mer_counts_df in self.tn5_9mer_counts.items():
            self.motif_clusters_9mer_counts[motif_id] = self._cluster_counts(
                tn5_9mer_counts_df, n_clusters, random_state)

    @staticmethod
    def _cluster_counts(counts_df, n_clusters, random_state):

        """
        Cluster the rows of one counts DataFrame; returns a two-column DataFrame
        (column 0: row name, column 1: cluster label).
        """

        row_names = list(counts_df.index)

        if len(row_names) < n_clusters:
            # KMeans refuses fewer samples than clusters; keep such motifs
            # in the results by putting every row in cluster 0
            labels = [0] * len(row_names)
        else:
            motif_clusters = KMeans(n_clusters = n_clusters, random_state = random_state)
            motif_clusters.fit(counts_df.values)
            labels = list(motif_clusters.labels_)

        return pandas.DataFrame([row_names, labels]).T
        


In [139]:
# Smoke test: parse the motifs file and count insertions in one hand-picked window.
example = Motifs(bam_file, motifs_file)
example.parse_motifs_file()
bam_handle = pysam.AlignmentFile(example.bam_file, "rb")
# dict views cannot be indexed on Python 3, so materialize the keys first
motif_id = list(example.motifs.keys())[5]
chrom = 'chr1'
start = 20784345
end = 20784360
Motifs.get_insertions(bam_handle, chrom, start, end)


# Toy example: four rows of seven counts each, labelled with motif-occurrence style names.
rownames_motifs = [
    'chr10_10114679_10114693_M5483_1.01',
    'chr10_10114679_10114693_M5483_1.02',
    'chr10_10114679_10114693_M5483_1.03',
    'chr10_10114679_10114693_M5483_1.04',
]

insertion_counts_matrix = pandas.DataFrame(
    [
        (9, 0, 0, 0, 4, 3, 5),
        (1, 0, 0, 4, 2, 1, 0),
        (0, 1, 0, 0, 0, 0, 2),
        (0, 0, 0, 0, 4, 1, 5),
    ],
    index = rownames_motifs
)

In [140]:
# Show the toy counts DataFrame (bare last expression -> rich display)
insertion_counts_matrix

Unnamed: 0,0,1,2,3,4,5,6
chr10_10114679_10114693_M5483_1.01,9,0,0,0,4,3,5
chr10_10114679_10114693_M5483_1.02,1,0,0,4,2,1,0
chr10_10114679_10114693_M5483_1.03,0,1,0,0,0,0,2
chr10_10114679_10114693_M5483_1.04,0,0,0,0,4,1,5


In [141]:
# Pull out the underlying numpy array, then report its dtype followed by its contents.
dataset_array = insertion_counts_matrix.values
print("{}\n{}".format(dataset_array.dtype, dataset_array))




int64
[[9 0 0 0 4 3 5]
 [1 0 0 4 2 1 0]
 [0 1 0 0 0 0 2]
 [0 0 0 0 4 1 5]]


In [158]:
dataset = insertion_counts_matrix

# Convert DataFrame to a plain numpy matrix
# (.values instead of as_matrix(), which no longer exists in pandas >= 1.0)
mat = dataset.values
# Using sklearn; the seed is fixed so the cluster labels are the same on every run
km = KMeans(n_clusters = 3, random_state = 0)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame: column 0 = row name, column 1 = cluster label
results = pandas.DataFrame([dataset.index,labels]).T

In [163]:
# Same frame as `results` before the transpose: row 0 holds the row names, row 1 the cluster labels
pandas.DataFrame([dataset.index,labels])

Unnamed: 0,0,1,2,3
0,chr10_10114679_10114693_M5483_1.01,chr10_10114679_10114693_M5483_1.02,chr10_10114679_10114693_M5483_1.03,chr10_10114679_10114693_M5483_1.04
1,0,1,1,2


In [153]:
# `dataset` is just another name for insertion_counts_matrix (assigned without copying)
dataset

Unnamed: 0,0,1,2,3,4,5,6
chr10_10114679_10114693_M5483_1.01,9,0,0,0,4,3,5
chr10_10114679_10114693_M5483_1.02,1,0,0,4,2,1,0
chr10_10114679_10114693_M5483_1.03,0,1,0,0,0,0,2
chr10_10114679_10114693_M5483_1.04,0,0,0,0,4,1,5


In [148]:
# Row labels of the toy matrix; these are what ends up in column 0 of `results`
dataset.index

Index([u'chr10_10114679_10114693_M5483_1.01',
       u'chr10_10114679_10114693_M5483_1.02',
       u'chr10_10114679_10114693_M5483_1.03',
       u'chr10_10114679_10114693_M5483_1.04'],
      dtype='object')

In [None]:


#### GENERATING THE PRIOR...

        #
        motifs = pybedtools.Bedtool(self.motifs_bed)
        motifs = motifs.sort()

        #
        peaks = pybedtools.Bedtool(self.peaks_bed)
        peaks = peaks.sort()

        #
        genes_tss = pybedtools.BedTool(self.tss_bed)
        genes_tss = genes_tss.sort()

        # assign peaks to closes feature in feature file
        peaks_to_genes = peaks.closest(genes_tss, D = 'b')
        motifs_to_genes = motifs.closest(peaks_to_genes)
    