In [1]:
import os
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from collections import Counter
import random
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import glob
from multiprocessing import Pool

In [14]:
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this cell runs is suppressed.
# NOTE(review): this presumably exists to hide pandas/scikit-learn warnings
# triggered by the helpers below; it will equally hide deprecation notices
# that signal real breakage on library upgrades -- consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
def readingBedfiles(path):
    """Load every file matching a glob pattern into its own DataFrame.

    Each file is read as whitespace-delimited text, one record per line, and
    every field is kept as a string. Columns 3 through 9 are dropped when
    they exist, so a ten-column file is reduced to columns 0, 1 and 2.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to read.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matched file, ordered by sorted file path. Column
        labels are the integer field positions.
    """
    unwanted_columns = [3, 4, 5, 6, 7, 8, 9]
    dfs = []
    for file in sorted(glob.glob(path)):
        with open(file, 'r') as f:
            # Skip blank lines: an empty field list would otherwise turn
            # into a row filled with missing values.
            content = [fields for fields in (line.split() for line in f) if fields]
        # errors='ignore' keeps files with fewer than ten fields loadable
        # instead of failing with KeyError on the absent column labels.
        df = pd.DataFrame(content).drop(columns=unwanted_columns, errors='ignore')
        dfs.append(df)
    print('Reading Files Done')
    return dfs

In [3]:
def to_ranges(iterable):
    """Yield ``(first, last)`` for each run of consecutive values.

    The input is de-duplicated and sorted before runs are detected, so the
    order and multiplicity of the incoming values do not matter. An isolated
    value is reported as a run whose first and last element are equal.
    """
    ordered = sorted(set(iterable))
    position = itertools.count()
    # Along a run of consecutive values, "value minus position" is constant;
    # it changes exactly where a gap occurs, which is what groupby keys on.
    runs = itertools.groupby(ordered, key=lambda value: value - next(position))
    for _, run in runs:
        members = list(run)
        yield members[0], members[-1]

In [4]:
def document_creation_loc(dfs, chr_name, fileList):
    """Expand one chromosome's intervals into per-file lists of positions.

    For every frame in ``dfs`` the rows whose column 0 equals
    ``'chr' + str(chr_name)`` are selected, columns 1 and 2 are converted to
    integers, the rows are sorted by (column 1, column 2), and each row is
    expanded into every integer from column 1 to column 2 inclusive.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames with column 0 = chromosome label, 1 = start, 2 = end.
    chr_name : int or str
        Chromosome identifier without the ``chr`` prefix.
    fileList : list of str
        File names, indexed by the position of the matching frame in ``dfs``.

    Returns
    -------
    dict
        Maps ``'chr<name>_<file stem>'`` to the list of positions. Frames
        with no rows for the chromosome contribute no entry.
    """
    chrom = 'chr' + str(chr_name)
    document = {}
    for file_index, df in enumerate(dfs):
        # astype/sort_values return new frames, so the filtered slice is
        # never assigned into (the pattern behind SettingWithCopyWarning)
        # and the caller's frame is left untouched.
        df_chr = (
            df.loc[df[0] == chrom]
            .astype({1: int, 2: int})
            .sort_values(by=[1, 2])
        )
        if len(df_chr) == 0:
            continue
        positions = []
        for start, end in zip(df_chr[1], df_chr[2]):
            # NOTE(review): the end coordinate is treated as inclusive here;
            # confirm this matches the coordinate convention of the inputs.
            positions.extend(range(start, end + 1))
        # NOTE(review): only the text before the first '.' of the file name
        # is used, so two names sharing that prefix map to the same key and
        # the later one overwrites the earlier one.
        document[chrom + '_' + str(fileList[file_index].split('.')[0])] = positions
    return document

In [5]:
#Prepare Data for Vectorization
def documentPrepforVec(document):
    """Turn each document's token list into one space-separated string.

    The keys of ``document`` are ignored; one string is produced per value,
    in the dictionary's iteration order, with every token passed through
    ``str`` first.
    """
    return [' '.join(map(str, tokens)) for tokens in document.values()]

In [6]:
#Tf_idf Vectorization document
def tf_idfVect_document(train_doclist, test_doclist, max_feature):
    """Vectorize train and test documents with one shared TF-IDF model.

    A ``TfidfVectorizer`` limited to ``max_feature`` features is fitted on
    the training documents only; the test documents are then transformed
    with that same fitted vectorizer.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as returned by ``fit_transform`` and
        ``transform`` respectively.
    """
    tfidf = TfidfVectorizer(max_features=max_feature)
    train_matrix = tfidf.fit_transform(train_doclist)
    return train_matrix, tfidf.transform(test_doclist)

In [7]:
#Tf_idf Vectorization
def tf_idfVect(dictlist, max_feature):
    """Fit a TF-IDF vectorizer on the documents and return the fitted model.

    Parameters
    ----------
    dictlist : list of str
        Documents to learn the vocabulary from.
    max_feature : int
        Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer; no document matrix is returned.
    """
    vectorizer = TfidfVectorizer(max_features=max_feature)
    # Only the fitted vectorizer is needed by callers, so fit() is used
    # rather than fit_transform(), whose result was built and thrown away.
    vectorizer.fit(dictlist)
    return vectorizer

In [8]:
# Locus Labeling
def Loc_labeling(vectorizer, chr_name):
    """Collapse a vectorizer's integer vocabulary into contiguous ranges.

    Every feature name is parsed as an integer, consecutive integers are
    merged into ``(start, end)`` runs via ``to_ranges``, and only runs
    spanning more than 100 values are kept.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or, on older scikit-learn,
        ``get_feature_names()``; all names must be integer strings.
    chr_name : int or str
        Chromosome identifier without the ``chr`` prefix.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (row position before filtering), ``start``,
        ``end`` and ``chrom``.
    """
    # get_feature_names() no longer exists in current scikit-learn releases;
    # use its successor when available and fall back for older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    featureRange = list(to_ranges(int(token) for token in vocabulary))

    # Explicit column labels keep columns 0 and 1 present even when no range
    # was found, so the arithmetic below does not fail with KeyError.
    feature_ranges = pd.DataFrame(featureRange, columns=[0, 1])
    feature_ranges['len'] = feature_ranges[1] - feature_ranges[0] + 1
    feature_ranges = feature_ranges[feature_ranges['len'] > 100].reset_index()
    feature_ranges['chrom'] = 'chr' + str(chr_name)
    feature_ranges = feature_ranges.rename(columns={0: 'start', 1: 'end'})
    return feature_ranges.drop(columns=['len'])


In [9]:
# Document Creation
def document_creation_word(feature_ranges, document):
    """Translate each document's position runs into a string of words.

    For every document, its positions are merged into consecutive runs with
    ``to_ranges`` and each run is represented by the integer part of its
    mean. That midpoint is looked up in ``feature_ranges['range']``; the
    ``'word'`` of the first containing entry is appended to the document
    text, preceded by a space. Midpoints matching nothing are skipped.

    Parameters
    ----------
    feature_ranges : pandas.DataFrame
        Must provide a ``'range'`` column of containers supporting ``in``
        and a ``'word'`` column of strings.
    document : dict
        Maps document name to an iterable of integer positions.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_name`` and ``text``, one row per document.
    """
    list_feature_ranges = feature_ranges['range']
    words = feature_ranges['word']
    rows = []
    for doc in document.keys():
        midpoints = [int(np.mean(rng)) for rng in to_ranges(document[doc])]
        doc_text = ''
        for midpoint in midpoints:
            # Only the first matching range is ever used, so stop scanning
            # as soon as one is found.
            first_hit = next(
                (i for i, lst in enumerate(list_feature_ranges) if midpoint in lst),
                None,
            )
            if first_hit is not None:
                doc_text = doc_text + ' ' + words.iloc[first_hit]
        rows.append([doc, doc_text])
    # Build the frame once instead of calling DataFrame.append per document
    # (removed in pandas 2.0, and quadratic). Every row keeps index label 0,
    # which is what the repeated single-row appends used to produce.
    document_text = pd.DataFrame(
        rows, columns=['doc_name', 'text'], index=[0] * len(rows)
    )
    print('lenWordDocument', len(document_text))
    return document_text
    

In [10]:
def tf_idf_chrom(listofparams):
    """Produce the feature-range table for one chromosome.

    ``listofparams`` is read positionally: ``[0]`` the list of frames,
    ``[1]`` the chromosome name, ``[2]`` the file-name list and ``[3]`` the
    maximum number of features. Taking a single packed argument lets the
    function be handed directly to ``Pool.map``.

    Returns the DataFrame built by ``Loc_labeling``.
    """
    frames, chromosome, names, feature_cap = (listofparams[slot] for slot in range(4))
    per_file_positions = document_creation_loc(frames, chromosome, names)
    fitted = tf_idfVect(documentPrepforVec(per_file_positions), feature_cap)
    return Loc_labeling(fitted, chromosome)
#     corpus = pd.concat([corpus, feature_ranges])


In [11]:
def representationLearning(path, maxfeature, path_filelist, clas_type, numberofCores, path_representation):
    """Build the per-chromosome range atlas and write it as a BED-like file.

    The files matched by ``path`` are loaded once, then ``tf_idf_chrom`` is
    run for chromosomes 1-22, X and Y in a process pool. The resulting
    tables are concatenated and the ``chrom``/``start``/``end`` columns are
    written tab-separated, without header or index, to
    ``path_representation + '<clas_type>_<maxfeature/100000>_atlas.bed'``.

    Parameters
    ----------
    path : str
        Glob pattern of the input files.
    maxfeature : int
        Feature cap forwarded to the vectorizer; also part of the file name.
    path_filelist : str
        Directory whose sorted listing supplies the file names.
    clas_type : str
        Label used in the output file name.
    numberofCores : int
        Number of worker processes.
    path_representation : str
        Prefix (normally a directory ending in a separator) of the output.

    Returns
    -------
    pandas.DataFrame
        The ``chrom``, ``start`` and ``end`` columns that were written.
    """
    # NOTE(review): names come from listing `path_filelist` while the frames
    # come from globbing `path`; document_creation_loc pairs them purely by
    # position, so confirm both refer to the same set of files.
    fileList = sorted(os.listdir(path_filelist))
    chrs = list(range(1, 23)) + ['X', 'Y']
    print(chrs)

    # Resolve the destination first and create its directory, so a missing
    # folder does not surface only after all the computation has finished.
    output_file = path_representation + '{}_{}_atlas.bed'.format(clas_type, maxfeature/100000)
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dfs = readingBedfiles(path)
    print('maxNoOfFeatures', maxfeature)

    # One packed argument list per chromosome, as tf_idf_chrom expects.
    tasks = [[dfs, chr_name, fileList, maxfeature] for chr_name in chrs]
    # The context manager releases the worker processes once map() returns;
    # previously a new pool was leaked on every call.
    with Pool(numberofCores) as pool:
        df_list = pool.map(tf_idf_chrom, tasks)
    print('2==========================================')
    corpus = pd.concat(df_list)
    print('3==========================================')
    atlas = corpus[['chrom', 'start', 'end']]
    atlas.to_csv(output_file, sep='\t', index=False, header=False)
    return atlas

# Test

In [12]:
# Run configuration: input location, class label, output folder, worker count.
clas_type = 'antibody'
numberofCores = 6
path_filelist = './'
# Glob pattern for the input files, relative to path_filelist.
path = ''.join([path_filelist, "datasets/antibodydataset/test/*"])
# Output folder is named after the class label.
path_representation = './representations/' + clas_type + '/'

In [1]:
%%time

# Build one atlas per vocabulary size; each run writes its own output file.
for maxfeature in (100000, 500000, 1000000):
    representationLearning(
        path=path,
        maxfeature=maxfeature,
        path_filelist=path_filelist,
        clas_type=clas_type,
        numberofCores=numberofCores,
        path_representation=path_representation,
    )
#     print('maxNoOfFeatures', maxfeature)


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


In [33]:
# %%time
# corpus = pd.DataFrame()
# for chr_name in chrs:
#     documents = document_creation_loc(dfs, chr_name, fileList)
#     dictlist = documentPrepforVec(documents)
#     vectorizer = tf_idfVect(dictlist, maxfeature)
#     feature_ranges = Loc_labeling(vectorizer, chr_name)
# #     document_text = document_creation_word(feature_ranges, documents)
#     corpus = pd.concat([corpus, feature_ranges])
#     print('===========================')