In [5]:
import re
import time
from gensim import corpora
from gensim.models import Word2Vec
debug = False

In [6]:
# finds overlapping regions - but assumes each token is in order
# thus the start of token 2 will always be after the start of token 1
def find_overlaps(tokens):
    # List to store overlapping intervals
    overlaps = []
    
    # Iterate through the sorted list and check for overlaps
    for i in range(1, len(tokens)):
        i1, w1, s1, e1 = tokens[i - 1]
        i2, w2, s2, e2 = tokens[i]
        
        # scenario 1 : start of tokem 2 is before the end of token 1
        if s2 <= e1:
            overlaps.append((tokens[i - 1], tokens[i]))
        # scenario 2 : start 1 is before the end of token 1
        #if s2 <= e1:
        #    overlaps.append((tokens[i - 1], tokens[i]))
    return overlaps


# finds overlapping regions - and removes them, assumes the tokens
# are in order  - thus the start of token 2 will always be after the start of token 1
def remove_overlaps(tokens):
    # List to store overlapping intervals
    result = []
    prev_start, prev_end = None, None
    
    for token in tokens:
        start, end = token[2], token[3]
        
        # start with first token
        if prev_start is None:
            prev_start, prev_end = start, end
            result.append(token)
        # if there is overlap - don't add this item (for now)    
        else:
            if start <= prev_end:
                #print('--------------> overlap : ', start , end)
                # Overlapping interval found, skip adding this item
                continue
            else:
                # No overlap, add the item to the result list
                result.append(token)
                prev_start, prev_end = start, end
    return result

In [8]:
def create_corpus():
    input_file      = "/Users/patrick/dev/ucl/comp0158_mscproject/data/corpus/pre_corpus_20240715_1130.dat"
    corpus = []
    
    PARSE_LIMIT  = 1000000  # number of lines to parse
    DEBUG_LIMIT  = 100000   # number of lines after which to print a debug message
    
    start_time      = time.time()
    mid_time_start  = time.time()
    
    corpus = []
    
    # parse all lines in the pre-corpus (each represents a protein) and build up
    # the tokens for each - reminv overlaps
    with open(input_file, 'r') as input:
        for line_number, line in enumerate(input): # one line number per protein
            
            # protein details, pfam tokens and disordered tokens are separated by |
            # within that are the start and end poisitions
            # e.g. A0A010PZP8:1:633|PF00172:16:53|PF04082:216:322|DISORDER:50:103:50:109:553:598
            # 1 protein, 2 pfam tokens, 3 disordered regions
            
            cols  = line.split('|')
            token_idx = 0
            
            if(debug): print('\nline >', line.strip('\n'), '<')
            #print(len(cols), '> entries')
            
            # tokens for the current line
            tokens = []

            # each col is a section - either being the uniptor part, pfam or disoreded reginos
            for col in cols:
                col = col.rstrip("\n\s\t")
                # just in case
                if col == None or col == "":
                    continue
                # process a PFAM token
                if col.startswith('PF'):
                    pf_cols = col.split(':')
                    pf_token = pf_cols[0]
                    for pf in range(1, len(pf_cols)-1,2):
                        #print('PFM:', token_idx, ':', pf_token, 'start:', pf_cols[pf],'end:', pf_cols[pf + 1])
                        tuple = (token_idx, pf_token, int(pf_cols[pf]), int(pf_cols[pf + 1]))
                        tokens.append(tuple)
                        token_idx += 1
                # process a 'disordered' token
                elif col.startswith('DIS'):
                    dis_cols = col.split(':')
                    for dis in range(1, len(dis_cols)-1,2):
                        #print('DIS:', token_idx, ': start:', dis_cols[dis],'end:', dis_cols[dis + 1])
                        tuple = (token_idx, 'DISORDER', int(dis_cols[dis]), int(dis_cols[dis+1]))
                        tokens.append(tuple)
                        token_idx += 1
                # just printing out the token if needed
                else:
                    protein_cols = col.split(':')
                    #print('PROT:', protein_cols[0], 'start:', protein_cols[1], 'end:', protein_cols[2])
            #print('tokens:', tokens)
            
            # sort the tokens by start point (second item)
            sorted_tokens = sorted(tokens, key=lambda x: x[2])
            sorted_tokens_no_overlap = remove_overlaps(sorted_tokens)
            
            if(debug): 
                print('unsorted', tokens)
                print('sorted:', sorted_tokens)
                print('no overlaps',sorted_tokens_no_overlap)
            
            sentence = []
            for token in sorted_tokens_no_overlap:
                sentence.append(token[1])
                sentence.append('GAP')
            if(debug): print('final sentence:', sentence)
            
            # add to corpus
            if(len(sentence) != 0):
                corpus.append(sentence)
            
            # this just prints a progress message
            if (line_number % DEBUG_LIMIT == 0):
                mid_time_end = time.time()
                exec_time = mid_time_end - mid_time_start
                mid_time_start = mid_time_end
                print(line_number, 'lines processed in', round(mid_time_end - start_time,2))
            
            # drops out if we only want to process a number of files
            if(PARSE_LIMIT != -1):            
                if(line_number == PARSE_LIMIT):
                    end_time = time.time()
                    tot_time = end_time - start_time
                    print(PARSE_LIMIT, 'lines processed, terminating....')
                    return corpus
    return corpus

  col = col.rstrip("\n\s\t")


In [9]:
corpus = []
corpus = create_corpus()

0 lines processed in 0.0
100000 lines processed in 0.37
200000 lines processed in 0.84
300000 lines processed in 1.28
400000 lines processed in 1.81
500000 lines processed in 2.25
600000 lines processed in 2.82
700000 lines processed in 3.33
800000 lines processed in 3.77
900000 lines processed in 4.3
1000000 lines processed in 4.75
1000000 lines processed, terminating....


In [None]:
print("\n***** CORPUS *****:\n",corpus,'\n')

In [12]:
w2v = Word2Vec(corpus, vector_size=100, window=5, workers=4, min_count=5)

In [18]:
#words = list(w2v.wv.vocab)
print('"DISORDER" VECTOR:\n', w2v.wv['DISORDER'])
print('\n"PF00250" VECTOR:\n', w2v.wv['PF00250'])

"DISORDER" VECTOR:
 [-0.35966924  0.02456524  0.05076128 -0.14754885 -0.0668508  -0.3754027
 -0.04968289  0.09947164 -0.03055772 -0.10764594  0.71828926  0.21175925
 -0.15042204 -0.19736947 -0.2784051   0.5424766   0.18962252 -0.43741164
 -0.18036567 -0.58696336  0.6699122   0.31809402  1.0590287  -0.11786786
  0.34235743 -0.5437084   0.3865804   0.05919965 -0.8444295   0.0104759
  0.10678057  0.19904487  0.885017   -0.6592032  -0.3831204  -0.19140661
  0.05293044  0.23828791  0.21361044  0.40122643 -0.32491246  0.4420818
 -0.4419638  -0.559211   -0.16986938  0.62000066 -0.4115376   0.9462987
  0.18533881  0.12894966  0.19906375  0.11780615 -0.35628316  0.10660051
  0.3221957  -0.12932001  0.5308065   0.12141352 -0.5506251   0.2961019
 -0.12147589  0.15391962 -0.3452807  -0.5733853   0.35994604  0.33842188
 -0.00676064 -0.46636474 -0.00536083  0.52723366 -0.25583145 -0.1761985
 -0.33719715 -0.6141483  -0.23763172 -0.5642704   0.50907147 -0.4869721
  0.4697093   0.47759333  0.11578534  

In [None]:
w2v = Word2Vec()