# Import Packages 

In [3]:
import numpy as np
import re
import pandas as pd
import os
import glob
import pickle 

# Load sequences

In [4]:
def read_fasta(FASTA):
    '''
    Read fasta file 
    
    input: fasta file
    
    output: 
         - list of sequence  
         - list of sequence names
    '''
    
    m_file=open(FASTA,'r')

    data=''
    name_list=[]
    seq_list=[]

    for line in m_file:
        line=line.strip()
        for i in line:
            if i=='>':
                name_list.append(line[1:])
                if data :
                    seq_list.append(data)
                    data=''

                break
            else:
                line=line.upper()
        if all([k==k.upper() for k in line]):
            data=data+line
    if data:
        data = " ".join(data)
        seq_list.append(data)
        
    return seq_list, name_list


## Random sequences 

In [3]:
# Read 30% identity random seqeunce fasta file 
rand_fasta_file = "/home/defense/shokor/mapping2/data/id_30/randclust_rep_seq.fasta"
rand_seq_list, rand_name_list = read_fasta(rand_fasta_file) 
len(rand_name_list)

In [5]:
# Remove sequences found it in uniprot/swissprot
rand_out_seq_name = ['seq11648', 'seq1885', 'seq24400', 'seq19982', 'seq39583', 'seq22586', 'seq19626']

# save the names of sequence index
rand_drop_out = []
for i, s in enumerate(rand_name_list):
    if s in rand_out_seq_name:
        rand_drop_out.append(i)
#  Remove sequences      
for index in sorted(rand_drop_out, reverse=True):
    del rand_seq_list[index]
    del rand_name_list[index]
    
len(rand_seq_list)

49993

In [6]:
def generate_df(seq_list, name_list, seq_type):
    '''
    Generate a dataframe of : ID, Gene, Sequence, Description, Organism and Types
    
    input:
        - list of sequence
        - list of sequence names
        - sequence type: Random, Expected
        
    output:
        - Dataframe
    '''
    # for missing data
    unknown = ['None']*len(name_list)
    df = pd.DataFrame(
        {'ID': name_list,
         'Gene' : unknown,
         'Sequence': seq_list,
         'Description' : unknown,
         'Organism' : unknown,
         'Types': [seq_type for i in range(len(name_list))] 
        })
    return df

In [7]:
rand_df = generate_df(rand_seq_list, rand_name_list, 'Random')

In [8]:
# save dataframe
rand_df.to_csv('/home/defense/shokor/mapping2/data/random.csv')

In [9]:
rand_df = pd.read_csv("/home/defense/shokor/mapping2/data/random.csv")

In [10]:
rand_df

Unnamed: 0.1,Unnamed: 0,ID,Gene,Sequence,Description,Organism,Types
0,0,seq2901,,P G I P M D Y L P A P T Y S P R N C P A H E S ...,,,Random
1,1,seq2902,,V D N G Q V K S I C S Q V I M V L D N Q K R L ...,,,Random
2,2,seq2903,,F P I G F R T Y S I M Y T Y T S I R H G C R I ...,,,Random
3,3,seq2904,,A C E E Q S T W L D E L K F I S C V F F N P I ...,,,Random
4,4,seq2905,,Y D H M M I H E A I P A F K S C R R H H V A G ...,,,Random
...,...,...,...,...,...,...,...
49988,49988,seq38996,,V M N H Q Q Q P A I C I N D L T Q V V F R T D ...,,,Random
49989,49989,seq38997,,H C E N H I C G G L I S T G M A M P L Q D L Q ...,,,Random
49990,49990,seq38998,,G E S F H M D V G R H R H W S Y P L M H C P A ...,,,Random
49991,49991,seq38999,,N G L C Y L K Q C R L D H I M K K I P C W P R ...,,,Random


## Mutated sequences

In [1]:
mutate_fasta_file = "/home/defense/shokor/mapping2/data/id_30/mutateclust_rep_seq.fasta"

In [5]:
mutate_seq_list, mutate_name_list = read_fasta(mutate_fasta_file) 
len(mutate_seq_list)

9283

In [6]:
# create dataframe of randomly mutated sequence
unknown = ['None']*len(mutate_name_list)
mut_df = pd.DataFrame(
    {'ID': mutate_name_list,
     'Gene' : unknown,
     'Sequence': mutate_seq_list,
     'Description' : unknown,
     'Organism' : unknown,
     'Types': [i[-9:] for i in mutate_name_list]
    })

In [7]:
mut_df

Unnamed: 0,ID,Gene,Sequence,Description,Organism,Types
0,P80566 mutate_99,,M R F G V K V P V M K G D A P K E C L I H F Y ...,,,mutate_99
1,O09164 mutate_99,,M N R F F F W G L M N A N C G S P T N A N K K ...,,,mutate_99
2,Q9FK60 mutate_99,,L F G P A Q I L K A L A V A A G D Y C E N G Y ...,,,mutate_99
3,P00443 mutate_99,,L M L K A Y N K L Q G D S H V K F F I H F E A ...,,,mutate_99
4,P10791 mutate_99,,M H V K A V L V D N L Q W R G M M V F M K I G ...,,,mutate_99
...,...,...,...,...,...,...
9278,Q9MAB6 mutate_10,,M M M L R Q T S R K A Y L G L Q A S P A G L G ...,,,mutate_10
9279,Q5SKZ7 mutate_10,,M S A S S E R E L Y E A W V E L F S W M R E Y ...,,,mutate_10
9280,P40207 mutate_10,,M S L K D R Y L T L E L K L P N K L Q E L Y Y ...,,,mutate_10
9281,P37193 mutate_10,,M F C L G L R R S A V H Q S C K L I S K Q I A ...,,,mutate_10


In [35]:
# save dataframe
mut_df.to_csv('/home/defense/shokor/mapping2/data/mutate.csv')

## Blosum mutate

In [None]:
blos_mutate_fasta_file = "/home/defense/shokor/mapping2/data/id_30/blosum62_mutate_real_seq.fasta"

In [None]:
mutate_seq_list, mutate_name_list = read_fasta(mutate_fasta_file) 
len(mutate_seq_list)

In [None]:
# create dataframe of blossum-based mutated sequence
unknown = ['None']*len(mutate_name_list)
blos_df = pd.DataFrame(
    {'ID': mutate_name_list,
     'Gene' : unknown,
     'Sequence': mutate_seq_list,
     'Description' : unknown,
     'Organism' : unknown,
     'Types': [i[-16:] for i in mutate_name_list]
    })

In [None]:
blos_df

In [None]:
# save dataframe
blos_df.to_csv('/home/defense/shokor/mapping2/data/blosum_mutate.csv')

## Expected sequences

In [11]:
expected_fasta_file = "/home/defense/shokor/mapping2/data/expected_seq.fasta"

In [12]:
expected_seq_list, expected_name_list = read_fasta(expected_fasta_file) 
len(expected_seq_list)

50000

In [13]:
# Remove sequences found it in uniprot/swissprot
exp_out_seq_name = ['expected_seq33114', 'expected_seq16872', 'expected_seq38982', 'expected_seq42840']

exp_drop_out = []
for i, s in enumerate(expected_name_list):
    if s in exp_out_seq_name:
        exp_drop_out.append(i)
        
for index in sorted(exp_drop_out, reverse=True):
    del expected_seq_list[index]
    del expected_name_list[index]
    
len(expected_name_list)

49996

In [14]:
exp_df = generate_df(expected_seq_list, expected_name_list, 'Expected')

In [16]:
exp_df

Unnamed: 0,ID,Gene,Sequence,Description,Organism,Types
0,expected_seq1,,V V H V D L R Y Q R W S F R F L Y S E R P N V ...,,,Expected
1,expected_seq2,,Q E V G F E E A L P V T S I R I F Q G V R N F ...,,,Expected
2,expected_seq3,,E K E H K G L Y T G L L F M P K A L K V S A L ...,,,Expected
3,expected_seq4,,P I P M V I D G I H V A P A T D L D K R T N P ...,,,Expected
4,expected_seq5,,A P R Q Q E I Q N L S L G S C T Q R V E G L T ...,,,Expected
...,...,...,...,...,...,...
49991,expected_seq49996,,N D I F H L D F D T R R D K P P E R S R C D S ...,,,Expected
49992,expected_seq49997,,S F D G L A A L T A F L I R S G N D L L I D K ...,,,Expected
49993,expected_seq49998,,K Q V E G S T S I A L R V V T K M D C E D E K ...,,,Expected
49994,expected_seq49999,,E E T T D V V L Y E G L T R T Q E L V D V Y G ...,,,Expected


In [15]:
# save dataframe
exp_df.to_csv('/home/defense/shokor/mapping2/data/expected.csv')

## Real sequences

Real sequence file is so large, for that we will save the names and the sequences seperatly, than generate a dataframe with: 'ID', 'Gene', 'Description', 'Organism', 'Types', and in a second time we will add the sequences

In [3]:
real_fasta_file = "/home/defense/shokor/mapping2/data/uniclust30_2018_08/uniclust30_2018_08_seed.fasta"

In [None]:
real_seq_list, real_name_list = read_fasta(real_fasta_file) 

In [8]:
# save sequence (30% identity and length <= 256) in a file
with open("data/id_30/real_seq_list_uniclust", 'wb') as f: 
     pickle.dump(real_seq_list, f)

In [2]:
# read sequence list
with open('data/id_30/real_seq_list_uniclust', 'rb') as f: 
    real_seq_list = pickle.load(f) 

In [9]:
# save sequence names in a file
with open("data/id_30/real_name_uniclust", 'wb') as f: 
     pickle.dump(real_name, f)

In [2]:
# read names list
with open('data/id_30/real_name_uniclust', 'rb') as f: 
    real_name = pickle.load(f) 

In [3]:
len(real_seq_list)

15161832

In [4]:
# create id, organism, gene and description list 
ids = []
OS = []
gene = []
desc = []

for n in real_name:
    ids.append(re.findall(r'\|(.*?)\|', n))
    OS.append(re.findall(r"OS *[^\w ] *(.*) OX",n))
    gene.append(re.findall(r"GN *[^\w ] *(.*) PE",n))
    k = n.split('_')[1]
    d = re.findall(r"\w+(?=.* OS)",k)[1:]
    desc.append(" ".join(str(x) for x in d))
    

real_name_list = [item[0] for item in ids]
OS_real_list = [item for s in OS for item in s]
gene_real_list = [item for s in gene for item in s]

# for the missing gene names
if len(gene_real_list) < len(real_name):
    missing = len(real_name) - len(gene_real_list)
    unk = ['None']*missing 
    gene_real_list = gene_real_list + unk
    
#print(real_name_list)
#len(real_seq_list)

In [5]:
len(gene_real_list)

15161832

In [22]:
# Create a None list for missing values
unknown = ['None']*len(arti_seq_list)
#len(unknown)

In [37]:
# Create a Dataframe
df = pd.DataFrame(
    {'ID': real_name_list + arti_name_list,
     'Gene' : gene_real_list + unknown,
     #'Sequences': real_seq_list + arti_seq_list,
     'Description' : desc + unknown,
     'Organism' : OS_real_list + unknown,
     'Types': ["Real" for i in range(len(real_name_list))] + ["Artificial" for i in range(len(arti_name_list))] 
    })
df

Unnamed: 0,ID,Gene,Description,Organism,Types
0,A0A1Q6BRU5,AUO95_00920,Uncharacterized protein,Corynebacterium glutamicum,Real
1,A0A0A9U2E6,SAMN06296036_10138,Uncharacterized protein,Arundo donax,Real
2,A0A1Y6B8R2,ED92_14080,L ring protein,Pseudobacteriovorax antillogorgiicola,Real
3,A0A094PJX3,CL693_16430,Uncharacterized protein,Amycolatopsis sp. MJM2582,Real
4,A0A0P6A7Z7,LR48_Vigan04g171000,Uncharacterized protein,Daphnia magna,Real
...,...,...,...,...,...
15162827,seq196,,,,Artificial
15162828,seq197,,,,Artificial
15162829,seq198,,,,Artificial
15162830,seq199,,,,Artificial


In [7]:
# add sequence column
df['Sequence'] = real_seq_list

In [None]:
# Reorder columns name
df = df[['ID', 'Gene', 'Sequence', 'Description', 'Organism', 'Types']]

In [5]:
df

Unnamed: 0,ID,Gene,Description,Organism,Types,Sequence
0,A0A1Q6BRU5,AUO95_00920,Uncharacterized protein,Corynebacterium glutamicum,Real,M A K I T V F S P V K N F S G V S V G I T F S ...
1,A0A0A9U2E6,SAMN06296036_10138,Uncharacterized protein,Arundo donax,Real,M L D A L S I F S K V S L G L V V V R E P L G ...
2,A0A1Y6B8R2,ED92_14080,L ring protein,Pseudobacteriovorax antillogorgiicola,Real,M K L Q V V I I L S S F L V T G C Q T R V V P ...
3,A0A094PJX3,CL693_16430,Uncharacterized protein,Amycolatopsis sp. MJM2582,Real,M S S A R R T L E G V D N G I W P A F P V T R ...
4,A0A0P6A7Z7,LR48_Vigan04g171000,Uncharacterized protein,Daphnia magna,Real,M S P M C T S L C V R I S S T F A F P L A S S ...
...,...,...,...,...,...,...
15161827,A0A0F9WLH2,,Uncharacterized protein,Nosema ceranae,Real,M M T N E E I Q N L Y F Q V K L S K Q R R K C ...
15161828,A0A2E7FA34,,Uncharacterized protein,Rhodospirillaceae bacterium,Real,M L T S F H Y E V G L I L L I A P H V F G A P ...
15161829,D8N947,,Uncharacterized protein,Ralstonia solanacearum CMR15,Real,M S W M R H C R V A S W T W G V A S V V G P S ...
15161830,A0A1P8LWX1,,Uncharacterized protein,Halobiforma lacisalsi AJ5,Real,M S A T Q Q S F G M S Y D V T D T S L K T A F ...


In [15]:
# for missing description 
missing_desc = df.loc[df['Description'] == ''].index.tolist()
for i in missing_desc:
    df['Description'][i] = 'None'

In [7]:
# Drop out sequence with a length higher then 1000 (2000 after adding spaces)

drop_out = []

sequence = df['Sequence'].to_list()

for i, s in enumerate(sequence):
    if len(s)>511:
        drop_out.append(i)

df = df.drop(df.index[drop_out])

In [8]:
# Reindex dataframe 
df = df.reset_index(drop=True)

In [16]:
df

Unnamed: 0,ID,Gene,Description,Organism,Types,Sequence
0,A0A1Q6BRU5,AUO95_00920,Uncharacterized protein,Corynebacterium glutamicum,Real,M A K I T V F S P V K N F S G V S V G I T F S ...
1,A0A0A9U2E6,SAMN06296036_10138,Uncharacterized protein,Arundo donax,Real,M L D A L S I F S K V S L G L V V V R E P L G ...
2,A0A094PJX3,CL693_16430,Uncharacterized protein,Amycolatopsis sp. MJM2582,Real,M S S A R R T L E G V D N G I W P A F P V T R ...
3,A0A0P6A7Z7,LR48_Vigan04g171000,Uncharacterized protein,Daphnia magna,Real,M S P M C T S L C V R I S S T F A F P L A S S ...
4,A0A2E2XLC2,DLM60_08525,Uncharacterized protein Fragment,Cellvibrionaceae bacterium,Real,S K L C F D N V E N H P L L G T I Q K N Q L I ...
...,...,...,...,...,...,...
10578495,A0A0F9WLH2,,Uncharacterized protein,Nosema ceranae,Real,M M T N E E I Q N L Y F Q V K L S K Q R R K C ...
10578496,A0A2E7FA34,,Uncharacterized protein,Rhodospirillaceae bacterium,Real,M L T S F H Y E V G L I L L I A P H V F G A P ...
10578497,D8N947,,Uncharacterized protein,Ralstonia solanacearum CMR15,Real,M S W M R H C R V A S W T W G V A S V V G P S ...
10578498,A0A1P8LWX1,,Uncharacterized protein,Halobiforma lacisalsi AJ5,Real,M S A T Q Q S F G M S Y D V T D T S L K T A F ...


In [35]:
# save dataframe
df.to_csv('/home/defense/shokor/mapping2/data/real_df.csv')

In [36]:
df = pd.read_csv("/home/defense/shokor/mapping2/data/real_df.csv").iloc[: , 1:]

In [38]:
df

Unnamed: 0,ID,Gene,Sequence,Description,Organism,Types
0,A0A1Q6BRU5,AUO95_00920,M A K I T V F S P V K N F S G V S V G I T F S ...,Uncharacterized protein,Corynebacterium glutamicum,Real
1,A0A0A9U2E6,SAMN06296036_10138,M L D A L S I F S K V S L G L V V V R E P L G ...,Uncharacterized protein,Arundo donax,Real
2,A0A094PJX3,CL693_16430,M S S A R R T L E G V D N G I W P A F P V T R ...,Uncharacterized protein,Amycolatopsis sp. MJM2582,Real
3,A0A0P6A7Z7,LR48_Vigan04g171000,M S P M C T S L C V R I S S T F A F P L A S S ...,Uncharacterized protein,Daphnia magna,Real
4,A0A2E2XLC2,DLM60_08525,S K L C F D N V E N H P L L G T I Q K N Q L I ...,Uncharacterized protein Fragment,Cellvibrionaceae bacterium,Real
...,...,...,...,...,...,...
10578495,A0A0F9WLH2,,M M T N E E I Q N L Y F Q V K L S K Q R R K C ...,Uncharacterized protein,Nosema ceranae,Real
10578496,A0A2E7FA34,,M L T S F H Y E V G L I L L I A P H V F G A P ...,Uncharacterized protein,Rhodospirillaceae bacterium,Real
10578497,D8N947,,M S W M R H C R V A S W T W G V A S V V G P S ...,Uncharacterized protein,Ralstonia solanacearum CMR15,Real
10578498,A0A1P8LWX1,,M S A T Q Q S F G M S Y D V T D T S L K T A F ...,Uncharacterized protein,Halobiforma lacisalsi AJ5,Real
