-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_mol_stats.py
executable file
·152 lines (145 loc) · 5.42 KB
/
make_mol_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/python
import sys
import os
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from collections import Counter
#get the alignment file
align = sys.argv[1]
#get prefix
prefix = Path(align).stem
#create iterable container of SeqRecords
f = SeqIO.parse(align, 'fasta')
#create mol_stats directory
os.system("mkdir -p mol_stats")
#initializes sequence length
seq_length = 0
#codon table
codon_table = {
'A': ('GCT', 'GCC', 'GCA', 'GCG'),
'C': ('TGT', 'TGC'),
'D': ('GAT', 'GAC'),
'E': ('GAA', 'GAG'),
'F': ('TTT', 'TTC'),
'G': ('GGT', 'GGC', 'GGA', 'GGG'),
'H': ('CAT', 'CAC'),
'I': ('ATT', 'ATC', 'ATA'),
'K': ('AAA', 'AAG'),
'L': ('TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'),
'M': ('ATG',),
'N': ('AAT', 'AAC'),
'P': ('CCT', 'CCC', 'CCA', 'CCG'),
'Q': ('CAA', 'CAG'),
'R': ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
'S': ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'),
'T': ('ACT', 'ACC', 'ACA', 'ACG'),
'V': ('GTT', 'GTC', 'GTA', 'GTG'),
'W': ('TGG',),
'Y': ('TAT', 'TAC')
}
#compute the GC-richness for each sequence
with open("./mol_stats/"+prefix+".seq_GC.csv", 'w') as file:
for row in f:
file.write(row.id+","+str(GC(row.seq))+'\n')
seq_length = len(row.seq)
print("Generated " +prefix+ ".seq_GC.csv")
#compute the GC-richness for each site
with open("./mol_stats/"+prefix+".site_GC.csv", 'w') as file:
#iterates over each site
for x in range(seq_length):
site = ""
f = SeqIO.parse(align, 'fasta')
#iterates over each sequence
for row in f:
#adds the base from each site of the sequences
site += row.seq[x]
file.write(str(x)+","+str(GC(Seq(site)))+'\n')
print("Generated " +prefix+ ".site_GC.csv")
#determine if each site is phylogenetically informative
with open("./mol_stats/"+prefix+".site_phylo_inf.csv", 'w') as file:
#iterates over each site
for x in range(seq_length):
site = ""
counter = 0
informative = False
f = SeqIO.parse(align, 'fasta')
#iterates over each sequence
for row in f:
site += row.seq[x]
#checks if any site has more than 2 occurrences
if Seq(site).count('A') >= 2:
counter += 1
if Seq(site).count('C') >= 2:
counter += 1
if Seq(site).count('T') >= 2:
counter += 1
if Seq(site).count('G') >= 2:
counter += 1
#checks if there are more than 2 bases that have more than 2 occurrences
if counter >= 2:
#sets informative boolean to true
informative = True
file.write(str(x)+","+str(informative)+'\n')
print("Generated " +prefix+ ".site_phylo_inf.csv")
#determine the codon frequencies for each sequence
with open("./mol_stats/"+prefix+".seq_codon.csv", 'w') as file:
f = SeqIO.parse(align, 'fasta')
#iterates over ever sequence
for row in f:
#gets a list of codons from the sequence
codons = [str(row.seq[i:i+3]) for i in range(0, len(row.seq), 3)]
#counts the number of occurences of each codon
c = Counter(codons)
#iterates through the Counter and adds the frequency to the csv
for k,v in c.items():
#only add valid codons
if "-" not in k:
file.write(row.id + "," + k + "," + str(v/len(codons)) + '\n')
print("Generated " +prefix+ ".seq_codon.csv")
#compute codon frequencies for each site
with open("./mol_stats/"+prefix+".site_codon.csv", 'w') as file:
#iterates through codon sites
for x in range(0, seq_length, 3):
site_codon = []
f = SeqIO.parse(align, 'fasta')
#iterates through every seq
for row in f:
#add the codon to the site_codon list
site_codon.append(str(row.seq[x:x+3]))
#counts the number of occurences of each codon
c = Counter(site_codon)
#iterates through the Counter and adds the frequency to the csv
for k,v in c.items():
#only add valid codons
if "-" not in k:
file.write(str(int((x+3)/3)) + "," + k + "," + str(v/len(site_codon)) + '\n')
print("Generated " +prefix+ ".site_codon.csv")
#Compute biased codon usage proportions across amino acids, sites, and sequences
with open("./mol_stats/"+prefix+".codon_usage.csv", 'w') as file:
f = SeqIO.parse(align, 'fasta')
#iterates through every sequence
for row in f:
#gets every codon from the sequence
codons = [str(row.seq[i:i+3]) for i in range(0, len(row.seq), 3)]
keys = []
#iterates through every codon
for c in codons:
#translates the codons to amino acids
keys += [key for key, value in codon_table.items() if c in value]
#creates a list of amino acids without duplicates as keys
dkeys = list(set(keys))
d = {}
#iterates through every amino acid
for aa in dkeys:
freq = {}
#iterates through codon table
for codon in codon_table[aa]:
#gets the frequency for each codon
freq[codon] = codons.count(codon)/keys.count(aa)
d[aa] = freq
#itreates through dictionary
for k,v in d.items():
file.write(row.id+"," + k+","+str(v)+'\n')
print("Generated " +prefix+ ".codon_usage.csv")