-
Notifications
You must be signed in to change notification settings - Fork 0
/
proteinmultialign.rb
148 lines (145 loc) · 6.04 KB
/
proteinmultialign.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/ruby
=begin
Author: Gaurav Sablok
Universitat Potsdam
Date: 2024-4-5
a ruby class for analyzing the single and the multigff alignments
coming from the genome annotations. It has the support for both the
gff and the paf alignments. Also has the support for building the graphs
for the graphql.
=end
gem install bio
gem install parse_fasta
require 'bio'
require 'parse_fasta'
class SINGLEMultiGFF
def extractpafSingle(fastafile, gffalignment)
fasta = File.read(fastafile).readlines
sequenceids = []
sequencechar = []
ParseFasta::SeqFile.open(fasta).each_record do | iter |
sequenceids.push(iter.header.to_s)
sequencechar.push(iter.seq.to_s)
end
gffread = File.open(gffalignment).readlines
gffreadmain = gffread[2..gffread.length]
mRNAidlocals = []
mRNAid_start_positions = []
mRNAid_stop_position = []
mRNAgffids = []
for i in 0..gffreadmain.length
mRNAidlocals.push(gffreadmain[i].to_s.strip.split[2]) \
if gffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..gffreadmain.length
mRNAid_start_position.push(gffreadmain[i].to_s.strip.split[3]) \
if gffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..gffreadmain.length
mRNAid_stop_position.push(gffreadmain[i].to_s.strip.split[4]) \
if gffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..gffreadmain.length
mRNAgffids.push(gffreadmain[i].to_s.strip.split[0]) \
if gffread[i].to_s.strip.split[2] == "mRNA"
end
mRNAsequenceiter = {}
mRNAsequencesplice = {}
for i in 0..mRNAid_start_positions.length
mRNAsequenceiter[sequenceids[i]] = [mRNAid_start_positions[i], mRNAid_stop_positions]
end
for i in 0..mRNAid_start_positions.length
mRNAsequencesplice[[sequenceids[i],mRNAidslocal[i]] = sequencechar[i].to_s.slice(mRNAid_start_positons[i], mRNAid_end_positions[i]) \
if sequenceids[i] == mRNAgffids[i]
end
end
def extractpafMulti(fastafile, multigff, outputfasta)
fasta = File.read(fastafile).readlines
sequenceids = []
sequencechar = []
ParseFasta::SeqFile.open(fasta).each_record do | iter |
sequenceids.push(iter.header.to_s)
sequencechar.push(iter.seq.to_s)
end
multigffread = File.open(multigff, "r").readlines
multigffmRNA = []
multigffmRNAstart = []
multigffmRNAend = []
multigffseqids = []
for i in 0..multigffread.length
multigffmRNA.push(multigffread[i].to_s.strip.split[2]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffmRNAstart.push(multigffread[i].to_s.strip.split[3]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffmRNAend.push(multigffread[i].to_s.strip.split[4]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffseqids.push(multigffread[i].to_s.strip.split[0]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
multifastaiterinfo = {}
multifastaseqsplice = {}
for i in 0..multigffseqids.length
multifastaiterinfo[multigffseqids[i]] = [multigffmRNAstart[i], multigffmRNAend[i]]
end
for i in 0..multigffmRNAstart.length
multifastaseqsplice[sequenceids[i]] = sequencechar[i].slice(multigffmRNAstart[i].to_i, multigffmRNAend[i].to_i) \
if sequenceids[i].to_s == multigffseqids[i].to_s
end
end
def extractUpstreamDownstream(fastafile, multigff, upstream, downstream)
fasta = File.read(fastafile).readlines
alignment = File.read(gffalignment).readlines
sequenceids = []
sequencechar = []
ParseFasta::SeqFile.open(fasta).each_record do | iter |
sequenceids.push(iter.header.to_s)
sequencechar.push(iter.seq.to_s)
end
multigffread = File.open(multigff, "r").readlines
multigffmRNA = []
multigffmRNAstart = []
multigffmRNAend = []
multigffseqids = []
for i in 0..multigffread.length
multigffmRNA.push(multigffread[i].to_s.strip.split[2]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffmRNAstart.push(multigffread[i].to_s.strip.split[3]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffmRNAend.push(multigffread[i].to_s.strip.split[4]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
for i in 0..multigffread.length
multigffseqids.push(multigffread[i].to_s.strip.split[0]) \
if multigffread[i].to_s.strip.split[2] == "mRNA"
end
multifastaiterinfo = {}
multifastaseqsplice = {}
multifastaupstream = {}
multifastadownstream = {}
for i in 0..multigffseqids.length
multifastaiterinfo[multigffseqids[i]] = [multigffmRNAstart[i], multigffmRNAend[i]]
end
for i in 0..multigffmRNAstart.length
multifastaseqsplice[sequenceids[i]] = sequencechar[i].slice(multigffmRNAstart[i].to_i, multigffmRNAend[i].to_i) \
if sequenceids[i].to_s == multigffseqids[i].to_s
end
for i in 0..multigffmRNAstart.length
multifastaupstream[sequenceids[i]] = sequencechar[i].slice(multigffmRNAstart[i].to_i+upstream.to_i, multigffmRNAend[i].to_i) \
if sequenceids[i].to_s == multigffseqids[i].to_s
end
for i in 0..multigffmRNAstart.length
multifastadownstream[sequenceids[i]] = sequencechar[i].slice(multiffmRNAstart[i].to_i, multigffmRNAend[i].to_i+downstream.to_i) \
if sequenceids[i].to_s == multigffseqids[i].to_s
end
end
end