In [None]:
#upload new jupyter and this notebook

In [5]:
import csv
import os
import glob
from natsort import natsorted
import gc

In [123]:
def translate(sequence): #TESTED
    """
    Translates a nucleotide sequence into an amino acid sequence.

    Arguments:
    sequence -- a string representing the nucleotide sequence

    Returns:
    A string representing the corresponding amino acid sequence
    """
    codon_table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }
    protein = ''
    if len(sequence) % 3 != 0:
        print("Warning: Sequence length is not a multiple of 3.")
    for i in range(0, len(sequence), 3):
        codon = sequence[i:i+3]
        if codon in codon_table:
            protein += codon_table[codon]
        else:
            protein += 'X'  # unknown amino acid
    return protein

In [126]:
#Codon frequencies based on Human table from GenScript https://www.genscript.com/tools/codon-frequency-table

#These have been double checked for accuracy 
most_frequent_codon = {"F": "TTC", "L": "CTG", "Y": "TAC", "_":"TGA", "H":"CAC", "Q":"CAG", "I":"ATC", "M":"ATG", 
                      "N":"AAC", "K":"AAG", "V":"GTG", "D":"GAC", "E":"GAG", "S":"AGC", "C":"TGC", "W":"TGG",
                       "P":"CCC", "R":"CGG", "T": "ACC", "A":"GCC", "G":"GGC"}

#None means there is only one codon
second_most_frequent_codon = {"F": "TTT", "L": "CTC", "Y": "TAT", "_":"TAA", "H":"CAT", "Q":"CAA", "I":"ATT", "M":None, 
                      "N":"AAT", "K":"AAA", "V":"GTC", "D":"GAT", "E":"GAA", "S":"TCC", "C":"TGT", "W":None,
                       "P":"CCT", "R":"AGA", "T": "ACA", "A":"GCT", "G":"GGG"}

In [179]:
def load_csv_to_list_of_dict(filename):
    """Loading the file that has been used as an input for PRIDICT as a list of dictionaries"""
    result = []
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            row['id'] = i+1
            result.append(row)
    return result

def append_csv_data(data, folder_path):
    """Function to append PRIDICT analysis to each designed pegRNA (we're takig only the first row)"""
    for d in data:
        id_value = d['id']
        filename = folder_path + '/' + str(id_value) + '_pegRNA_Pridict_full.csv'
        try:
            with open(filename, 'r') as f:
                reader = csv.DictReader(f)
                new_data = next(reader)
                d.update(new_data)
        except:
            print("Failed to open " + str(id_value))
            d.update({'PRIDICT_editing_Score_deep': -1.0}) #we put the score -1 which means this entry should be neglected (otherwise there is an error when searching for highest score)
    return data

def group_dict_by_keys(list_of_dicts, keep_only_frequent_codons = 0):
    """
    Takes a list of dictionaries and groups them by the first three keys.
    Returns a new list of dictionaries with the grouped dictionaries nested
    within their respective parent dictionaries.
    
    if keep_only_frequent_codons is 1, we keep only the otpimal codon for a mutation
    (the rest is not present in the output). For designing a synonymous change,
    in case the original codon is already optimal, we take the second most optimal.
    In case of W and M, we don't output anything (there are no possible mutations)
    """
    grouped_dict_list = []
    i = 0
    for d in list_of_dicts:
        values = list(d.values())
        key = (values[1], values[0], values[3]) #original aa, pos, mutated aa
        matching_dict = None
        
        for group in grouped_dict_list:
            group_key = (group['original_aa'], group['pos'], group['mutated_aa'])
            if group_key == key:
                matching_dict = group
                break
                
        if matching_dict is None:
            matching_dict = dict(zip(('original_aa', 'pos', 'mutated_aa'), key))
            grouped_dict_list.append(matching_dict)
            
        if(keep_only_frequent_codons == 0):
            matching_dict.setdefault('items', []).append(d)
        
        if(keep_only_frequent_codons == 1):
            if(key[0] != key[2]): #Non-synonynous change
                if(d['Mutated_nts'] == most_frequent_codon[d['Mutated_aa']]): #if the codon we're looking at is optimal
                    matching_dict.setdefault('items', []).append(d)         
                    
            elif(key[0] == key[2]): #Synonymous change
                if(d['Original_nts'] != most_frequent_codon[d['Original_aa']]): #if existing codon is not optimal
                    if(d['Mutated_nts'] == most_frequent_codon[d['Mutated_aa']]): #if we're looking at the optimal codon
                        matching_dict.setdefault('items', []).append(d)  
                        
                if(d['Original_nts'] == most_frequent_codon[d['Original_aa']]): #if existing codon is already optimal
                    if(d['Mutated_nts'] == second_most_frequent_codon[d['Mutated_aa']]): #if we're looking at the second best codon
                        matching_dict.setdefault('items', []).append(d)
        if(i%1000 == 0):
            print(i)
        i += 1
    return grouped_dict_list


def find_highest_score(scores):
    """Finds the dictionary with the highest 'PRIDICT_editing_Score_deep' value' among all dictionaries with the same (original_aa, position, mutated_aa)"""
    highest_score = None
    
    for score in scores:
        if highest_score is None or float(score['PRIDICT_editing_Score_deep']) > float(highest_score['PRIDICT_editing_Score_deep']):
            highest_score = score
            
    return highest_score

def print_highest_score_values(data, output_file):
    # Extract the header keys from the first nested dictionary
    items_keys = list(data[0]['items'][0].keys())

    # Open the output file and create a CSV writer
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)

        # Write the header row
        writer.writerow(items_keys)

        # Loop over the dictionaries in the list
        for i,d in enumerate(data):
            print(i)
            # Find the dictionary with the highest 'score' value
            max_score_dict = find_highest_score(d['items'])

            # Write its values to the CSV file
            writer.writerow(max_score_dict.values())


In [176]:
data = load_csv_to_list_of_dict('For_Minja_all.csv')
data = append_csv_data(data, 'predictions')

Failed to open 10335
Failed to open 10357
Failed to open 10379
Failed to open 13688
Failed to open 14847
Failed to open 14872
Failed to open 17543
Failed to open 17559
Failed to open 17574
Failed to open 26797
Failed to open 33447
Failed to open 33448
Failed to open 33450
Failed to open 33451
Failed to open 33470
Failed to open 33471
Failed to open 33478
Failed to open 33479
Failed to open 33486
Failed to open 33487
Failed to open 33499
Failed to open 33500
Failed to open 33502
Failed to open 33503
Failed to open 33527
Failed to open 33528
Failed to open 33530
Failed to open 33531
Failed to open 33854
Failed to open 34060
Failed to open 34061
Failed to open 34121
Failed to open 34188
Failed to open 34189
Failed to open 37558
Failed to open 37583
Failed to open 38158
Failed to open 38439
Failed to open 38505
Failed to open 38609
Failed to open 38617
Failed to open 38625
Failed to open 38651
Failed to open 38942
Failed to open 46892
Failed to open 46948
Failed to open 47781
Failed to ope

In [177]:
grouped_dict_list = group_dict_by_keys(data, keep_only_frequent_codons = 1)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000


In [180]:
print_highest_score_values(grouped_dict_list, 'Final_output_for_Minja.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27