In [None]:
import sys
import os
sys.path.append(os.getcwd() + '/../python_scripts') # this lets us import files in python_scripts (like gtools)
import gtools
# Switch to the scratch directory where all the data files are, unless the
# notebook was already started somewhere under /scratch. The scratch
# sub-directory is named after the parent of the starting working directory.
launch_dir = os.getcwd()
if not launch_dir.startswith('/scratch'):
    os.chdir(f'/scratch/cam02551/{launch_dir.split("/")[-2]}')

import pandas as pd
from tqdm import tqdm

# Convert reported mutations to my tsv format

In [None]:
# Load data/ref/ref.fa through the project helper gtools.load_genome
# (presumably the reference sequence — confirm against gtools).
# NOTE(review): `genome` is not referenced by any cell visible in this part of
# the notebook; confirm it is still needed before keeping this load.
genome = gtools.load_genome('data/ref/ref.fa')

### 1001 genomes
Data are from here: <https://1001genomes.org/data/GMI-MPI/releases/v3.1/1001genomes_snp-short-indel_only_ACGTN.vcf.gz>. They need to be converted from VCF format to my tsv format, which includes splitting VCF records that have more than one ALT, and splitting ALTs that simultaneously represent SNVs and indels.

In [None]:
# Flatten the 1001 Genomes VCF into a chrom/pos/ref/alt tsv with one row per
# simple event:
#   * every ALT of a multi-allelic record is handled separately;
#   * REF and ALT are compared base by base over their shared length, and each
#     mismatch is written as its own SNV row;
#   * whatever is left over at the end of the longer allele is written as a
#     deletion (alt = '*') or an insertion (ref = '*').
# Output positions are the VCF POS (1-based) shifted by -1.
#
# Both files are opened in a `with` block so they are closed even if a
# malformed line raises part-way through. The input is opened first so a
# missing VCF does not truncate an existing output file.
with open('data/1001/1001genomes_snp-short-indel_only_ACGTN.vcf', 'r') as f, \
        open('data/1001/1001_muts.tsv', 'w') as fout:
    fout.write('chrom\tpos\tref\talt\n')
    for l in tqdm(f, total=12883863):  # hard-coded total only sizes the progress bar
        if l[0] == '#':  # skip VCF meta-information and header lines
            continue

        l = l.strip('\n').split('\t')
        chrom = f'Chr{l[0]}'
        pos = int(l[1])
        ref = l[3]

        for alt in l[4].split(','):
            # walk the positions shared by ref and alt; a mismatch is an SNV
            for i in range(min(len(ref), len(alt))):
                if ref[i] != alt[i]:
                    fout.write(f'{chrom}\t{pos - 1 + i}\t{ref[i]}\t{alt[i]}\n')

            if len(ref) > len(alt):
                # deletion: the extra bases on the end of ref are what's deleted
                fout.write(f'{chrom}\t{pos - 1 + len(alt)}\t{ref[len(alt):]}\t*\n')
            elif len(ref) < len(alt):
                # insertion: the extra bases on the end of alt are what's inserted
                fout.write(f'{chrom}\t{pos - 1 + len(ref)}\t*\t{alt[len(ref):]}\n')

### 1001 genomes Ler-0 only

In [None]:
# Find the VCF column-header line (starts with a single '#', unlike the '##'
# meta lines) and keep its whitespace-separated column names, minus the
# leading '#', in `header`. Reading stops at the first such line.
with open('data/1001/1001genomes_snp-short-indel_only_ACGTN.vcf', 'r') as vcf:
    for line in vcf:
        if line[0] != '#' or line[1] == '#':
            continue  # data line or '##' meta line
        header = line[1:].split()
        break


In [None]:
# Collect the events carried by sample column '7213' (Ler-0, per the original
# note in this cell) as (chrom, pos, ref, alt) tuples, using the same
# SNV / deletion / insertion decomposition as the full 1001 Genomes conversion.
# Only records where the sample is called, non-reference and has identical
# alleles (e.g. 1/1 or 1|1) are kept.
#
# Changes from the earlier version of this cell:
#   * the VCF is opened in a `with` block (the handle was never closed);
#   * the sample's column index is looked up once instead of once per line;
#   * the genotype is split on its separator instead of being read by
#     character position, so allele indices >= 10 are no longer misread
#     (e.g. '1/10' used to pass as homozygous for allele 1).
ler_col = header.index('7213')
ler_muts = []
with open('data/1001/1001genomes_snp-short-indel_only_ACGTN.vcf', 'r') as f:
    for l in tqdm(f, total=12883863):  # hard-coded total only sizes the progress bar
        if l[0] == '#':  # skip VCF meta-information and header lines
            continue

        l = l.strip('\n').split('\t')
        ler_gt = l[ler_col].split(':')[0]  # genotype is the first ':'-separated subfield
        alleles = ler_gt.replace('|', '/').split('/')  # accept phased and unphased calls
        if len(set(alleles)) != 1 or alleles[0] in ('.', '0'):
            continue  # missing, reference, or heterozygous call

        chrom = 'Chr' + l[0]
        pos = int(l[1])
        ref = l[3]
        alt = l[4].split(',')[int(alleles[0]) - 1]  # allele index N refers to the N-th ALT

        # walk the positions shared by ref and alt; a mismatch is an SNV
        for i in range(min(len(ref), len(alt))):
            if ref[i] != alt[i]:
                ler_muts.append((chrom, pos - 1 + i, ref[i], alt[i]))

        if len(ref) > len(alt):
            # deletion: the extra bases on the end of ref are what's deleted
            ler_muts.append((chrom, pos - 1 + len(alt), ref[len(alt):], '*'))
        elif len(ref) < len(alt):
            # insertion: the extra bases on the end of alt are what's inserted
            ler_muts.append((chrom, pos - 1 + len(ref), '*', alt[len(ref):]))

In [None]:
# Turn the collected (chrom, pos, ref, alt) tuples into a table, one row per
# event, and show it as the cell output.
ler_columns = ['chrom', 'pos', 'ref', 'alt']
df_ler = pd.DataFrame(ler_muts, columns=ler_columns)
df_ler

In [None]:
# Save the Ler-0 table as a tab-separated file without the DataFrame index.
df_ler.to_csv('data/1001/1001_Ler-0_only.tsv', sep='\t', index=False)

### Weng 2018

These are the lines used in Ossowski 2010 and other papers from Detlef's lab. Shaw 2000 seems to be the original source. They started with 120 lines, but only 117 made it to G17 and only 107 are sequenced in Weng 2018.

Mapped to TAIR10. Confirmed 1-based.

In [None]:
# Reformat the Weng 2018 calls into chrom / pos / ref / alt / source columns.
#   chrom  : 'Chr' + first character of the CHROM value
#   pos    : POS - 1, plus 1 for indels (REF or ALT longer than one base) so
#            the position skips the leading anchor base
#   ref    : '*' when ALT is longer than one base (insertion); otherwise REF
#            with its leading anchor base removed if REF is longer than one base
#   alt    : '*' when REF is longer than one base (deletion); otherwise ALT
#            with its leading anchor base removed if ALT is longer than one base
#   source : 'weng_<MA_Line as int>'
# Rows that are entirely empty in the csv are dropped first.
# NOTE(review): a record with both REF and ALT longer than one base comes out
# as ref='*', alt='*' — confirm no such records exist in the input.
weng_raw = pd.read_csv('data/variant/ma_lines/weng_raw_unique_mutations.csv').dropna(how='all')
ref_alt_pairs = list(zip(weng_raw.REF, weng_raw.ALT))

df_weng = pd.DataFrame(
    {
        'chrom': [f'Chr{str(c)[0]}' for c in weng_raw.CHROM],
        'pos': [
            int(p) - 1 + (1 if len(r) > 1 or len(a) > 1 else 0)
            for p, (r, a) in zip(weng_raw.POS, ref_alt_pairs)
        ],
        'ref': ['*' if len(a) > 1 else (r[1:] if len(r) > 1 else r) for r, a in ref_alt_pairs],
        'alt': ['*' if len(r) > 1 else (a[1:] if len(a) > 1 else a) for r, a in ref_alt_pairs],
        'source': [f'weng_{int(x)}' for x in weng_raw.MA_Line],
    },
    index=weng_raw.index,  # keep the row labels of the filtered input
)

In [None]:
# Show the reformatted Weng table as the cell output.
df_weng

In [None]:
srrs = 'SRR6750188	SRR6750189	SRR6750190	SRR6750191	SRR6750184	SRR6750185	SRR6750186	SRR6750187	SRR6750192	SRR6750193	SRR6750158	SRR6750159	SRR6750160	SRR6750161	SRR6750162	SRR6750163	SRR6750164	SRR6750165	SRR6750166	SRR6750167	SRR6750202	SRR6750201	SRR6750204	SRR6750203	SRR6750198	SRR6750197	SRR6750200	SRR6750199	SRR6750196	SRR6750195	SRR6750106	SRR6750107	SRR6750104	SRR6750105	SRR6750102	SRR6750103	SRR6750100	SRR6750101	SRR6750098	SRR6750099	SRR6750113	SRR6750112	SRR6750111	SRR6750110	SRR6750117	SRR6750116	SRR6750115	SRR6750114	SRR6750109	SRR6750108	SRR6750122	SRR6750123	SRR6750124	SRR6750125	SRR6750118	SRR6750119	SRR6750120	SRR6750121	SRR6750126	SRR6750127	SRR6750129	SRR6750128	SRR6750131	SRR6750130	SRR6750133	SRR6750132	SRR6750135	SRR6750134	SRR6750137	SRR6750136	SRR6750150	SRR6750151	SRR6750148	SRR6750149	SRR6750154	SRR6750155	SRR6750152	SRR6750153	SRR6750156	SRR6750157	SRR6750147	SRR6750146	SRR6750145	SRR6750144	SRR6750143	SRR6750142	SRR6750141	SRR6750140	SRR6750139	SRR6750138	SRR6750176	SRR6750177	SRR6750178	SRR6750179	SRR6750180	SRR6750181	SRR6750182	SRR6750183	SRR6750168	SRR6750194	SRR6750175	SRR6750174	SRR6750173	SRR6750172	SRR6750171	SRR6750170	SRR6750169'.split()
lines = '1	2	3	4	5	6	7	8	9	11	13	14	15	16	17	18	19	20	22	23	24	25	26	27	28	29	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49	50	51	52	53	54	55	56	57	58	59	60	61	62	63	64	65	66	67	68	69	71	72	73	74	75	76	77	78	79	80	81	82	83	84	85	86	88	89	91	92	94	96	98	99	100	101	102	103	105	106	108	109	110	111	112	113	114	115	116	117	118	119'.split()

# Print one tab-separated row per sequencing run: the SRR accession, the
# 'weng-ma' label, the MA line number at the same list position, and a
# trailing 0.
for idx, srr in enumerate(srrs):
    print(f'{srr}\tweng-ma\t{lines[idx]}\t0')

In [None]:
# Save the reformatted Weng table as a tab-separated file without the
# DataFrame index.
df_weng.to_csv(
    'data/variant/ma_lines/weng_reformatted_muts.tsv',
    index=False,
    sep='\t',
)

In [None]:
# Show the Weng table again as the cell output.
df_weng

In [None]:
# Write one tsv per MA line: every distinct `source` value gets its own file
# in data/variant/ma_lines/weng_split/ (created if it does not exist yet).
os.makedirs('data/variant/ma_lines/weng_split/', exist_ok=True)
for line in df_weng.source.unique():
    rows_for_line = df_weng.loc[df_weng.source == line]
    rows_for_line.to_csv(f'data/variant/ma_lines/weng_split/{line}_muts.tsv', sep='\t', index=False)

### Lu 2021

34034794

Mapped to TAIR10. Confirmed 1-based. Not used in the final analysis.

line | A | B | C | D | E | F
-- | -- | -- | -- | -- | -- | -- 
Description | 23C population | 32C population | 28C population | 23C individuals | 32C individuals | 28C individuals

In [None]:
def mut_to_ref_alt(m):
    """Convert one entry of the Lu 2021 `Mutation` column into a (ref, alt) pair.

    The kind of mutation is decided from the first character of `m` only:

    * 'd' -> deletion: returns (second whitespace-separated token, '*')
    * 'i' -> insertion: returns ('*', second token), or ('*', 'N') when the
      entry has no second token (probably a typo in their table)
    * anything else -> substitution: returns (first character, last character)

    NOTE(review): the exact spelling of the entries (e.g. 'del ACG', 'A>T') is
    assumed from how they are parsed here — confirm against the raw table.

    Raises:
        ValueError: if `m` is empty, or is a deletion without a sequence token.
            This used to print `m` and return None, which made callers that
            index the result fail later with an unrelated TypeError.
    """
    try:
        kind = m[0]
        if kind == 'd':
            return (m.split()[1], '*')
        if kind == 'i':
            tokens = m.split()
            if len(tokens) > 1:
                return ('*', tokens[1])
            # not sure why this is the case, probably a typo in their table
            return ('*', 'N')
        return (kind, m[-1])
    except IndexError as err:
        raise ValueError(f'cannot parse mutation {m!r}') from err

# Reformat the Lu 2021 calls into chrom / pos / ref / alt / source columns.
# Only samples whose name starts with 'D' are kept (the '23C individuals'
# group in the table above).
#   chrom, pos : the two '_'-separated parts of `Position`, with pos shifted by -1
#   ref, alt   : parsed from `Mutation` by mut_to_ref_alt
#   source     : 'lu_<Sample>'
#
# The filter is applied first and the result copied, so the new columns are
# added to an independent frame rather than to a slice of the raw table (which
# triggers pandas' chained-assignment warning). Each mutation is parsed once,
# and only for the rows that are kept.
lu_raw = pd.read_table('data/variant/ma_lines/lu_raw_mutations.tsv')
df_lu = lu_raw[lu_raw.Sample.str.startswith('D', na=False)].copy()

position_parts = [position.split('_') for position in df_lu.Position]
ref_alt_pairs = [mut_to_ref_alt(mutation) for mutation in df_lu.Mutation]

df_lu['chrom'] = [parts[0] for parts in position_parts]
df_lu['pos'] = [int(parts[1]) - 1 for parts in position_parts]
df_lu['ref'] = [ref for ref, _ in ref_alt_pairs]
df_lu['alt'] = [alt for _, alt in ref_alt_pairs]
df_lu['source'] = [f'lu_{sample}' for sample in df_lu.Sample]
df_lu = df_lu['chrom pos ref alt source'.split()]
df_lu

In [None]:
# raw reads
srrs = '''
SRR9325733
SRR9325732
SRR9325731
SRR9325730
SRR9325737
SRR9325736
SRR9325735
SRR9325734
SRR9325739
SRR9325738
SRR9325715
SRR9325745
SRR9325746
SRR9325747
SRR9325740
SRR9325741
SRR9325714
SRR9325717
SRR9325716
SRR9325719
SRR9325718
SRR9325742
SRR9325743
SRR9325748
SRR9325749
SRR9325725
SRR9325721
SRR9325720
SRR9325723
SRR9325722
SRR9325744
SRR9325724
SRR9325729
SRR9325728
SRR9325727
SRR9325726
'''.split()
names = '''
G0-1
G0-2-1
G0-2-2
G0-2-3
G0-2-4
G0-2-5
A16L1
A16L2
A16L3
A16L4
A16L5
D10L1
D10L2
D10L3
D10L4
D10L5
B22L1
B22L2
B22L3
B22L4
B22L5
E10L1
E10L2
E10L3
E10L4
E10L5
C19L1
C19L2
C19L3
C19L4
C19L5
F10L1
F10L2
F10L3
F10L4
F10L5
'''.split()

# Print one tab-separated row per sequencing run: the SRR accession, the
# 'lu-ma' label, the sample name at the same list position, and a trailing 0.
for idx, srr in enumerate(srrs):
    print(f'{srr}\tlu-ma\t{names[idx]}\t0')