In [None]:
import os
import sys

# Make the sibling python_scripts directory importable (it holds helper modules such as gtools).
start_dir = os.getcwd()
sys.path.append(f'{start_dir}/../python_scripts')
import gtools

# The data files live under /scratch. Unless the notebook is already running from there,
# switch to the scratch directory named after the parent folder of the starting directory.
if not start_dir.startswith('/scratch'):
    project_name = start_dir.split("/")[-2]
    os.chdir(f'/scratch/cam02551/{project_name}')

import pandas as pd
from tqdm import tqdm

# Convert reported mutations to my tsv format

In [None]:
# Load the reference genome through the project helper gtools.load_genome.
# The FASTA at data/ref/ref.fa is produced by the snakemake pipeline, so that has to have run first.
# NOTE(review): `genome` is not referenced by the cells visible in this section — presumably used elsewhere; confirm.
genome = gtools.load_genome('data/ref/ref.fa') # created from snakemake pipeline

### 1001 genomes
Data are from <https://1001genomes.org/data/GMI-MPI/releases/v3.1/1001genomes_snp-short-indel_only_ACGTN.vcf.gz> (the cell below reads the decompressed `.vcf`). The VCF needs to be converted to my tsv format, which involves splitting VCF records that have more than one ALT into separate rows, and splitting ALTs that simultaneously represent SNVs and indels into one row per SNV/indel.

In [None]:
def _split_vcf_allele(pos, ref, alt):
    """Break a single VCF REF/ALT pair into simple tsv records.

    Parameters
    ----------
    pos : int
        POS value as written in the VCF record (1-based per the VCF specification).
    ref : str
        REF allele of the record.
    alt : str
        One ALT allele of the record (multi-ALT records are split by the caller).

    Yields
    ------
    tuple of (int, str, str)
        ``(pos - 1 + offset, ref, alt)`` records:
        * one SNV for every mismatching base in the overlapping prefix of REF and ALT;
        * one deletion (alt ``'*'``) holding the REF bases that extend past ALT;
        * or one insertion (ref ``'*'``) holding the ALT bases that extend past REF.
    """
    shared = min(len(ref), len(alt))
    start = pos - 1  # shift the VCF coordinate down by one for the tsv format

    # Compare the overlapping part base by base; every mismatch is its own SNV.
    for i in range(shared):
        if ref[i] != alt[i]:
            yield start + i, ref[i], alt[i]

    if len(ref) > shared:  # deletion: the extra bases on the end of ref are what's deleted
        yield start + shared, ref[shared:], '*'
    elif len(alt) > shared:  # insertion: the extra bases on the end of alt are what's inserted
        yield start + shared, '*', alt[shared:]


vcf_path = 'data/variant/1001genomes_snp-short-indel_only_ACGTN.vcf'
tsv_path = 'data/variant/1001_muts.tsv'

# The input is opened first so that a missing VCF does not truncate an existing output file.
# The `with` block closes both handles (flushing the output) even if a record fails to parse.
with open(vcf_path, 'r') as f, open(tsv_path, 'w') as fout:
    fout.write('chrom\tpos\tref\talt\n')
    # `total` only sizes the progress bar; presumably the line count of this VCF release — TODO confirm
    for l in tqdm(f, total=12883863):
        l = l.rstrip('\n')
        if not l or l[0] == '#':  # skip blank lines and VCF header/meta lines
            continue

        fields = l.split('\t')
        chrom = f'Chr{fields[0]}'
        # POS and REF are shared by every ALT of the record, so parse them once per line.
        pos = int(fields[1])
        ref = fields[3]
        for alt in fields[4].split(','):
            for p, r, a in _split_vcf_allele(pos, ref, alt):
                fout.write(f'{chrom}\t{p}\t{r}\t{a}\n')

### MA lines

Downloaded from [Weng 2018](https://doi.org/10.25386/genetics.6456065), file `File_S1_Mutation_list.zip`. Extract the zip and use `Unique_mutations.csv`.

These are the MA lines used in Ossowski 2010 and other papers from Detlef's lab. Shaw 2000 seems to be the original source. They started with 120 lines, but only 117 made it to G17, and only 107 are sequenced in Weng 2018.

Positions are mapped to TAIR10 and have been confirmed to be 1-based.

In [None]:
f = 'data/variant/weng_raw_unique_mutations.csv' # raw mutation list ("Unique_mutations.csv" renamed to this)
fout = 'data/variant/weng_reformatted_muts.tsv' # combined output for all lines (not used in final analysis)
split_dir = 'data/variant/weng_split/' # directory that receives one mutation file per line


def _drop_leading_base(allele):
    """Return `allele` without its first character if it is longer than one character, else unchanged."""
    if len(allele) > 1:
        return allele[1:]
    return allele


# Read the raw table and discard rows in which every field is empty.
df_weng = pd.read_csv(f).dropna(how='all')

# Flags for alleles longer than one base; these decide how a record is rewritten below.
ref_is_multi = df_weng.REF.map(len) > 1
alt_is_multi = df_weng.ALT.map(len) > 1

# Chromosome label: "Chr" followed by the first character of the CHROM value.
df_weng['chrom'] = df_weng.CHROM.map(lambda c: 'Chr' + str(c)[0])

# POS minus one, then plus one again whenever either allele is multi-base
# (presumably to step past the shared leading base of an indel — confirm).
df_weng['pos'] = df_weng.POS.map(int) - 1 + (ref_is_multi | alt_is_multi).astype(int)

# A multi-base ALT turns ref into '*'; a multi-base REF turns alt into '*'.
# Whatever allele remains loses its first base if it is longer than one base.
# NOTE(review): a record where REF and ALT are both multi-base ends up as '*' / '*' — confirm none exist.
df_weng['ref'] = df_weng.REF.where(~alt_is_multi, '*').map(_drop_leading_base)
df_weng['alt'] = df_weng.ALT.where(~ref_is_multi, '*').map(_drop_leading_base)

# Tag every mutation with the MA line it was found in.
df_weng['source'] = df_weng.MA_Line.map(lambda ma: 'weng_' + str(int(ma)))

# Keep only the tsv columns, in output order.
df_weng = df_weng.loc[:, ['chrom', 'pos', 'ref', 'alt', 'source']]

In [None]:
# Write the reformatted mutations of all MA lines to one tab-separated file, without the index column.
df_weng.to_csv(fout, sep='\t', index=False)

In [None]:
# Write one tsv per MA line into split_dir, creating the directory if it does not exist yet.
os.makedirs(split_dir, exist_ok=True)
for line, line_muts in df_weng.groupby('source', sort=False):
    # Each group holds exactly the rows whose `source` equals `line`, in their original order.
    line_muts.to_csv(f'{split_dir}{line}_muts.tsv', sep='\t', index=False)