<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Keys" data-toc-modified-id="Keys-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Keys</a></span></li><li><span><a href="#Mutation-Liftover" data-toc-modified-id="Mutation-Liftover-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Mutation Liftover</a></span></li><li><span><a href="#Reference-Genome-Conversion" data-toc-modified-id="Reference-Genome-Conversion-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Reference Genome Conversion</a></span></li></ul></div>

## Keys

In [1]:
import subprocess

import gffpandas.gffpandas as gffpd
import pybedtools
import pandas as pd
import numpy as np

## Mutation Liftover

Aim: To convert the location of the mutations in v5.6 to that of the new reference genome (v6)
1. Extracting the locations of the mutations in each of the 28 MA lines and storing them in a dataframe.
2. Converting the dataframe with mutation locations into a bed file.
3. Using the `halLiftover` tool to lift the mutation coordinates over from v5.6 to v6.

In [None]:
#### Extracting the locations of the mutations in each of the 28 MA lines ####
# Load the mutation table and keep only the three columns needed for BED records.
mutation_location = pd.read_csv(
    '/research/projects/chlamydomonas/MAexpression/data/genome_info/Mutation_Location.txt',
    delimiter = '\t',
)[['chromosome', 'position', 'mutant_sample']]

# Row labels of the mutations whose sample name contains each ancestor tag.
# regex=False makes this a plain substring test; na=False treats a missing
# sample name as "no match" instead of raising.
sample_names = mutation_location['mutant_sample']
CC2931_loc_index = mutation_location.index[
    sample_names.str.contains('CC2931_', regex = False, na = False)
].tolist()
CC2344_loc_index = mutation_location.index[
    sample_names.str.contains('CC2344_', regex = False, na = False)
].tolist()

# CC2931 rows first, then CC2344 rows. The lists hold index *labels*, so select
# with .loc (label-based) rather than .iloc, and take an explicit copy so the
# column edits below act on an independent frame (no chained assignment).
specific_mut_loc = mutation_location.loc[CC2931_loc_index + CC2344_loc_index].copy()

# Build the BED interval: start = position, end = position + 1.
# NOTE(review): BED intervals are 0-based, half-open. This keeps the original
# convention of using 'position' directly as the start; if 'position' is
# 1-based the interval should be (position - 1, position) -- confirm.
specific_mut_loc = specific_mut_loc.rename(columns = {'position': 'start'})
specific_mut_loc['end'] = specific_mut_loc['start'] + 1
specific_mut_loc = specific_mut_loc[['chromosome', 'start', 'end', 'mutant_sample']]

#### Converting the dataframe to a bed file ####
new_mutation_location = pybedtools.BedTool.from_dataframe(specific_mut_loc)
new_mutation_location.saveas('/research/projects/chlamydomonas/MAexpression/data/genome_info/v5_coordinates.bed')

#### Using HalLiftover to liftover v5 mutation locations to that of v6 ####
# General form: halLiftover <hal file> CC503v5 <source_bedfile> CC4532v6 <target_bedfile>
# Run the external tool through subprocess so the cell is valid Python;
# check=True raises CalledProcessError if halLiftover exits with a non-zero status.
# NOTE(review): the BED file saved earlier in this cell is named
# 'v5_coordinates.bed' (underscore, under genome_info/), whereas this command
# reads 'v5.coordinates.bed' from the working directory -- confirm the intended path.
subprocess.run(
    ['halLiftover', 'Cr_3way.hal', 'CC503v5', 'v5.coordinates.bed', 'CC4532v6', 'v6.coordinates.bed'],
    check = True,
)

## Reference Genome Conversion

Reformat the gene ID conversion files into a single table.
The resulting table maps v5.3 gene IDs to their v6 equivalents (via the JGI v5.5 IDs).

In [89]:
# Transcript-name conversion table between annotation releases.
# The first line of the CSV is skipped and the columns are relabelled by release.
conversion = pd.read_csv(
    '/research/projects/chlamydomonas/MAexpression/data/genome_info/v5_to_v6_liftover/ChlamydomonasTranscriptNameConversionBetweenReleases.Mch12b.csv',
    delimiter = ',',
    skiprows = 1,
)
conversion.columns = ['5.5', '3', '1', 'Genbank', '4', '4.3', 'u5', 'u9', '5.3.1']

# NOTE(review): the duplicate filter runs on the index as read from the file,
# before '5.3.1' becomes the index. If read_csv produced its default row
# numbering this removes nothing -- confirm whether de-duplicating on '5.3.1'
# was intended.
is_first_occurrence = ~conversion.index.duplicated(keep = 'first')
conversion = (
    conversion.loc[is_first_occurrence]
    # Keep only the '5.5' and '5.3.1' release columns.
    .drop(columns = ['3', '1', 'Genbank', '4', '4.3', 'u5', 'u9'])
    # Strip every space character from the string values.
    .replace(' ', '', regex = True)
    .set_index('5.3.1')
)

# Divergence table, indexed by transcript_ID as read from the file.
divergence = pd.read_csv(
    '/research/projects/chlamydomonas/MAexpression/data/genome_info/divergence_info/divergence.out.txt',
    delimiter = '\t',
    index_col = 'transcript_ID',
)

# Two-column lookup relabelled as transcript_ID -> geneid.
# NOTE(review): read with the default header handling, so the file's first
# line is consumed as a header before the columns are renamed -- confirm the
# file really has a header row.
PACid2geneID = pd.read_csv(
    '/research/references/chlamydomonas/5.3_chlamy_w_organelles_mt_minus/annotation/PACid2geneID.txt',
    delimiter = '\t',
)
PACid2geneID.columns = ['transcript_ID', 'geneid']
PACid2geneID = PACid2geneID.set_index('transcript_ID')

# Attach the gene ID to each transcript, re-key the table on that gene ID
# (labelled 'v5.3.1'), then pull in the '5.5' column from `conversion`.
divergence = (
    divergence
    .join(PACid2geneID)
    .rename(columns = {'geneid': 'v5.3.1'})
    .set_index('v5.3.1')
    .join(conversion)
)

# Liftover table read without a header; its two columns are labelled 'v6' and
# 'v5'. It is indexed by 'v5', keeping only the first row for each 'v5' value.
geneID = pd.read_csv('/research/projects/chlamydomonas/MAexpression/data/genome_info/v5_to_v6_liftover/preliminary_v6_Cre_liftover.tsv', delimiter = '\t', header = None)
geneID.columns = ['v6', 'v5']
geneID.set_index('v5', inplace = True)
geneID = geneID[~geneID.index.duplicated(keep = 'first')]

# Move the current index (the v5.3.1 gene IDs) into a column. Naming the axis
# explicitly guarantees the column is called 'v5.3.1' no matter whether the
# preceding join kept or dropped the index name.
divergence = divergence.rename_axis('v5.3.1').reset_index()

# Derive the gene-level ID by cutting the '5.5' value at its '.t<N>' suffix.
# The dot is escaped so only a literal '.' before 't<digit>' starts the cut.
divergence['v5_5_gene'] = divergence['5.5'].replace(r'\.t[1-9].*', '', regex = True)
divergence.set_index('v5_5_gene', inplace = True)

# Keep the first row per v5_5_gene value so both frames have unique indexes
# for the column-wise concat below.
# NOTE(review): rows with a missing '5.5' value all share a NaN index here, so
# only the first of them survives -- confirm that is acceptable.
divergence = divergence[~divergence.index.duplicated(keep = 'first')]

# Align on the index and place the 'v6' column alongside the divergence
# columns; labels present in only one frame are kept with NaN in the other's columns.
divergence = pd.concat([divergence, geneID], axis = 1)

# Restore the gene-level ID as a column, again naming the axis explicitly
# because the two concatenated indexes carry different names.
divergence = divergence.rename_axis('v5_5_gene').reset_index()
divergence.rename(columns = {'5.5':'JGI_v5.5_ID', 'v6':'JGI_v6'}, inplace = True)
divergence.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/divergence/files/divergence_file_v5.3_and_v6.csv', sep = '\t', header = True, index = True)

# ID-only conversion table, written without the row index.
chlamy_v5_conversion = divergence[['v5_5_gene', 'v5.3.1', 'JGI_v5.5_ID', 'JGI_v6']]
chlamy_v5_conversion.to_csv('/research/projects/chlamydomonas/MAexpression/data/genome_info/v5.3_to_v5.6_to_v6.csv', sep = '\t', header = True, index = False)