<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Keys" data-toc-modified-id="Keys-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Keys</a></span><ul class="toc-item"><li><span><a href="#Percent-of-mapped-reads-(64--80%)" data-toc-modified-id="Percent-of-mapped-reads-(64--80%)-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Percent of mapped reads (64- 80%)</a></span></li><li><span><a href="#Pooling-DESeq2-DEG-data-output" data-toc-modified-id="Pooling-DESeq2-DEG-data-output-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Pooling DESeq2 DEG data output</a></span></li><li><span><a href="#DESeq2-Filtered-raw-data" data-toc-modified-id="DESeq2-Filtered-raw-data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>DESeq2 Filtered raw data</a></span></li><li><span><a href="#Creating-Annotation-files-with-gene-characteristics-using-GFF-files" data-toc-modified-id="Creating-Annotation-files-with-gene-characteristics-using-GFF-files-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Creating Annotation files with gene characteristics using GFF files</a></span></li><li><span><a href="#Extracting-gene-locations-via-GFF-file" data-toc-modified-id="Extracting-gene-locations-via-GFF-file-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Extracting gene locations via GFF file</a></span><ul class="toc-item"><li><span><a href="#Isolating-the-location-of-each-Mutation" data-toc-modified-id="Isolating-the-location-of-each-Mutation-1.5.1"><span class="toc-item-num">1.5.1&nbsp;&nbsp;</span>Isolating the location of each Mutation</a></span></li></ul></li><li><span><a href="#Extracting-iCre1355-metabolic-genes" data-toc-modified-id="Extracting-iCre1355-metabolic-genes-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Extracting iCre1355 metabolic genes</a></span></li></ul></li></ul></div>

# Keys

In [2]:
import json
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.ticker import MaxNLocator
from matplotlib.gridspec import GridSpec
import seaborn as sns
import scipy
from scipy import stats
from scipy.stats import mannwhitneyu
stats.junk = lambda chisq, df: stats.chi2.sf(chisq, df)
import csv
import gffpandas.gffpandas as gffpd
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import os
%load_ext rpy2.ipython

## Percent of mapped reads (64- 80%)

In [20]:
# genecount = pd.DataFrame()    
# import os,sys,re
# import gzip

# path = "/research/projects/chlamydomonas/MAexpression/raw_data"
# for file in os.listdir(path):
#     if re.fullmatch('.*\.fastq.gz', file):
#         fullpath = os.path.join(path, file)
#         result = os.popen('gunzip -c ' + fullpath + ' |wc -l')
#         output = result.read()
#         result = int(output.split('\n')[0])
#         numseqs = int(result)/4.0
#         genecount.at[file, 'total_read'] = numseqs
#         genecount['Unnamed: 0'] = genecount['Unnamed: 0'].str.replace('_R1.fastq.gz', '').str.replace('_R2.fastq.gz', '')
#         genecount = genecount.drop_duplicates()
# genecount.to_csv('/research/projects/chlamydomonas/MAexpression/data/gene_count/total_mapped_reads.csv', sep = '\t', index = True, header = True)

### Finding the percent of reads mapped ####
total_mapped_reads = pd.read_csv('/research/projects/chlamydomonas/MAexpression/data/gene_count/total_mapped_reads.csv', delimiter = '\t', index_col = 'Unnamed: 0')

for file in total_mapped_reads.index.values:
    a = pd.read_csv('/research/projects/chlamydomonas/MAexpression/data/gene_count/summaries/' + file + '_genes_count.summary', delimiter = '\t')
    total_mapped_reads.at[file, 'assigned_reads'] = a.iloc[0][1]
total_mapped_reads['percent_mapped_reads'] = total_mapped_reads['assigned_reads']/total_mapped_reads['total_read'] * 100
total_mapped_reads.to_csv('/research/projects/chlamydomonas/MAexpression/data/gene_count/total_mapped_reads.csv', sep = '\t', index = True, header = True)

## Pooling DESeq2 DEG data output

In [2]:
dir_path = r'/research/projects/chlamydomonas/MAexpression/analysis/DEGs/degs/'
total_degs = {}

res = []

# Iterate directory
for path in os.listdir(dir_path):
    if os.path.isfile(os.path.join(dir_path, path)):
        res.append(path)

for i in res:
    file_deg = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/DEGs/degs/' + i, delimiter = ',')
    total_degs[i] = file_deg['Unnamed: 0']

total_degs = pd.DataFrame(dict([(k,pd.Series(v)) for k, v in total_degs.items()]))

total_degs.columns = total_degs.columns.str.replace('.resSig','').str.replace('.genes','').str.replace('_', '-')
total_degs.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/DEGs/total_genes1.csv', sep = ',', header = True, index = False)

  total_degs.columns = total_degs.columns.str.replace('.resSig','').str.replace('.genes','').str.replace('_', '-')
  total_degs.columns = total_degs.columns.str.replace('.resSig','').str.replace('.genes','').str.replace('_', '-')


## DESeq2 Filtered raw data

In [3]:
CC2344_l2f = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2344_log2Fold', delimiter = '\t')
CC2931_l2f = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2931_log2Fold', delimiter = '\t')

CC2344_deseq_filtered_reads = CC2344_l2f['Unnamed: 0']
CC2931_deseq_filtered_reads = CC2931_l2f['Unnamed: 0']

#### Extracting the genes that were kept after independent filtering ####
CC2344 = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2344_deseq_normalized_wt_GC', delimiter = '\t', index_col = 'Unnamed: 0')
CC2344_deseq_filtered_normalized = CC2344.loc[CC2344_deseq_filtered_reads]
CC2931 = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2931_deseq_normalized_wt_GC', delimiter = '\t', index_col = 'Unnamed: 0')
CC2931_deseq_filtered_normalized = CC2931.loc[CC2931_deseq_filtered_reads]

#### Redo Column Labels ####
columns = CC2931_deseq_filtered_normalized.columns.str.replace('.','-', regex = True).str.replace('_', '-', regex = True)
CC2931_deseq_filtered_normalized.columns = columns

columns = CC2344_deseq_filtered_normalized.columns.str.replace('.','-', regex = True).str.replace('_', '-', regex = True)
CC2344_deseq_filtered_normalized.columns = columns

#### Saving file ####
CC2344_deseq_filtered_normalized.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2344_deseq_filtered_normalized.txt', sep = '\t', index = True, header = True)
CC2931_deseq_filtered_normalized.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2931_deseq_filtered_normalized.txt', sep = '\t', index = True, header = True)

###################################################
#### Removing gc content and total_length ####
###################################################
CC2344_raw = CC2344_deseq_filtered_normalized[CC2344_deseq_filtered_normalized.columns[:-2]]
CC2931_raw = CC2931_deseq_filtered_normalized[CC2931_deseq_filtered_normalized.columns[:-2]]

#### Taking the mean of the ancestral lines ####
CC2931_raw['CC2931-ANC'] = CC2931_raw[['CC2931-ANC-rep1', 'CC2931-ANC-rep2', 'CC2931-ANC-rep3']].mean(axis = 1)
CC2344_raw['CC2344-ANC'] = CC2344_raw[['CC2344-ANC-rep1', 'CC2344-ANC-rep2', 'CC2344-ANC-rep3']].mean(axis = 1)

CC2931_raw.drop(['CC2931-ANC-rep1', 'CC2931-ANC-rep2', 'CC2931-ANC-rep3'], inplace=True, axis=1)
CC2344_raw.drop(['CC2344-ANC-rep1', 'CC2344-ANC-rep2', 'CC2344-ANC-rep3'], inplace=True, axis=1)

#### Exporting filtered raw counts ####
CC2931_raw.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2931_raw', sep = "\t", index = True, header = True)
CC2344_raw.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/raw_counts/CC2344_raw', sep = "\t", index = True, header = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CC2931_raw['CC2931-ANC'] = CC2931_raw[['CC2931-ANC-rep1', 'CC2931-ANC-rep2', 'CC2931-ANC-rep3']].mean(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CC2344_raw['CC2344-ANC'] = CC2344_raw[['CC2344-ANC-rep1', 'CC2344-ANC-rep2', 'CC2344-ANC-rep3']].mean(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CC2931_raw.drop(['CC2931-ANC-rep1', 'C

## Creating Annotation files with gene characteristics using GFF files

In [None]:
annotation = gffpd.read_gff3('/research/projects/chlamydomonas/MAexpression/genome_info/v6_genome_plus_anno/CC4532.v1_1.gene_exons.gff3')
mRNA = annotation.df
mRNA['attributes'] = mRNA['attributes'].str.split(';', expand = True)
type_gene = mRNA.loc[mRNA['type'] == 'gene']
type_gene['attributes'] = type_gene['attributes'].str.replace('ID=', '', regex = True)
type_gene['length'] = 'nan'
for i in list(type_gene.index.values):
    type_gene.at[i, 'length'] = type_gene.at[i, 'end'] - type_gene.at[i, 'start']
type_gene.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/v6_genes.csv', sep = '\t', header = True, index = False)

#### ADDING THE GENE NAME TO EACH MRNA TYPE ####
type_gene = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/v6_genes.csv', delimiter = '\t')
type_mRNA = pd.DataFrame()
for i in list(type_gene.index.values):
    chromosome = mRNA.loc[mRNA['seq_id'] == type_gene.loc[i, 'seq_id']]
    pacid = chromosome.loc[chromosome['type'] == 'mRNA']
    section = pacid.loc[(pacid['start'] == type_gene.at[i, 'start']) | (pacid['end'] == type_gene.at[i, 'end'])]
    section['genename'] = type_gene.loc[i, 'attributes']
    type_mRNA = type_mRNA.append(section)
type_mRNA.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/v6_mRNA.csv', sep = '\t', index = False, header = True)

#### ASSIGNING A GENENAME TO EACH CDS, EXON AND UTR ####
edited_mRNA = pd.DataFrame()
type_mRNA = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/v6_mRNA.csv', delimiter = '\t')
for i in list(type_mRNA.index.values):
    prefix = type_mRNA.loc[i, 'attributes']
    section = mRNA.loc[mRNA['attributes'].str.startswith(prefix)]
    section['genename'] = type_mRNA.at[i, 'genename']
    edited_mRNA = pd.concat([edited_mRNA, section])
edited_mRNA.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/edited_v6_annotation.csv', sep = '\t', index = False, header = True)

## Extracting gene locations via GFF file

### Isolating the location of each Mutation

In [None]:
### OPENING ANNOTATION GFF FILES ####
type_gene = pd.read_csv('/research/projects/chlamydomonas/MAexpression/analysis/annotation/v6_genes.csv', delimiter = '\t')

#### OPENING FILE WITH MUTATIONS AND THEIR KNOWN LOCATIONS IN V6 ####
mutations = pd.read_csv('/research/projects/chlamydomonas/MAexpression/genome_info/mutation_info/v6_coordinates.bed', delimiter = '\t', names = ['chromosome', 'start', 'end', 'sample'])
mutations['sample'] = mutations['sample'].str.replace('_', '-L', regex = True)
mutations = mutations.sort_values('sample') # There are two missing samples CC2931_8 and CC2931_12, the first one has no recorded mutations
mutations['gene'] = 'intergenic'
mutations['gene_start'] = np.nan
mutations['gene_end'] = np.nan

#### REMOVING ALL CC2931_12 MUTATIONS FROM THE DATAFRAME ####
missing_CC2931_12 = mutations.loc[mutations['sample'] == 'CC2931-L12'].index.values
mutations.drop(missing_CC2931_12, inplace = True)

#### FIND THE GENE THE MUTATION OCCURRED ####
for i in list(mutations.index.values):
    spec_chrom = type_gene.loc[type_gene['seq_id'] == mutations.at[i, 'chromosome']]
    spec_chrom = spec_chrom.loc[(spec_chrom['start'] <= mutations.at[i, 'start']) & (spec_chrom['end'] >= mutations.at[i, 'start'])]
    if len(spec_chrom) != 0:
        mutations.at[i, 'gene'] = spec_chrom['attributes'].values[0]
        mutations.at[i, 'gene_start'] = spec_chrom['start'].values[0]
        mutations.at[i, 'gene_end'] = spec_chrom['end'].values[0]
    if mutations.at[i, 'gene'] == 'intergenic':
        mutations.at[i, 'gene_start'] = mutations.at[i, 'start']
        mutations.at[i, 'gene_end'] = mutations.at[i, 'end']
mutations.to_csv('/research/projects/chlamydomonas/MAexpression/genome_info/mutation_info/all_mutations.csv', sep = "\t", index = False, header = True)

## Extracting iCre1355 metabolic genes

In [73]:
a = pd.read_csv('/research/projects/chlamydomonas/MAexpression/genome_info/iCre1355_Gene_transcript_map.gms.txt', delimiter = '\t')
a['transcripts'] = a['transcripts'].map(lambda x: x.lstrip('t(').rstrip(')=0)=0;'))

for idx in a.index.values:
    a.loc[idx] = a.loc[idx]['transcripts'].replace("')$(g('", '_')
    a.loc[idx] = a.loc[idx]['transcripts'].replace("'", '')
    
a[['transcripts', 'gene']] = a['transcripts'].str.split('_', regex = True, expand = True)
a.to_csv('/research/projects/chlamydomonas/MAexpression/analysis/divergence/iCre1355_genes.csv', sep = '\t', index = False, header = True)