In [1]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/microbiome_evolution/microbiome_evolution_SHALON/")

import config
import pandas as pd
import numpy as np
from datetime import datetime

import subprocess

import pickle

import os

import itertools

import bz2

#MIDAS postprocessing scripts
from calculate_intersample_changes import *
import parse_midas_data
import diversity_utils
import core_gene_utils
import parse_patric



import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline  


# Notebook-wide switch. When True, the SNV-change cells below keep only sample
# pairs drawn from the same subject, and the save/load cells use the
# "snp_changes_WithinHost.txt.bz2" file name instead of "snp_changes.txt.bz2".
within_host_changes_condition = True



### Species

In [2]:
# Text file with one species identifier per line.
species_list_path = "/u/project/ngarud/Garud_lab/metagenomic_fastq_files/Shalon_2023/metadata/species_snps.txt"

# Collect every non-blank line, with surrounding whitespace removed.
species_list = []
with open(species_list_path) as species_file:
    for raw_line in species_file:
        species_name = raw_line.strip()
        if species_name:
            species_list.append(species_name)

In [3]:
# Preview the first ten species identifiers read from the list file.
species_list[:10]

['Acidaminococcus_intestini_54097',
 'Actinomyces_graevenitzii_58300',
 'Actinomyces_sp_57735',
 'Actinomyces_sp_62581',
 'Actinomyces_viscosus_57672',
 'Adlercreutzia_equolifaciens_60310',
 'Aggregatibacter_aphrophilus_58143',
 'Akkermansia_muciniphila_55290',
 'Alistipes_finegoldii_56071',
 'Alistipes_indistinctus_62207']

In [4]:
# Keep only the species for which calculate_haploid_samples reports more than
# one sample; everything downstream iterates over this shortlist.
good_species = []

for species in species_list:
    haploid_samples = diversity_utils.calculate_haploid_samples(species)
    if len(haploid_samples) <= 1:
        continue
    good_species.append(species)

In [10]:
# Preview the first five species that passed the haploid-sample filter.
good_species[:5]

['Acidaminococcus_intestini_54097',
 'Akkermansia_muciniphila_55290',
 'Alistipes_finegoldii_56071',
 'Alistipes_onderdonkii_55464',
 'Alistipes_putredinis_61533']

In [7]:
# Spot-check a single species: compute its haploid samples twice, once with
# the default settings and once with use_HMP_freqs switched on.
species = "Bacteroides_massiliensis_44749"
haploid_samples = diversity_utils.calculate_haploid_samples(species)
haploid_samples_hmp = diversity_utils.calculate_haploid_samples(
    species,
    use_HMP_freqs=True,
)

# Loop to capture within host SNV changes

In [35]:
# Show the haploid samples computed for the spot-check species.
haploid_samples

array(['SRR18585014', 'SRR18585015', 'SRR18585017', 'SRR18585018',
       'SRR18585019', 'SRR18585020', 'SRR18585021', 'SRR18585023',
       'SRR18585022', 'SRR18585024', 'SRR18585025', 'SRR18585026',
       'SRR18585028', 'SRR18585172', 'SRR18585174', 'SRR18585176'],
      dtype='|S11')

In [27]:
# Single-species dry run of the SNV-change collection. It performs the same
# steps as the all-species loop, but for one hard-coded species, which makes it
# convenient for debugging. It only fills the per-column lists; the empty
# snp_changes_df created here is not populated by this cell.
subject_sample_map = parse_midas_data.parse_subject_sample_map()
sample_metadata_map = parse_midas_data.parse_sample_order_map()

#Initialize df
snp_changes_df = pd.DataFrame(columns = ['species','contig','site_pos','gene','variant_type','sample1', 'sample2', 'alternate_freq_1', 'depth_1', 'alternate_freq_2', 'depth_2'])
accession_1 = []
accession_2 = []
species_vec = []
contig = []
site_pos = []
gene = []
variant_type = []
sample1 = []
sample2 = []
alternate_freq_1 = []
depth_1 = []
alternate_freq_2 = []
depth_2 = []
opportunities_vec = []
gene_description_vec = []


counter = 0
no_of_species = len(good_species)
species = "Bacteroides_massiliensis_44749"
counter += 1
print("%s%s (%s/%s species)" % ("Processing ", species, counter, no_of_species))
#gene descriptions
genome_ids = parse_midas_data.get_ref_genome_ids(species)
non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species)
gene_descriptions = parse_patric.load_patric_gene_descriptions(genome_ids, non_shared_genes)
centroid_gene_map = parse_midas_data.load_centroid_gene_map(species)
#haploid samples
haploid_samples = diversity_utils.calculate_haploid_samples(species, use_HMP_freqs = True)
#change map
intersample_change_map = load_intersample_change_map(species)
intersample_change_map_within_host = {}

if within_host_changes_condition:
    # Keep only the change-map entries whose two samples belong to one subject.
    for subject in subject_sample_map:
        host_samples = subject_sample_map[subject].keys()

        for sample_combo in itertools.combinations(host_samples, 2):
            # The change map may store a pair in either order, so look the
            # reversed pair up in the change map itself (the list of
            # combinations never contains the reversed tuple).
            reversed_combo = (sample_combo[1], sample_combo[0])
            if sample_combo in intersample_change_map:
                intersample_change_map_within_host[sample_combo] = intersample_change_map[sample_combo]
            elif reversed_combo in intersample_change_map:
                intersample_change_map_within_host[reversed_combo] = intersample_change_map[reversed_combo]
    intersample_change_map = intersample_change_map_within_host

#Intersample change map subsetting
# Take the 'snps' entry of every pair, then keep pairs whose samples are both haploid.
ICMs = {key: intersample_change_map[key]['snps'] for key in intersample_change_map.keys()}
ICMs = {key: ICMs[key] for key in ICMs.keys() if ((key[0] in haploid_samples) and (key[1] in haploid_samples))}

for key in ICMs.keys():
    # The third element of each 'snps' entry is the list of individual changes.
    # Each change is indexed 0..7 and stored under the column named on each line.
    for snp in ICMs[key][2]:
        accession_1.append(key[0])
        accession_2.append(key[1])
        species_vec.append(species)
        contig.append(snp[1])
        site_pos.append(snp[2])
        gene.append(snp[0])
        variant_type.append(snp[3])
        sample1.append(key[0])
        sample2.append(key[1])
        alternate_freq_1.append(snp[4])
        depth_1.append(snp[5])
        alternate_freq_2.append(snp[6])
        depth_2.append(snp[7])

        # Always append exactly one description per change so this list stays
        # the same length as the others: direct hit, then via the centroid
        # gene, otherwise an empty string.
        snp_gene = snp[0]
        if snp_gene in gene_descriptions:
            snp_gene_description = gene_descriptions[snp_gene]
        elif snp_gene in centroid_gene_map and centroid_gene_map[snp_gene] in gene_descriptions:
            snp_gene_description = gene_descriptions[centroid_gene_map[snp_gene]]
        else:
            snp_gene_description = ""
        gene_description_vec.append(snp_gene_description)

Processing Bacteroides_massiliensis_44749 (1/69 species)


In [33]:
# Rebuild the per-pair SNV table: pull the 'snps' entry for every sample pair,
# then retain only pairs in which both samples are haploid.
ICMs = dict(
    (sample_pair, pair_changes['snps'])
    for sample_pair, pair_changes in intersample_change_map.items()
)
ICMs = dict(
    (sample_pair, snp_entry)
    for sample_pair, snp_entry in ICMs.items()
    if (sample_pair[0] in haploid_samples) & (sample_pair[1] in haploid_samples)
)


In [34]:
# Inspect the filtered map: sample pair -> its 'snps' entry.
ICMs

{('SRR18585014', 'SRR18585015'): (2427270.0, 6.93445e-10, []),
 ('SRR18585014', 'SRR18585017'): (2431730.0, 4.92197e-11, []),
 ('SRR18585015', 'SRR18585017'): (2423810.0, 5.27426e-10, []),
 ('SRR18585018', 'SRR18585019'): (2350560.0, 8.02574e-08, []),
 ('SRR18585020', 'SRR18585022'): (2405880.0, 1.12478e-21, []),
 ('SRR18585020', 'SRR18585023'): (2406250.0, 1.62153e-22, []),
 ('SRR18585021', 'SRR18585022'): (2430200.0, 1.21175e-17, []),
 ('SRR18585021', 'SRR18585023'): (2430510.0, 7.58251e-18, []),
 ('SRR18585024', 'SRR18585026'): (2433280.0, 4.76305e-16, []),
 ('SRR18585025', 'SRR18585026'): (2415330.0, 1.68232e-10, [])}

In [None]:
def _restrict_to_within_host(change_map, subject_sample_map):
    """Return the entries of change_map whose two samples share a subject.

    change_map is keyed by (sample, sample) tuples; subject_sample_map maps
    each subject to a dict keyed by that subject's samples. A pair may be
    stored in the change map in either order, so both orders are tried and the
    stored order is kept as the key of the result.
    """
    within_host = {}
    for subject in subject_sample_map:
        host_samples = subject_sample_map[subject].keys()
        for sample_combo in itertools.combinations(host_samples, 2):
            reversed_combo = (sample_combo[1], sample_combo[0])
            if sample_combo in change_map:
                within_host[sample_combo] = change_map[sample_combo]
            elif reversed_combo in change_map:
                within_host[reversed_combo] = change_map[reversed_combo]
    return within_host


def _lookup_gene_description(gene_name, gene_descriptions, centroid_gene_map):
    """Return a description for gene_name, or "" when none is available.

    The gene itself is looked up first; failing that, the gene it maps to in
    centroid_gene_map is tried. Always returns a value so callers can append
    exactly one description per SNV.
    """
    if gene_name in gene_descriptions:
        return gene_descriptions[gene_name]
    if gene_name in centroid_gene_map and centroid_gene_map[gene_name] in gene_descriptions:
        return gene_descriptions[centroid_gene_map[gene_name]]
    return ""


subject_sample_map = parse_midas_data.parse_subject_sample_map()
sample_metadata_map = parse_midas_data.parse_sample_order_map()

# Column order of the resulting frame (kept identical to earlier outputs).
snp_change_columns = ['species','contig','site_pos','gene','variant_type','sample1', 'sample2',
                      'alternate_freq_1', 'depth_1', 'alternate_freq_2', 'depth_2',
                      'accession_1', 'accession_2', 'gene_description']
snp_change_records = []

no_of_species = len(good_species)

for counter, species in enumerate(good_species, 1):

    print("%s%s (%s/%s species)" % ("Processing ", species, counter, no_of_species))

    #gene descriptions
    genome_ids = parse_midas_data.get_ref_genome_ids(species)
    non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species)
    gene_descriptions = parse_patric.load_patric_gene_descriptions(genome_ids, non_shared_genes)
    centroid_gene_map = parse_midas_data.load_centroid_gene_map(species)

    #haploid samples
    haploid_samples = diversity_utils.calculate_haploid_samples(species, use_HMP_freqs = True)

    #change map
    intersample_change_map = load_intersample_change_map(species)
    if within_host_changes_condition:
        intersample_change_map = _restrict_to_within_host(intersample_change_map, subject_sample_map)

    #Intersample change map subsetting
    # Take the 'snps' entry of every pair, then keep pairs whose samples are both haploid.
    ICMs = {key: intersample_change_map[key]['snps'] for key in intersample_change_map.keys()}
    ICMs = {key: ICMs[key] for key in ICMs.keys() if ((key[0] in haploid_samples) and (key[1] in haploid_samples))}

    for key in ICMs.keys():
        # The third element of each 'snps' entry is the list of individual
        # changes; each change is indexed 0..7 and stored under the column
        # named below.
        for snp in ICMs[key][2]:
            snp_change_records.append({
                'species': species,
                'contig': snp[1],
                'site_pos': snp[2],
                'gene': snp[0],
                'variant_type': snp[3],
                'sample1': key[0],
                'sample2': key[1],
                'alternate_freq_1': snp[4],
                'depth_1': snp[5],
                'alternate_freq_2': snp[6],
                'depth_2': snp[7],
                'accession_1': key[0],
                'accession_2': key[1],
                'gene_description': _lookup_gene_description(snp[0], gene_descriptions, centroid_gene_map),
            })

# One row per SNV change; an empty record list still yields the full set of columns.
snp_changes_df = pd.DataFrame(snp_change_records, columns = snp_change_columns)


### Annotate snp_changes_df

In [None]:
# Load the per-sample metadata lookup. The cells below index it as
# metadata_map[sample][i], i.e. sample accession -> indexable metadata record.
metadata_map = parse_midas_data.parse_sample_metadata_map()

In [None]:
# Attach per-sample metadata to every SNV change. Each entry pairs an output
# column stem with the position read from the sample's metadata record; the
# value is written once for sample1 ("<stem>_1") and once for sample2 ("<stem>_2").
paired_metadata_fields = [
    ('type', 2),
    ('swallow_date_time', 3),
    ('sample_set', 8),
    ('sample_type', 9),
    ('location', 10),
    ('day', 11),
]

# Use the boolean notebook flag here. The similarly named `within_host_changes`
# is a DataFrame created further down and cannot be used as a condition.
if within_host_changes_condition:
    # Both samples come from the same subject, so one column is enough.
    snp_changes_df['subject'] = snp_changes_df.sample1.apply(lambda sample: metadata_map[sample][0])
else:
    snp_changes_df['subject_1'] = snp_changes_df.sample1.apply(lambda sample: metadata_map[sample][0])
    snp_changes_df['subject_2'] = snp_changes_df.sample2.apply(lambda sample: metadata_map[sample][0])

for field_name, field_index in paired_metadata_fields:
    snp_changes_df[field_name + '_1'] = snp_changes_df.sample1.apply(lambda sample: metadata_map[sample][field_index])
    snp_changes_df[field_name + '_2'] = snp_changes_df.sample2.apply(lambda sample: metadata_map[sample][field_index])

### Saving full snv dataframe

In [None]:
# Write the SNV-change table under <project_directory>/evolutionary_changes/,
# creating the folder on first use.
evolution_folder = "%s%s" % (config.project_directory, "evolutionary_changes/")
if not os.path.exists(evolution_folder):
    # Create the directory if it doesn't exist
    os.makedirs(evolution_folder)
    print("Directory '{}' created successfully.".format(evolution_folder))
else:
    print("Directory '{}' already exists.".format(evolution_folder))
if within_host_changes_condition:
    snv_changes_df_output = "%s%s" % (evolution_folder, "snp_changes_WithinHost.txt.bz2")
else:
    snv_changes_df_output = "%s%s" % (evolution_folder, "snp_changes.txt.bz2")
# Request bz2 explicitly: the file name promises it, and older pandas releases
# do not infer compression from the extension when writing.
snp_changes_df.to_csv(snv_changes_df_output, index = False, sep = ",", compression = "bz2")


### Loading snv dataframe

In [None]:
# Read the SNV-change table back from the evolutionary_changes folder; the
# file name depends on whether the within-host flag is set.
evolution_folder = "%s%s" % (config.project_directory, "evolutionary_changes/")
snv_changes_filename = "snp_changes_WithinHost.txt.bz2" if within_host_changes_condition else "snp_changes.txt.bz2"
snv_changes_df_output = "%s%s" % (evolution_folder, snv_changes_filename)
snp_changes_df = pd.read_csv(snv_changes_df_output, sep = ",")

# Making the opportunities dataframe

In [None]:
# Build one row per (species, haploid sample pair) holding the first element
# of that pair's 'snps' entry, stored here as "opportunities", then annotate
# both samples with metadata.
metadata_map = parse_midas_data.parse_sample_metadata_map()

opportunity_records = []

for species in good_species:
    print("Processing " + species)
    intersample_change_map = load_intersample_change_map(species)

    # Keyword aligned with every other calculate_haploid_samples call in this
    # notebook (use_HMP_freqs). NOTE(review): confirm against the function signature.
    haploid_samples = diversity_utils.calculate_haploid_samples(species, use_HMP_freqs = True) #MW: 06/08: Changed to true

    for sample_pair in itertools.combinations(haploid_samples, 2):
        # The change map may store the pair in either order.
        reversed_pair = (sample_pair[1], sample_pair[0])
        if sample_pair in intersample_change_map:
            stored_pair = sample_pair
        elif reversed_pair in intersample_change_map:
            stored_pair = reversed_pair
        else:
            print("Error: " + str(sample_pair) + " not found.")
            continue

        opportunities = intersample_change_map[stored_pair]['snps'][0]
        opportunity_records.append((species, stored_pair[0], stored_pair[1], opportunities))

opportunities_df = pd.DataFrame(opportunity_records, columns = ['species', 'accession_1', 'accession_2', 'opportunities'])

#Annotation
# (column stem, position in the sample's metadata record); written for both
# accessions as "<stem>_1" and "<stem>_2".
opportunity_metadata_fields = [
    ('subject', 0),
    ('type', 2),
    ('swallow_date_time', 3),
    ('sample_set', 8),
    ('sample_type', 9),
    ('location', 10),
    ('day', 11),
]

for field_name, field_index in opportunity_metadata_fields:
    opportunities_df[field_name + '_1'] = opportunities_df.accession_1.apply(lambda sample: metadata_map[sample][field_index])
    opportunities_df[field_name + '_2'] = opportunities_df.accession_2.apply(lambda sample: metadata_map[sample][field_index])


### Saving opportunities dataframe

In [None]:
# Write the opportunities table under <project_directory>/evolutionary_changes/,
# creating the folder on first use.
evolution_folder = "%s%s" % (config.project_directory, "evolutionary_changes/")
if not os.path.exists(evolution_folder):
    # Create the directory if it doesn't exist
    os.makedirs(evolution_folder)
    print("Directory '{}' created successfully.".format(evolution_folder))
else:
    print("Directory '{}' already exists.".format(evolution_folder))
opportunities_df_output = "%s%s" % (evolution_folder, "opportunities.txt.bz2")
# Request bz2 explicitly: the file name promises it, and older pandas releases
# do not infer compression from the extension when writing.
opportunities_df.to_csv(opportunities_df_output, index = False, sep = ",", compression = "bz2")


### Loading opportunities dataframe

In [None]:
# Read the opportunities table back from the evolutionary_changes folder.
evolution_folder = "%s%s" % (config.project_directory, "evolutionary_changes/")
opportunities_df_output = "%s%s" % (evolution_folder, "opportunities.txt.bz2")
opportunities_df = pd.read_csv(
    opportunities_df_output,
    sep=",",
)

## Within set data

In [None]:
# Within-set changes: both samples are capsules and share a sample set.
condition_1 = snp_changes_df['sample_type_1'].eq("Capsule") & snp_changes_df['sample_type_2'].eq("Capsule")
condition_2 = snp_changes_df['sample_set_1'] == snp_changes_df['sample_set_2']
# NOTE(review): condition_3 (sample set is one of "1".."5") is computed but is
# not part of the filter below - confirm whether it should be applied.
condition_3 = [sample_set in ["1","2","3","4","5"] for sample_set in snp_changes_df['sample_set_1']]

within_set_changes = snp_changes_df.loc[condition_1 & condition_2].sort_values(by=['species', 'contig', 'gene'])


## Within host data

In [None]:
# List the columns available on the SNV-change table.
snp_changes_df.columns

In [None]:
# Within-host changes: keep rows where both samples are capsules, ordered by
# species, contig and gene.
condition_1 = snp_changes_df['sample_type_1'].eq("Capsule") & snp_changes_df['sample_type_2'].eq("Capsule")

within_host_changes = snp_changes_df.loc[condition_1].sort_values(by=['species', 'contig', 'gene'])


# TIME ANALYSIS - WITHIN SET

## Subject 9
**Species:** Desulfovibrio_piger_61475
- **Gene of interest:** 411464.8.peg.774
- **Annotation:** Formate dehydrogenase O alpha subunit (EC 1.2.1.2) @ selenocysteine-containing

## Subject 11
**Species:** Bacteroides_vulgatus_57955
- **Region of interest:** NC_009614, 363467 - 5148355

In [None]:
#1. Set variables
species = "Anaerostipes_hadrus_55206"
subject = "8"
subject_integer = int(subject)

## Samples of this subject that are also haploid for the species (min_coverage=10)
subject_samples = parse_midas_data.parse_subject_sample_map()[subject].keys()
haploid_samples = diversity_utils.calculate_haploid_samples(species, min_coverage=10)
subject_samples = [sample for sample in subject_samples if sample in haploid_samples]

## Per-sample metadata table: one row per retained sample
metadata_map = parse_midas_data.parse_sample_metadata_map()
subject_sample_metadata = pd.DataFrame()
subject_sample_metadata['sample'] = subject_samples
# Each column is read from a fixed position of the sample's metadata record.
subject_sample_metadata['sample_type'] = subject_sample_metadata['sample'].apply(
    lambda sample: metadata_map[sample][9])
subject_sample_metadata['sample_set'] = subject_sample_metadata['sample'].apply(
    lambda sample: metadata_map[sample][8])
subject_sample_metadata['location'] = subject_sample_metadata['sample'].apply(
    lambda sample: metadata_map[sample][10])







In [None]:
#2. Extract contig and locus information
# Set of (contig, site_pos) tuples that changed within a sample set for the
# chosen subject and species.
# Subjects are compared as strings so the filter behaves the same whether the
# column holds strings (frame built in this session) or integers (frame
# re-read from CSV, where read_csv may parse the column as numeric).
subject_matches = within_set_changes['subject'].astype(str) == str(subject)
# Restrict to the species being analysed, as the within-host section does.
species_matches = within_set_changes['species'] == species
loci_table = within_set_changes.loc[subject_matches & species_matches, ['contig', 'site_pos']].drop_duplicates()
loci_of_interest = set(loci_table.to_records(index=False).tolist())


In [None]:
#3. Pull out allele frequencies of variants of interest in subject samples
# Scans the species' annotated SNP file and, for every site listed in
# loci_of_interest, records the alt count and depth of each subject sample.
# The result (freq_df) has one row per (site, sample) with alt, depth,
# allele_frequency and the sample metadata.

## Load the bz2 file
data_directory = config.data_directory
snp_file =  bz2.BZ2File("%ssnps/%s/annotated_snps.txt.bz2" % (data_directory, species),"r")

## Setting up the loop
line_number = -1
previous_gene_name = ""
gene_name = ""
chunk_size = 20000000  # stop scanning at the first gene boundary past this many lines

## Initializing
chrom_vec = []
location_vec = []
alts_rows = []    # one list of per-sample alt values per extracted site
depths_rows = []  # one list of per-sample depths per extracted site
no_of_loci = len(loci_of_interest)

try:
    ## Header info
    line = snp_file.readline()
    items = line.split()[1:]
    # NOTE(review): sample_utils is not imported explicitly in this notebook;
    # it presumably arrives through the star import - confirm.
    samples_in_file = sample_utils.parse_merged_sample_names(items)

    # Sample indices: column position of every subject sample in the file
    desired_sample_idxs = []
    for sample in subject_samples:
        desired_sample_idxs.append( np.nonzero(samples_in_file==sample)[0][0] )
    desired_sample_idxs = np.array(desired_sample_idxs, dtype=int)
    desired_samples = samples_in_file[desired_sample_idxs]

    # Nothing to look for: skip the scan entirely.
    lines_to_scan = snp_file if no_of_loci > 0 else []

    for line in lines_to_scan:

        line_number += 1
        previous_gene_name = gene_name

        if line_number%100000==0:
            sys.stderr.write("%dk sites processed...\n" % (line_number/1000))

        items = line.split()
        # First field is "chromosome|location|gene|variant_type"
        info_items = items[0].split("|")
        chromosome = info_items[0]
        location = int(info_items[1])
        gene_name = info_items[2]
        location_tuple = (chromosome, location)

        if line_number >= chunk_size and gene_name!=previous_gene_name:
            # We are done for now!
            sys.stderr.write("Breaking at line " + str(line_number) + "\n")
            break

        #If it's not one of the sites of interest, move to the next line
        if location_tuple not in loci_of_interest:
            continue

        sys.stderr.write("EXTRACTING: " + str(location_tuple) + "\n")
        chrom_vec.append(chromosome)
        location_vec.append(location)

        ## Getting alts and depths for sites of interest ("alt,depth" per sample)
        alts = []
        depths = []
        for idx in desired_sample_idxs:
            subitems = items[1+idx].split(",")
            alts.append(float(subitems[0]))
            depths.append(float(subitems[1]))
        alts_rows.append(alts)
        depths_rows.append(depths)

        if len(alts_rows) == no_of_loci:
            sys.stderr.write("Successfully extracted all sites.\n")
            break
finally:
    # Release the file handle even if parsing fails part-way.
    snp_file.close()

## creating dataframe with alts, depths, and freqs for each sample
# Building from lists of rows gives a (sites x samples) frame for any number
# of extracted sites, including zero or one.
alts_df = pd.DataFrame(data = alts_rows, columns = desired_samples)
depths_df = pd.DataFrame(data = depths_rows, columns = desired_samples)
alts_df['contig'] = chrom_vec
alts_df['site_pos'] = location_vec
depths_df['contig'] = chrom_vec
depths_df['site_pos'] = location_vec
alts_df = alts_df.melt(var_name='sample', value_name='alt', id_vars = ['contig', 'site_pos'])
depths_df = depths_df.melt(var_name='sample', value_name='depth', id_vars = ['contig', 'site_pos'])
freq_df = pd.merge(alts_df, depths_df, on=['sample', 'site_pos', 'contig'])
freq_df['allele_frequency'] = freq_df['alt']/freq_df['depth']
## Merge with subject_sample map
freq_df = pd.merge(freq_df, subject_sample_metadata, on = 'sample')


In [None]:
#4. Repolarize
# Adds freq_df['polarized_af']: the allele frequency, flipped to 1 - af at the
# sites selected for the current species. Species without a rule (or
# polarize = False) get polarized_af equal to allele_frequency, so the
# plotting cell always finds the column.
polarize = True

# Species with a fixed list of site positions to flip.
fixed_sites_to_repolarize = {
    "Desulfovibrio_piger_61475": [147085, 147116],
}

# Species whose sites are chosen from a reference sample:
# (location, sample_set, threshold) -> flip every site whose allele frequency
# in that location/sample set is below the threshold.
reference_sample_rules = {
    "Bacteroides_vulgatus_57955": ("Small intestine 2", "3", 0.4),
    "Anaerostipes_hadrus_55206": ("Ascending colon", "3", 0.6),
}

sites_to_repolarize = []
if polarize:
    if species in fixed_sites_to_repolarize:
        sites_to_repolarize = fixed_sites_to_repolarize[species]
    elif species in reference_sample_rules:
        reference_location, reference_sample_set, af_threshold = reference_sample_rules[species]
        below_threshold_in_reference = (
            (freq_df.location == reference_location)
            & (freq_df.sample_set == reference_sample_set)
            & (freq_df.allele_frequency < af_threshold)
        )
        sites_to_repolarize = freq_df[below_threshold_in_reference].site_pos.values.tolist()

# Sites are matched on site_pos alone (contig is not considered), as before.
flip_site = freq_df['site_pos'].isin(sites_to_repolarize)
freq_df['polarized_af'] = freq_df['allele_frequency'].where(~flip_site, 1 - freq_df['allele_frequency'])



In [None]:
#5. Saving
# One file per species in the evolutionary_changes folder. The content is
# tab-separated even though the file name ends in .csv.
output_path = "%sevolutionary_changes/%s_withinsetchanges.csv" % (config.project_directory, species)
freq_df.to_csv(
    output_path,
    sep="\t",
    index=False,
)

In [None]:
#6. Plotting
# One panel per sample set (sample set "5" excluded): polarized allele
# frequency along the sampled locations, one line per site.
# The seaborn keyword is `font_scale` (singular); `font_scales` is rejected.
sns.set(font_scale = 5)

plotting_df = freq_df[freq_df.sample_set != "5"]

g = sns.FacetGrid(data=plotting_df, col = "sample_set", sharex=False, height = 10, aspect=1, legend_out=True)
g.map(sns.lineplot, 'location', 'polarized_af', 'site_pos',palette=sns.color_palette("Set1", plotting_df.site_pos.nunique()))
g.set_titles(fontsize=20)

# Set labels and title
g.set_axis_labels("Location", "Allele Frequency")




In [None]:
#7. Plotting individual sample sets
# Raw (unpolarized) allele frequency by location for sample set "3" only,
# with one coloured line per site.
plotting_df = freq_df.loc[freq_df['sample_set'] == "3"]

fig, ax = plt.subplots(figsize=(20, 8))

sns.lineplot(
    data=plotting_df,
    x='location',
    y='allele_frequency',
    hue='site_pos',
    palette=sns.color_palette("Set1", plotting_df['site_pos'].nunique()),
    ax=ax,
)



# TIME ANALYSIS - WITHIN HOST

In [None]:
#1. Identify species that display within host changes
# Distinct species names present in the within-host change table.
within_host_species = within_host_changes['species'].unique()

In [None]:
#2. Identify subjects in which species displays evolutionary changes
species = "Bacteroides_vulgatus_57955"
# Distinct subjects that have at least one within-host change for this species.
subjects = list(within_host_changes.loc[within_host_changes['species'] == species, 'subject'].unique())

In [None]:
# Display the whole sample-metadata lookup.
# NOTE(review): this prints every sample's record; consider previewing a few entries instead.
metadata_map

In [None]:
#3. For the subject in the current loop, pull out relevant samples and create a metadata table
# Single-subject walkthrough of the within-host extraction: builds the
# subject's sample metadata, collects the loci that changed within this host
# for `species`, then reads alt counts and depths for those loci from the
# species' annotated SNP file. The result is freq_df, one row per (site, sample).
subject = "8"
subject_integer = int(subject)
##Extract haploid subject samples
subject_samples = parse_midas_data.parse_subject_sample_map()[subject].keys()
haploid_samples = diversity_utils.calculate_haploid_samples(species, min_coverage=10)
subject_samples = [sample for sample in subject_samples if sample in haploid_samples]
##Make a metadataframe
metadata_map = parse_midas_data.parse_sample_metadata_map()
subject_sample_metadata = pd.DataFrame()
subject_sample_metadata['sample'] = subject_samples
subject_sample_metadata['sample_type'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][9])
subject_sample_metadata['sample_set'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][8])
subject_sample_metadata['location'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][10])
# Empty swallow timestamps stay empty; others are reduced to their calendar date.
subject_sample_metadata['swallow_date'] = subject_sample_metadata['sample'].apply(lambda sample: '' if metadata_map[sample][3] == '' else datetime.strptime(metadata_map[sample][3], "%Y-%m-%dT%H:%M:%SZ").date())
subject_sample_metadata['collection_date'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][6])
subject_sample_metadata['subject'] = subject
subject_sample_metadata['species'] = species

#4. Extract contig and locus information
# Subjects are compared as strings so the filter behaves the same whether the
# column holds strings (frame built in this session) or integers (frame
# re-read from CSV, where read_csv may parse the column as numeric).
subject_matches = within_host_changes['subject'].astype(str) == subject
species_matches = within_host_changes['species'] == species
loci_table = within_host_changes.loc[subject_matches & species_matches, ['contig', 'site_pos']].drop_duplicates()
loci_of_interest = set(loci_table.to_records(index=False).tolist())
no_of_loci = len(loci_of_interest)

#5. Pull out allele frequencies of variants of interest in subject samples


## Load the bz2 file
data_directory = config.data_directory
snp_file =  bz2.BZ2File("%ssnps/%s/annotated_snps.txt.bz2" % (data_directory, species),"r")

## Setting up the loop
line_number = -1
previous_gene_name = ""
gene_name = ""
chunk_size = 20000000  # stop scanning at the first gene boundary past this many lines

## Gene descriptions
genome_ids = parse_midas_data.get_ref_genome_ids(species)
non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(species)
gene_descriptions = parse_patric.load_patric_gene_descriptions(genome_ids, non_shared_genes)
centroid_gene_map = parse_midas_data.load_centroid_gene_map(species)

## Initializing
chrom_vec = []
location_vec = []
gene_descriptions_vec = []
alts_rows = []    # one list of per-sample alt values per extracted site
depths_rows = []  # one list of per-sample depths per extracted site

try:
    ## Header info
    line = snp_file.readline()
    items = line.split()[1:]
    # NOTE(review): sample_utils is not imported explicitly in this notebook;
    # it presumably arrives through the star import - confirm.
    samples_in_file = sample_utils.parse_merged_sample_names(items)

    # Sample indices: column position of every subject sample in the file
    desired_sample_idxs = []
    for sample in subject_samples:
        desired_sample_idxs.append( np.nonzero(samples_in_file==sample)[0][0] )
    desired_sample_idxs = np.array(desired_sample_idxs, dtype=int)
    desired_samples = samples_in_file[desired_sample_idxs]

    # Nothing to look for: skip the scan entirely.
    lines_to_scan = snp_file if no_of_loci > 0 else []

    for line in lines_to_scan:

        line_number += 1
        previous_gene_name = gene_name

        if line_number%100000==0:
            sys.stderr.write("%dk sites processed...\n" % (line_number/1000))

        items = line.split()
        # First field is "chromosome|location|gene|variant_type"
        info_items = items[0].split("|")
        chromosome = info_items[0]
        location = int(info_items[1])
        gene_name = info_items[2]
        location_tuple = (chromosome, location)

        if line_number >= chunk_size and gene_name!=previous_gene_name:
            # We are done for now!
            sys.stderr.write("Breaking at line " + str(line_number) + "\n")
            break

        #If it's not one of the sites of interest, move to the next line
        if location_tuple not in loci_of_interest:
            continue

        sys.stderr.write("EXTRACTING: " + str(location_tuple) + "\n")
        chrom_vec.append(chromosome)
        location_vec.append(location)

        ## Getting alts and depths for sites of interest ("alt,depth" per sample)
        alts = []
        depths = []
        for idx in desired_sample_idxs:
            subitems = items[1+idx].split(",")
            alts.append(float(subitems[0]))
            depths.append(float(subitems[1]))
        alts_rows.append(alts)
        depths_rows.append(depths)

        # Exactly one description per extracted site, so the vector always
        # lines up with the rows: direct hit, then via the centroid gene,
        # otherwise an empty string.
        if gene_name in gene_descriptions:
            site_description = gene_descriptions[gene_name]
        elif gene_name in centroid_gene_map and centroid_gene_map[gene_name] in gene_descriptions:
            site_description = gene_descriptions[centroid_gene_map[gene_name]]
        else:
            site_description = ""
        gene_descriptions_vec.append(site_description)

        if len(alts_rows) == no_of_loci:
            sys.stderr.write("Successfully extracted all sites.\n")
            break
finally:
    # Release the file handle even if parsing fails part-way.
    snp_file.close()

## creating dataframe with alts, depths, and freqs for each sample
# Building from lists of rows gives a (sites x samples) frame for any number
# of extracted sites, so no special case is needed for a single site.
alts_df = pd.DataFrame(data = alts_rows, columns = desired_samples)
depths_df = pd.DataFrame(data = depths_rows, columns = desired_samples)
alts_df['contig'] = chrom_vec
alts_df['site_pos'] = location_vec
alts_df['gene_descriptions'] = gene_descriptions_vec
depths_df['contig'] = chrom_vec
depths_df['site_pos'] = location_vec
alts_df = alts_df.melt(var_name='sample', value_name='alt', id_vars = ['contig', 'site_pos', 'gene_descriptions'])
depths_df = depths_df.melt(var_name='sample', value_name='depth', id_vars = ['contig', 'site_pos'])
freq_df = pd.merge(alts_df, depths_df, on=['sample', 'site_pos', 'contig'])
freq_df['allele_frequency'] = freq_df['alt']/freq_df['depth']
# ## Merge with subject_sample map
freq_df = pd.merge(freq_df, subject_sample_metadata, on = 'sample')




sys.stderr.write("\nDONE! with %s\n" % (species))



In [None]:
# Display the per-site, per-sample frequency table built above.
# NOTE(review): shows the full frame; freq_df.head() may be enough for a preview.
freq_df

In [None]:
#5. for loop version
# For every subject that has within-host changes in `species`, pull the alt and
# depth counts of the changed sites out of the annotated SNP file for each of
# that subject's haploid samples, convert them to allele frequencies, attach the
# sample metadata, and stack the per-subject tables into freq_df_final.

subjects = list(within_host_changes[(within_host_changes['species'] == species)].subject.unique())

# Loop-invariant lookups: none of these depend on the subject, so load them once
# instead of re-parsing them for every subject.
subject_sample_map = parse_midas_data.parse_subject_sample_map()
haploid_samples = diversity_utils.calculate_haploid_samples(species, min_coverage=10)
metadata_map = parse_midas_data.parse_sample_metadata_map()
data_directory = config.data_directory
# Stop scanning once this many lines have been read and the gene changes.
chunk_size = 20000000

# Per-subject frames are collected here and concatenated once at the end
# (growing a DataFrame with .append inside the loop is quadratic, and
# DataFrame.append no longer exists in recent pandas).
subject_freq_dfs = []

for subject in subjects:
    sys.stderr.write("Processing subject %s in %s.\n" % (subject, species))
    #3. For the subject in the current loop, pull out relevant samples and create a metadata table
    subject = str(subject)
    subject_integer = int(subject)
    ##Extract haploid subject samples
    subject_samples = [sample for sample in subject_sample_map[subject].keys() if sample in haploid_samples]
    if len(subject_samples) == 0:
        sys.stderr.write("No haploid samples for subject %s; skipping.\n\n" % (subject))
        continue
    ##Make a metadataframe
    # The integer positions below index into the per-sample metadata record.
    # NOTE(review): column meanings are inferred from the target column names only.
    subject_sample_metadata = pd.DataFrame()
    subject_sample_metadata['sample'] = subject_samples
    subject_sample_metadata['sample_type'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][9])
    subject_sample_metadata['sample_set'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][8])
    subject_sample_metadata['location'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][10])
    subject_sample_metadata['swallow_date'] = subject_sample_metadata['sample'].apply(lambda sample: '' if metadata_map[sample][3] == '' else datetime.strptime(metadata_map[sample][3], "%Y-%m-%dT%H:%M:%SZ").date())
    subject_sample_metadata['collection_date'] = subject_sample_metadata['sample'].apply(lambda sample: metadata_map[sample][6])
    subject_sample_metadata['subject'] = subject
    subject_sample_metadata['species'] = species

    #4. Extract contig and locus information
    # Set of (contig, site_pos) tuples, used for O(1) membership tests while scanning the file.
    loci_of_interest = set(within_host_changes[(within_host_changes['subject'] == subject_integer) & (within_host_changes['species'] == species)].sort_values(by = ['site_pos'])[['contig', 'site_pos']].drop_duplicates().to_records(index=False).tolist())
    no_of_loci = len(loci_of_interest)
    if no_of_loci == 0:
        sys.stderr.write("No loci of interest for subject %s; skipping.\n\n" % (subject))
        continue

    #5. Pull out allele frequencies of variants of interest in subject samples

    ## Initializing
    chrom_vec = []
    location_vec = []
    # One row (list of per-sample values) per extracted site. Building the 2-D
    # arrays from these lists once avoids repeated vstack copies and removes the
    # special case for a single extracted site.
    alts_rows = []
    depths_rows = []
    line_number = -1
    gene_name = ""

    ## Load the bz2 file
    snp_file =  bz2.BZ2File("%ssnps/%s/annotated_snps.txt.bz2" % (data_directory, species),"r")
    try:
        ## Header info
        line = snp_file.readline()
        items = line.split()[1:]
        samples_in_file = sample_utils.parse_merged_sample_names(items)

        # Column index of each of this subject's samples in the SNP file
        desired_sample_idxs = np.array([np.nonzero(samples_in_file==sample)[0][0] for sample in subject_samples])
        desired_samples = samples_in_file[desired_sample_idxs]

        for line in snp_file:

            line_number += 1

            previous_gene_name = gene_name

            if line_number%1000000==0:
                sys.stderr.write("%dk sites processed...\n" % (line_number/1000))

            items = line.split()
            # Load information about site; the first field is "contig|position|gene|...".
            info_items = items[0].split("|")
            chromosome = info_items[0]
            # int() (rather than the Python-2-only long()) handles arbitrarily large positions.
            location = int(info_items[1])
            gene_name = info_items[2]
            location_tuple = (chromosome, location)

            if line_number >= chunk_size and gene_name!=previous_gene_name:
                # We are done for now!
                sys.stderr.write("Breaking at line " + str(line_number) + "\n")
                break

            #If it's not one of the sites of interest, move to the next line
            if location_tuple not in loci_of_interest:
                continue

            sys.stderr.write("EXTRACTING: " + str(location_tuple) + "\n")
            chrom_vec.append(chromosome)
            location_vec.append(location)

            ## Getting alts and depths for sites of interest
            # Each per-sample field is "alt,depth".
            alts = []
            depths = []
            for idx in desired_sample_idxs:
                subitems = items[1+idx].split(",")
                alts.append(float(subitems[0]))
                depths.append(float(subitems[1]))
            alts_rows.append(alts)
            depths_rows.append(depths)

            if len(location_vec) == no_of_loci:
                sys.stderr.write("Successfully extracted all sites.\n")
                break
    finally:
        # Always release the file handle, even if parsing fails part-way.
        snp_file.close()

    if len(location_vec) == 0:
        # Nothing was found in the file for this subject. Skip it rather than
        # reusing arrays left over from a previous subject.
        sys.stderr.write("No sites of interest found in SNP file for subject %s; skipping.\n\n" % (subject))
        continue

    ## creating dataframe with alts, depths, and freqs for each sample
    # Shape is (extracted sites, samples) regardless of how many sites were found.
    alts_df = pd.DataFrame(data = np.array(alts_rows), columns = desired_samples)
    depths_df = pd.DataFrame(data = np.array(depths_rows), columns = desired_samples)
    alts_df['contig'] = chrom_vec
    alts_df['site_pos'] = location_vec
    depths_df['contig'] = chrom_vec
    depths_df['site_pos'] = location_vec
    # Wide (one column per sample) -> long (one row per site x sample)
    alts_df = alts_df.melt(var_name='sample', value_name='alt', id_vars = ['contig', 'site_pos'])
    depths_df = depths_df.melt(var_name='sample', value_name='depth', id_vars = ['contig', 'site_pos'])
    freq_df = pd.merge(alts_df, depths_df, on=['sample', 'site_pos', 'contig'])
    freq_df['allele_frequency'] = freq_df['alt']/freq_df['depth']
    # ## Merge with subject_sample map
    freq_df = pd.merge(freq_df, subject_sample_metadata, on = 'sample')

    subject_freq_dfs.append(freq_df)

    sys.stderr.write("Done with subject %s!\n\n" % (subject))

# pd.concat raises on an empty list, so fall back to an empty frame.
if len(subject_freq_dfs) > 0:
    freq_df_final = pd.concat(subject_freq_dfs)
else:
    freq_df_final = pd.DataFrame()


In [None]:
# Write the combined per-subject SNV frequency table to the project's
# evolutionary_changes/ folder, creating the folder on first use.
evolution_folder = "%s%s" % (config.project_directory, "evolutionary_changes/")
if not os.path.exists(evolution_folder):
    # Create the directory if it doesn't exist
    os.makedirs(evolution_folder)
    print("Directory '{}' created successfully.".format(evolution_folder))
else:
    print("Directory '{}' already exists.".format(evolution_folder))
snv_frequencies_output = "%s%s" % (evolution_folder, "snv_frequencies.txt.bz2")
# Request bz2 compression explicitly: older pandas versions do not infer it from
# the file extension and would write plain text under a .bz2 name.
freq_df_final.to_csv(snv_frequencies_output, index = False, sep = ",", compression = "bz2")


# GENOMIC LOCI ANALYSIS

In [None]:
#1. Get genome length
# Read the species' SNP summary table and take the first distinct value of its
# 'genome_length' column as the genome length.
species = "Bacteroides_vulgatus_57955"
species_length_file_path = "%smerged_data/snps/%s/snps_summary.txt" % (config.project_folder, species)
snps_summary = pd.read_csv(species_length_file_path, sep = "\t")
species_length = snps_summary['genome_length'].unique()[0]

In [None]:
#2. Get SNV positions and contigs
# Select the rows of within_set_changes belonging to the current species once,
# then keep the contig and site position of each change.
is_current_species = within_set_changes['species'] == species
snv_contigs = within_set_changes.loc[is_current_species, 'contig']
snv_positions = within_set_changes.loc[is_current_species, 'site_pos']
snv_positions_df = pd.DataFrame({'contigs': snv_contigs, 'positions': snv_positions})

In [None]:
# Rug-style plot: one thin red vertical line per SNV position, on an x-axis
# spanning 0..species_length.
# NOTE(review): positions from all contigs share one axis; confirm that
# site_pos is comparable across contigs for this species.
positions = snv_positions_df['positions']

fig, ax = plt.subplots(figsize=(10, 1))
ax.vlines(positions, 0, 1, colors='red', linewidth=0.5)
ax.set_xlim(0, species_length)
ax.set_ylim(0, 1)
# The y-axis carries no information here, so hide it.
ax.get_yaxis().set_visible(False)
ax.set_title('{} SNV positions along the genome'.format(species))
ax.set_xlabel('Position')

In [None]:
# Save the SNV position figure as a PNG under the analysis directory.
out_path = "%sfigures/evolutionary_changes/%s_genomic_loci.png" % (config.analysis_directory,species)
# savefig does not create missing folders, so make sure the target exists first.
figure_folder = os.path.dirname(out_path)
if not os.path.exists(figure_folder):
    os.makedirs(figure_folder)
fig.savefig(out_path, dpi=300, bbox_inches='tight')  # Save as PNG file


In [None]:
# List the distinct species present in within_set_changes.
within_set_changes.species.unique()

# max number of differences

In [None]:
# Count the rows of snp_changes_df per (species, subject, sample pair) and list
# the pairs with the most rows first.
grouped_counts = (
    snp_changes_df
    .groupby(['species', 'subject', 'sample1', 'sample2'])
    .size()
    .reset_index(name='row_count')
    .sort_values(by = ['row_count'], ascending = False)
)


In [None]:
# Display grouped_counts renumbered from 0; the previous index is kept as an 'index' column.
grouped_counts.reset_index()

In [None]:
# Load the sample metadata lookup from parse_midas_data
# (presumably keyed by sample accession — see the lookup a few cells below).
metadata_map = parse_midas_data.parse_sample_metadata_map()


In [None]:
# Display the whole metadata map. NOTE(review): this prints every entry and may be a very large output.
metadata_map

In [None]:
# Spot-check the metadata record stored for a single sample accession.
metadata_map['SRR18585059']

# Individually per species

### Directories

In [None]:
#overall directories
# Note: metadata_directory is the full path of the metadata FILE, while
# gene_info_directory is a folder path (it ends with a slash).
metadata_directory = "%sshalon_metadata.txt" % (config.metadata_directory,)
gene_info_directory = "%sgene_descriptions/" % (config.project_folder,)

### Metadata

In [None]:
# Read the tab-separated metadata table; metadata_directory holds the path of the file itself.
metadata = pd.read_csv(metadata_directory, sep = "\t")#.rename(columns = {'Sample Name': 'sample_alias'})


### Data maps

In [None]:
# Subject -> samples lookup; the inner value is used via .keys() in the cells below.
subject_sample_map = parse_midas_data.parse_subject_sample_map()
# NOTE(review): the name says "metadata" but this loads the sample ORDER map —
# confirm whether parse_sample_metadata_map() was intended here.
sample_metadata_map = parse_midas_data.parse_sample_order_map()

### Load intersample changes

In [None]:
# Pick the species to analyse and load its intersample change map
# (indexed by sample-pair tuples in the cell that follows).
species = "Anaerostipes_hadrus_55206"
intersample_change_map = load_intersample_change_map(species)

### Isolate within-host changes

In [None]:
# Keep only the entries of intersample_change_map whose two samples come from
# the same subject. Entries are copied under the key they have in
# intersample_change_map, whichever order the pair is stored in.
intersample_change_map_within_host = {}

for subject in subject_sample_map:
    host_samples = subject_sample_map[subject].keys()
    within_host_sample_combos = list(itertools.combinations(host_samples, 2))

    for sample_combo in within_host_sample_combos:
        reversed_combo = (sample_combo[1], sample_combo[0])
        if sample_combo in intersample_change_map:
            intersample_change_map_within_host[sample_combo] = intersample_change_map[sample_combo]
        # combinations() yields each unordered pair once, so the reversed order
        # must be looked up in the change map itself (not in the combo list,
        # where it can never appear).
        elif reversed_combo in intersample_change_map:
            intersample_change_map_within_host[reversed_combo] = intersample_change_map[reversed_combo]
    

In [None]:
# Display the within-host subset of the intersample change map built in the previous cell.
intersample_change_map_within_host