In [1]:
import sys
sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/Diversity-Along-Gut/HumanizedMouse/scripts/")
import config

sys.path.insert(0, "/u/home/m/michaelw/project-ngarud/Diversity-Along-Gut/HumanizedMouse/scripts/helper_functions/")
from annotation import *

import os

import numpy as np
import pandas as pd

import scipy as sc

#PLOTTING
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('text', usetex=True)
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{amsmath}') 
import seaborn as sns
sns.set_style("whitegrid")

from matplotlib.lines import Line2D


# Directories

In [12]:
# Coverage value embedded in the single-sample input directory name and output file name.
cov = 10

# --- Inputs ---
single_sample_pi_dir = "{}SinglePi_MinCoverage{}/".format(config.data_directory, cov)
paired_pi_dir = "{}popgen_stats/PairedPi_SchloissnigPi.csv".format(config.data_directory)
good_species_path = "{}metadata/good_cvg_species_list.txt".format(config.data_directory)

# --- Outputs ---
# NOTE(review): paired_sample_pi_output resolves to the same file as paired_pi_dir,
# so saving the processed paired table overwrites its own input - confirm this is intended.
single_sample_pi_output = "{}popgen_stats/SinglePi_SchloissnigPi_cov{}.csv".format(config.data_directory, cov)
paired_sample_pi_output = "{}popgen_stats/PairedPi_SchloissnigPi.csv".format(config.data_directory)

In [3]:
# Read the header-less species list (one name per row) into a plain Python list.
good_species_table = pd.read_csv(good_species_path, names=['species'])
good_species = good_species_table['species'].tolist()

In [5]:
good_species

28

# Load and process data

## Loading single sample pi

In [6]:
# Build the single-sample pi table: one record per summary file in
# single_sample_pi_dir, assembled into a DataFrame in a single step.

# Output column -> column of the per-file summary CSV it is read from.
summary_column_map = {
    "genomewide_pi": "Genomewide_pi",
    "variable_sites": "Genomewide_pi_variable_sites",
    "mean_depth": "Mean_depth",
    "total_loci": "n_total_loci",
}

single_pi_records = []
for filename in os.listdir(single_sample_pi_dir):
    # Files with "Loci_Stats" in the name are not summary files; skip them.
    if "Loci_Stats" in filename:
        continue

    # Species = 2nd through 4th underscore-delimited tokens of the file name.
    name_tokens = filename.split("_")
    species = "_".join(name_tokens[1:4])

    # Sample = text between the "SampleID1_" marker and the first "_Pi".
    sample_start = filename.find("SampleID1_") + len("SampleID1_")
    sample_end = filename.find("_Pi")
    sample = filename[sample_start:sample_end]

    summary_stats = pd.read_csv(single_sample_pi_dir + filename, index_col=0)

    # Only the first row of each summary column is used.
    record = {"species": species, "sample": sample}
    for output_column, source_column in summary_column_map.items():
        record[output_column] = summary_stats[source_column].values[0]
    single_pi_records.append(record)

single_pi_df = pd.DataFrame(
    single_pi_records,
    columns=["species", "sample", "genomewide_pi", "variable_sites", "mean_depth", "total_loci"],
)


In [8]:
single_pi_df

Unnamed: 0,species,sample,genomewide_pi,variable_sites,mean_depth,total_loci
0,Anaerostipes_hadrus_55206,M2IC_CKDN220050977-1A_H7MMHDSX5_L2,0.002988,0.445252,10.737396,5514
1,Alistipes_putredinis_61533,M1CeC_CKDN220050952-1A_H7MMHDSX5_L3,0.000000,,10.921232,10715
2,Anaerostipes_hadrus_55206,M1JC_CKDN220050970-1A_H7MMHDSX5_L2,0.000374,0.371747,10.902700,10925
3,Alistipes_putredinis_61533,M8CeCGG,0.000828,0.211778,19.523531,23288
4,Adlercreutzia_equolifaciens_60310,M1DC_CKDN220050964-1A_H7MMHDSX5_L2,0.004410,0.388293,11.143523,33019
...,...,...,...,...,...,...
864,Sutterella_wadsworthensis_56828,M8ColonGG_2,0.002183,0.174962,46.357149,1887242
865,Parabacteroides_distasonis_56985,M6CoG_CKDN220050963-1A_H7MMHDSX5_L2,0.000070,0.111692,65.913556,2745359
866,Parabacteroides_distasonis_56985,M2JC_CKDN220050971-1A_H7MMHDSX5_L2,0.000079,0.240403,17.277184,2679474
867,Sutterella_wadsworthensis_56828,M8CeCGG,0.003398,0.168227,63.212309,2073803


## Processing single sample pi

In [9]:
# Derive per-sample annotation columns from the sample name using the helpers
# imported from the annotation module (new column -> helper applied to 'sample').
sample_annotators = {
    'mouse': extract_mouse_number,
    'cage': extract_cage,
    'diet': extract_diet,
    'gut_site': extract_gut_site,
    'gut_region': extract_region,
}
for column_name, annotate in sample_annotators.items():
    single_pi_df[column_name] = single_pi_df['sample'].apply(annotate)

# Flag rows whose species appears in the good-coverage species list.
single_pi_df["good_species"] = [name in good_species for name in single_pi_df["species"]]

In [10]:
single_pi_df

Unnamed: 0,species,sample,genomewide_pi,variable_sites,mean_depth,total_loci,mouse,cage,diet,gut_site,gut_region,good_species
0,Anaerostipes_hadrus_55206,M2IC_CKDN220050977-1A_H7MMHDSX5_L2,0.002988,0.445252,10.737396,5514,2,Cage 1,Control diet,Ileum,Upper gut,True
1,Alistipes_putredinis_61533,M1CeC_CKDN220050952-1A_H7MMHDSX5_L3,0.000000,,10.921232,10715,1,Cage 1,Control diet,Cecum,Lower gut,False
2,Anaerostipes_hadrus_55206,M1JC_CKDN220050970-1A_H7MMHDSX5_L2,0.000374,0.371747,10.902700,10925,1,Cage 1,Control diet,Jejunum,Upper gut,True
3,Alistipes_putredinis_61533,M8CeCGG,0.000828,0.211778,19.523531,23288,8,Cage 3,Guar gum diet,Cecum,Lower gut,False
4,Adlercreutzia_equolifaciens_60310,M1DC_CKDN220050964-1A_H7MMHDSX5_L2,0.004410,0.388293,11.143523,33019,1,Cage 1,Control diet,Duodenum,Upper gut,True
...,...,...,...,...,...,...,...,...,...,...,...,...
864,Sutterella_wadsworthensis_56828,M8ColonGG_2,0.002183,0.174962,46.357149,1887242,8,Cage 3,Guar gum diet,Colon,Lower gut,True
865,Parabacteroides_distasonis_56985,M6CoG_CKDN220050963-1A_H7MMHDSX5_L2,0.000070,0.111692,65.913556,2745359,6,Cage 3,Guar gum diet,Colon,Lower gut,True
866,Parabacteroides_distasonis_56985,M2JC_CKDN220050971-1A_H7MMHDSX5_L2,0.000079,0.240403,17.277184,2679474,2,Cage 1,Control diet,Jejunum,Upper gut,True
867,Sutterella_wadsworthensis_56828,M8CeCGG,0.003398,0.168227,63.212309,2073803,8,Cage 3,Guar gum diet,Cecum,Lower gut,True


## Saving

In [13]:
single_sample_pi_output

'/u/project/ngarud/Garud_lab/HumanizedMouse/HumanizedMouse_Batch2/popgen_stats/SinglePi_SchloissnigPi_cov10.csv'

In [14]:
# Write the annotated single-sample table to CSV. This runs before the
# good-species filter below, so the file contains every species.
single_pi_df.to_csv(single_sample_pi_output)

#### Filtering to include only good species




In [None]:
# Keep only the rows flagged as good-coverage species.
is_good_species = single_pi_df["good_species"]
single_pi_df = single_pi_df[is_good_species]

In [None]:
single_pi_df

In [None]:
# Mean of each numeric column per (species, sample).
# numeric_only=True is required: the frame also carries string annotation
# columns (cage, diet, gut_site, ...), and pandas >= 2.0 raises a TypeError
# when asked to average them instead of silently dropping them.
species_pi = single_pi_df.groupby(["species", "sample"]).mean(numeric_only=True)

# Unique species in index order (deterministic, unlike iterating over a set).
species_list = list(species_pi.index.get_level_values("species").unique())

# Species whose highest genomewide pi is observed in the inoculum sample.
species_with_inoculum_maximum = []
for species in species_list:
    max_sample = species_pi.loc[species]['genomewide_pi'].idxmax()
    if max_sample == "TL1gDNAshort":
        species_with_inoculum_maximum.append(species)



In [None]:
# For every species, express each sample's genomewide pi as a fraction of that
# species' pi in the inoculum sample ("TL1gDNAshort").
species_percent_frames = []
species_with_inoculum = []

for species in species_list:
    species_chunk = species_pi.loc[species]

    # Species never observed in the inoculum have no reference value; skip them.
    if 'TL1gDNAshort' not in species_chunk.index:
        continue

    # Scalar reference value. Dividing by the whole inoculum row (a Series indexed
    # by column names) would align on mismatched labels and yield only NaN.
    inoculum_pi_value = species_chunk.loc['TL1gDNAshort', 'genomewide_pi']

    # Work on a copy of this species' rows so species_pi itself is not modified.
    species_percent_inoculum_pi = species_chunk.copy()
    species_percent_inoculum_pi['genomewide_pi'] = (
        species_percent_inoculum_pi['genomewide_pi'] / inoculum_pi_value
    )

    species_percent_frames.append(species_percent_inoculum_pi)
    species_with_inoculum.append(species)

if species_percent_frames:
    # keys= keeps the species as the outer index level so rows stay identifiable.
    inoculum_percent_pi_df = pd.concat(
        species_percent_frames, keys=species_with_inoculum, names=["species", "sample"]
    )
else:
    inoculum_percent_pi_df = pd.DataFrame(columns=species_pi.columns)

    

In [None]:
inoculum_percent_pi_df

In [None]:
species_pi.loc["Sutterella_wadsworthensis_56828"].drop(index = 'TL1gDNAshort')

#### Calculations

In [None]:
# Mean genomewide pi per species over all non-inoculum samples, highest first.
non_inoculum_samples = single_pi_df[single_pi_df['sample'] != "TL1gDNAshort"]
species_mean_pi = non_inoculum_samples.groupby("species")["genomewide_pi"].mean().to_frame()
species_mean_pi.sort_values("genomewide_pi", ascending=False)




# Paired sample pi

## Loading paired sample pi

In [None]:
paired_pi_df = pd.read_csv(paired_pi_dir)

# Discard leftover index columns written by earlier save/load round trips.
# errors="ignore" lets the cell run whether or not all three are present.
paired_pi_df = paired_pi_df.drop(
    columns=["Unnamed: 0", "level_0", "Unnamed: 0.1"], errors="ignore"
)

# drop=True: the old positional index carries no information. Without it pandas
# inserts the old index as a new column ("level_0" when an "index" column already
# exists), which gets saved with the table and makes later reset_index() calls fail.
paired_pi_df = paired_pi_df.reset_index(drop=True)

## Processing paired sample pi

In [None]:
INOCULUM_SAMPLE = "TL1gDNAshort"


def _mouse_from_sample(sample):
    """Return the character at position 1 of the sample name, or "Inoculum" for the inoculum sample."""
    return "Inoculum" if sample == INOCULUM_SAMPLE else sample[1:2]


def _cage_from_mouse(mouse):
    """Map a mouse id string to its cage label; anything unrecognised is "Inoculum"."""
    if mouse in ("1", "2", "3"):
        return "Cage 1"
    if mouse in ("4", "5"):
        return "Cage 2"
    if mouse in ("6", "7", "8"):
        return "Cage 3"
    return "Inoculum"


def _gut_site_from_sample(sample):
    """Decode the gut-site code that starts at position 2 of the sample name."""
    one_letter_sites = {"D": "Duodenum", "J": "Jejunum", "I": "Ileum"}
    two_letter_sites = {"Ce": "Cecum", "Co": "Colon"}
    if sample[2:3] in one_letter_sites:
        return one_letter_sites[sample[2:3]]
    return two_letter_sites.get(sample[2:4], "Inoculum")


def _region_from_gut_site(gut_site):
    """Group gut sites into "Upper gut" / "Lower gut"; anything else is "Inoculum"."""
    if gut_site in ("Duodenum", "Ileum", "Jejunum"):
        return "Upper gut"
    if gut_site in ("Cecum", "Colon"):
        return "Lower gut"
    return "Inoculum"


def _host_orientation(mouse_1, mouse_2):
    """Classify a pair as within host, involving the inoculum, or between hosts."""
    if mouse_1 == mouse_2:
        return "Within host"
    if mouse_1 == "Inoculum" or mouse_2 == "Inoculum":
        return "Between inoculum"
    return "Between host"


def _cage_orientation(mouse_1, mouse_2, cage_1, cage_2):
    """Classify a pair as involving the inoculum, between cages, or within a cage."""
    if mouse_1 == "Inoculum" or mouse_2 == "Inoculum":
        return "Between inoculum"
    return "Between cage" if cage_1 != cage_2 else "Within cage"


def _statistic_from_filename(filename):
    """Return "pi" or "Fst" from the last underscore-delimited token of the file name, else "Exclude"."""
    suffix = filename.split("_")[-1]
    if suffix == "Pi.csv":
        return "pi"
    if suffix == "Fst.csv":
        return "Fst"
    return "Exclude"


# Drop rows whose source file name contains "OverallStats_NA".
is_real_pair = paired_pi_df['index'].map(lambda filename: "OverallStats_NA" not in filename).astype(bool)
paired_pi_df = paired_pi_df[is_real_pair].copy()

# Species id = first three underscore-delimited tokens.
paired_pi_df['species'] = paired_pi_df['species'].map(lambda species: "_".join(species.split("_")[:3]))

# Put each pair in alphabetical order BEFORE deriving the per-side annotations,
# so that mouse_1 / cage_1 / gut_site_1 / region_1 always describe Sample1 (and
# likewise for side 2). Sorting afterwards would swap the sample names on some
# rows while leaving the annotation columns in their old positions.
ordered_pairs = [sorted(pair) for pair in zip(paired_pi_df['Sample1'], paired_pi_df['Sample2'])]
paired_pi_df['Sample1'] = [pair[0] for pair in ordered_pairs]
paired_pi_df['Sample2'] = [pair[1] for pair in ordered_pairs]

# Per-side annotations (one loop per attribute keeps the original column order).
sides = ("1", "2")
for side in sides:
    paired_pi_df['mouse_' + side] = paired_pi_df['Sample' + side].map(_mouse_from_sample)
for side in sides:
    paired_pi_df['cage_' + side] = paired_pi_df['mouse_' + side].map(_cage_from_mouse)
for side in sides:
    paired_pi_df['gut_site_' + side] = paired_pi_df['Sample' + side].map(_gut_site_from_sample)
for side in sides:
    paired_pi_df['region_' + side] = paired_pi_df['gut_site_' + side].map(_region_from_gut_site)

# Pair-level annotations.
paired_pi_df['orientation'] = [
    _host_orientation(mouse_1, mouse_2)
    for mouse_1, mouse_2 in zip(paired_pi_df['mouse_1'], paired_pi_df['mouse_2'])
]
paired_pi_df['cage_orientation'] = [
    _cage_orientation(mouse_1, mouse_2, cage_1, cage_2)
    for mouse_1, mouse_2, cage_1, cage_2 in zip(
        paired_pi_df['mouse_1'], paired_pi_df['mouse_2'],
        paired_pi_df['cage_1'], paired_pi_df['cage_2'],
    )
]

# Keep only the pi rows, then one row per (species, unordered sample pair).
paired_pi_df['pi_or_fst'] = paired_pi_df['index'].map(_statistic_from_filename)
paired_pi_df = paired_pi_df[paired_pi_df['pi_or_fst'] == "pi"]
paired_pi_df = paired_pi_df.drop_duplicates(['species', 'Sample1', 'Sample2']).copy()

# Flag rows whose species appears in the good-coverage species list.
paired_pi_df['good_species'] = [species in good_species for species in paired_pi_df['species']]


#### Saving

In [None]:
# Write the processed paired table (all species) to CSV.
# NOTE(review): paired_sample_pi_output is built from the same path string as
# paired_pi_dir, so this overwrites the file the table was loaded from - confirm intended.
paired_pi_df.to_csv(paired_sample_pi_output)

In [None]:
paired_sample_pi_output

#### Filtering for good species

In [None]:
# Keep only the pairs whose species is in the good-coverage species list.
species_is_good = [name in good_species for name in paired_pi_df['species']]
paired_pi_df = paired_pi_df[species_is_good]

#### Calculations

In [None]:
# Mean paired pi per species over between-host pairs, lowest first.
between_host_pairs = paired_pi_df[paired_pi_df['orientation'] == "Between host"]
between_host_mean_pi = between_host_pairs.groupby("species")['pi_vec'].mean().to_frame()
between_host_mean_pi.sort_values("pi_vec")


In [None]:
# Mean paired pi per species over within-host pairs, lowest first.
within_host_pairs = paired_pi_df[paired_pi_df['orientation'] == "Within host"]
within_host_mean_pi = within_host_pairs.groupby("species")['pi_vec'].mean().to_frame()
within_host_mean_pi.sort_values("pi_vec")


# Correlations

### Single sample pi

In [None]:
# Per-species mean genomewide pi across all non-inoculum samples,
# returned as a two-column frame (species, genomewide_pi).
non_inoculum_rows = single_pi_df[single_pi_df['sample'] != "TL1gDNAshort"]
mouse_average = non_inoculum_rows.groupby('species')['genomewide_pi'].mean().reset_index()



In [None]:
# Genomewide pi of each species in the inoculum sample (species, genomewide_pi).
is_inoculum_sample = single_pi_df['sample'] == "TL1gDNAshort"
inoculum = single_pi_df.loc[is_inoculum_sample, ['species', 'genomewide_pi']].reset_index(drop=True)


In [None]:
# Left-join the inoculum pi onto the per-species mouse average.
# After the merge, genomewide_pi_x is the mouse average and genomewide_pi_y the inoculum.
merged_df = pd.merge(mouse_average, inoculum, on='species', how='left')

# Split at inoculum pi = 0.001; rows with a missing inoculum value fall in neither subset.
inoculum_pi_values = merged_df['genomewide_pi_y']
merged_df_multi_strain = merged_df[inoculum_pi_values >= 0.001]
merged_df_single_strain = merged_df[inoculum_pi_values < 0.001]

In [None]:
sc.stats.spearmanr(merged_df['genomewide_pi_x'], merged_df['genomewide_pi_y'], nan_policy = "omit")

In [None]:
sc.stats.spearmanr(merged_df_multi_strain['genomewide_pi_x'], merged_df_multi_strain['genomewide_pi_y'], nan_policy = "omit")


In [None]:
sc.stats.spearmanr(merged_df_single_strain['genomewide_pi_x'], merged_df_single_strain['genomewide_pi_y'], nan_policy = "omit")



#### Not grouped

In [None]:
# Same comparison without averaging: every non-inoculum sample row is paired
# with its species' inoculum pi (genomewide_pi_x = sample, genomewide_pi_y = inoculum).
is_mouse_sample = single_pi_df['sample'] != "TL1gDNAshort"
mouse_single = pd.DataFrame(single_pi_df[is_mouse_sample])
merged_df = pd.merge(mouse_single, inoculum, on='species', how='left')

# Split at inoculum pi = 0.001; rows with a missing inoculum value fall in neither subset.
inoculum_pi_values = merged_df['genomewide_pi_y']
merged_df_multi_strain = merged_df[inoculum_pi_values >= 0.001]
merged_df_single_strain = merged_df[inoculum_pi_values < 0.001]


In [None]:
sc.stats.spearmanr(merged_df['genomewide_pi_x'], merged_df['genomewide_pi_y'], nan_policy = "omit")

In [None]:
sc.stats.spearmanr(merged_df_multi_strain['genomewide_pi_x'], merged_df_multi_strain['genomewide_pi_y'], nan_policy = "omit")


In [None]:
sc.stats.spearmanr(merged_df_single_strain['genomewide_pi_x'], merged_df_single_strain['genomewide_pi_y'], nan_policy = "omit")



In [None]:
- merged_df_single_strain['genomewide_pi_y']

### paired pi

In [None]:
# Per-species mean paired pi over pairs whose Sample2 is not the inoculum and
# whose orientation is anything other than "Between host".
second_is_not_inoculum = paired_pi_df['Sample2'] != "TL1gDNAshort"
not_between_host = paired_pi_df['orientation'] != "Between host"
selected_pairs = paired_pi_df[second_is_not_inoculum & not_between_host]
mouse_average = selected_pairs.groupby('species')['pi_vec'].mean().reset_index()


In [None]:
# Left-join inoculum pi onto the per-species paired average. The two frames share
# only the 'species' key, so the merged columns keep their names (pi_vec, genomewide_pi).
merged_df = pd.merge(mouse_average, inoculum, on='species', how='left')

# Split at inoculum pi = 0.001; rows with a missing inoculum value fall in neither subset.
inoculum_pi_values = merged_df['genomewide_pi']
merged_df_multi_strain = merged_df[inoculum_pi_values >= 0.001]
merged_df_single_strain = merged_df[inoculum_pi_values < 0.001]

In [None]:
sc.stats.spearmanr(merged_df['pi_vec'], merged_df['genomewide_pi'], nan_policy = "omit")

In [None]:
sc.stats.spearmanr(merged_df_multi_strain['pi_vec'], merged_df_multi_strain['genomewide_pi'], nan_policy = "omit")


In [None]:
sc.stats.spearmanr(merged_df_single_strain['pi_vec'], merged_df_single_strain['genomewide_pi'], nan_policy = "omit")



In [None]:
# Rebuild the paired-average vs. inoculum table and its multi-/single-strain split.
# mouse_average here holds 'pi_vec' (paired pi), so the merge adds no _x/_y suffixes:
# the inoculum column is 'genomewide_pi', not 'genomewide_pi_y' (which would raise
# an AttributeError).
merged_df = mouse_average.merge(inoculum, on = 'species', how = 'left')
merged_df_multi_strain = merged_df[merged_df['genomewide_pi'] >= 0.001]
merged_df_single_strain = merged_df[merged_df['genomewide_pi'] < 0.001]

# Plotting

## Intersample pi vs. inoculum pi

### 1. Create inoculum single pi dataframe

In [None]:
# Inoculum rows of the single-sample table, re-indexed from 0, with the pi
# column renamed to 'pi_vec' to match the paired tables.
inoculum_rows = single_pi_df[single_pi_df['gut_site'] == "Inoculum"]
inoculum_pi = inoculum_rows.reset_index(drop=True).rename(columns={"genomewide_pi": "pi_vec"})




### 2. Create between host paired pi df

In [None]:
# Between-host pairs, re-indexed from 0.
# reset_index(drop=True) discards the old row labels directly. paired_pi_df already
# has a column named 'index' (the source file names), so reset_index() followed by
# drop(['index']) would remove that data column and leave the old labels behind as
# 'level_0' - or raise if a 'level_0' column is already present.
paired_pi_betweenhost = paired_pi_df[paired_pi_df['orientation'] == 'Between host'].reset_index(drop=True)


### 3. Create within host paired pi df

In [None]:
# Within-host pairs, re-indexed from 0.
# reset_index(drop=True) discards the old row labels directly. paired_pi_df already
# has a column named 'index' (the source file names), so reset_index() followed by
# drop(['index']) would remove that data column and leave the old labels behind as
# 'level_0' - or raise if a 'level_0' column is already present.
paired_pi_withinhost = paired_pi_df[paired_pi_df['orientation'] == 'Within host'].reset_index(drop=True)


### 4. Create within sample pi df

In [None]:
# All non-inoculum single-sample rows (one row per sample, not a per-species
# median), re-indexed from 0, with the pi column renamed to 'pi_vec'.
non_inoculum_rows = single_pi_df[single_pi_df['gut_site'] != "Inoculum"]
single_pi = non_inoculum_rows.reset_index(drop=True).rename(columns={"genomewide_pi": "pi_vec"})



### 5. Merging datasets

In [None]:
def _merge_with_inoculum(other_pi, other_label):
    """Inner-join the inoculum pi with another pi table on species.

    Returns a frame with columns 'species', 'Inoculum pi' and `other_label`.
    """
    merged = inoculum_pi[['species', 'pi_vec']].merge(other_pi[['species', 'pi_vec']], on='species')
    return merged.rename(columns={"pi_vec_x": "Inoculum pi", "pi_vec_y": other_label})


# One table per comparison against the inoculum.
between_df = _merge_with_inoculum(paired_pi_betweenhost, "Intersample pi, between host")
within_df = _merge_with_inoculum(paired_pi_withinhost, "Intersample pi, within host")
single_df = _merge_with_inoculum(single_pi, "Intrasample pi")



In [None]:
# Shared axis limit for the scatter plots: 10% above the largest pi value in any table.
# np.nanmax ignores missing values; Python's built-in max() gives an order-dependent
# result (possibly NaN) when a NaN is present, which would break set_xlim/set_ylim.
pi_columns = [
    inoculum_pi['pi_vec'],
    between_df['Intersample pi, between host'],
    within_df['Intersample pi, within host'],
    single_df['Intrasample pi'],
]
all_pi_values = np.concatenate([column.to_numpy(dtype=float) for column in pi_columns])
axis_max = np.nanmax(all_pi_values) * 1.1


## Plot 1: Between mice

In [None]:
# Inoculum pi (x) vs. between-host intersample pi (y), full axis range.
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = between_df, x = "Inoculum pi", y = "Intersample pi, between host", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, between host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,axis_max)
ax.set_ylim(0,axis_max)


In [None]:
inoculum_pi[(inoculum_pi['pi_vec'] > 0.012)]

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_betweenhost.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

In [None]:
# Inoculum pi (x) vs. between-host intersample pi (y), zoomed to [0, 0.00125].
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = between_df, x = "Inoculum pi", y = "Intersample pi, between host", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, between host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,0.00125)
ax.set_ylim(0,0.00125)

In [None]:
inoculum_pi[(inoculum_pi['pi_vec'] > 0.001) & (inoculum_pi['pi_vec'] < 0.0011)]

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_betweenhost_ZOOMED.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

## Plot 2: Intersample, within mice

In [None]:
# Inoculum pi (x) vs. within-host intersample pi (y), full axis range.
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = within_df, x = "Inoculum pi", y = "Intersample pi, within host", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, within host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,axis_max)
ax.set_ylim(0,axis_max)

In [None]:
inoculum_pi[(inoculum_pi['pi_vec'] > 0.001) & (inoculum_pi['pi_vec'] < 0.0012)]

In [None]:
within_df[within_df['Intersample pi, within host'] > 0.008]

In [None]:
paired_pi_withinhost[paired_pi_withinhost['pi_vec'] == 0.00908]

In [None]:
paired_pi_withinhost[paired_pi_withinhost.pi_vec > 0.009]

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_withinhost.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

In [None]:
# Inoculum pi (x) vs. within-host intersample pi (y), zoomed to [0, 0.00125].
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = within_df, x = "Inoculum pi", y = "Intersample pi, within host", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, within host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,0.00125)
ax.set_ylim(0,0.00125)

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_withinhost_ZOOMED.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

## Plot 3: intrasample pi

In [None]:
# Inoculum pi (x) vs. intrasample pi (y), full axis range.
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = single_df, x = "Inoculum pi", y = "Intrasample pi", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intrasample $\pi$", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,axis_max)
ax.set_ylim(0,axis_max)

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_intrasample.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

In [None]:
# Inoculum pi (x) vs. intrasample pi (y), zoomed to [0, 0.00125].
fig, ax = plt.subplots(figsize=(10, 8))

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = single_df, x = "Inoculum pi", y = "Intrasample pi", ax = ax)

# Raw strings keep the LaTeX backslash literal ("\p" is an invalid escape in a normal string).
ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intrasample $\pi$", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Dashed red x = y reference line.
x_vals = np.arange(0,1,0.001)
ax.plot(x_vals, x_vals, 'r--')

ax.set_xlim(0,0.00125)
ax.set_ylim(0,0.00125)

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_intrasample_ZOOMED.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

# Bimodal plot

In [None]:
# Histogram (100 bins, with KDE overlay) of inoculum pi across species.
fig, ax = plt.subplots(figsize=(10, 8))

# Name the series "pi" so that is what seaborn uses as the x-axis label.
inoculum_pi_values = inoculum_pi["pi_vec"].rename("pi")
sns.histplot(inoculum_pi_values, ax=ax, kde=True, bins = 100)

# Recolour every bar whose left edge lies above the threshold.
threshold = 0.001
bars_above_threshold = [bar for bar in ax.patches if bar.get_x() > threshold]
for bar in bars_above_threshold:
    bar.set_color('red')

In [None]:
# Inoculum pi (x) vs. between-host intersample pi (y), coloured by whether the
# inoculum pi is at least 1e-3.
fig, ax = plt.subplots(figsize=(10, 8))

# Hand-built legend entries: two colour classes plus the threshold line.
# Raw strings keep the LaTeX backslashes literal ("\p" and "\g" are invalid
# escapes in a normal string).
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label=r'Inoculum $\pi \ge 10^{-3}$', markerfacecolor='red', markersize=15),
    Line2D([0], [0], marker='o', color='w', label=r'Inoculum $\pi < 10^{-3}$', markerfacecolor='blue', markersize=15),
    Line2D([0], [0], color='black', linestyle='--', label=r'Inoculum $\pi = 10^{-3}$'),
]

# Draw on this cell's axes explicitly rather than on whichever axes is current.
sns.scatterplot(data = between_df, x = "Inoculum pi", y = "Intersample pi, between host",
                hue=(between_df["Inoculum pi"] >= 0.001), palette={True: 'red', False: 'blue'}, ax = ax)

ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, between host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Vertical dashed line at the inoculum pi = 1e-3 threshold.
ax.axvline(0.001, color='black', linestyle='--')

# Replace seaborn's automatic hue legend with the hand-built entries.
ax.legend(handles=legend_elements, fontsize = 20)

ax.set_xlim(0,axis_max)
ax.set_ylim(0,axis_max)


In [None]:
inoculum_pi[(inoculum_pi['pi_vec'] > 0.002) & (inoculum_pi['pi_vec'] < 0.003)]

In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_betweenhost_colored.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")

### Label ovatus

In [None]:
# Between-host intersample pi vs. inoculum pi for species with inoculum pi >= 1e-3,
# highlighting Bacteroides ovatus in red.
fig, ax = plt.subplots(figsize=(10, 8))

# Hand-built legend entries. Raw strings keep the LaTeX backslashes literal
# ("\e" and "\p" are invalid escapes in a normal string).
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label=r'$\emph{Bacteroides ovatus}$', markerfacecolor='red', markersize=15),
    Line2D([0], [0], color='black', linestyle='--', label=r'Inoculum $\pi = 10^{-3}$'),
]

# Filter once and derive the hue from the SAME filtered frame, so the plotted
# rows and their colours share one index instead of relying on index alignment
# between a filtered frame and an unfiltered boolean series.
multi_strain_df = between_df[between_df["Inoculum pi"] >= 0.001]
is_b_ovatus = multi_strain_df["species"] == "Bacteroides_ovatus_58035"

sns.scatterplot(data = multi_strain_df, x = "Inoculum pi", y = "Intersample pi, between host",
                hue=is_b_ovatus, palette={True: 'red', False: 'blue'}, ax = ax)

ax.set_xlabel(r"Inoculum $\pi$", fontsize=20)
ax.set_ylabel(r"Intersample $\pi$, between host", fontsize=20)
ax.tick_params(axis='both', which='both', labelsize=20)

# Vertical dashed line at the inoculum pi = 1e-3 threshold.
ax.axvline(0.001, color='black', linestyle='--')

# Replace seaborn's automatic hue legend with the hand-built entries.
ax.legend(handles=legend_elements, fontsize = 20)

ax.set_xlim(0,axis_max)
ax.set_ylim(0,axis_max)


In [None]:
# Save the current figure as a 300-dpi PNG in the popgen_stats figure folder.
file_name = "pi_scatter_betweenhost_Bovatus.png"
path = "{}popgen_stats/{}".format(config.figure_directory, file_name)
fig.savefig(path, dpi=300, bbox_inches="tight")