In [35]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import statsmodels
import statsmodels.api as sm
from statsmodels.multivariate.manova import MANOVA
from scipy import stats

from useful_functions import *

In [36]:
# Load the soil-microbe, sample, and plant-DNA tables into one frame.
# import_all_data comes from useful_functions (star import); presumably it joins
# the listed tables on `common_column` — verify against that module.
common_column = "sample_id"
tables_to_join = ["tSoilMicrobeGenera", "tSample", "tPlantDNA"]
full_df = import_all_data(tables_to_join, common_column)
full_df.shape

(394056, 15)

In [37]:
# Remove columns whose contents repeat an earlier column: transpose so that
# drop_duplicates() compares columns as rows, then transpose back.
# full_df itself is left untouched because we work on a copy.
df = full_df.copy().T.drop_duplicates().T
df.shape

(394056, 13)

In [38]:
# Reshape long -> wide: one row per sample (identified by its metadata),
# one column per genus, cell values are relative abundances.
sample_keys = ['sample_id', 'site_id', 'species', 'V2', 'collect_date']
pivoted_df = df.pivot(
    index=sample_keys,
    columns='genera',
    values='relative_abundance',
)
pivoted_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,genera,Archaea|Candidatus Korarchaeota||||Candidatus Korarchaeum,Archaea|Crenarchaeota|Thermoprotei|Acidilobales|Acidilobaceae|Acidilobus,Archaea|Crenarchaeota|Thermoprotei|Acidilobales|Caldisphaeraceae|Caldisphaera,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Aeropyrum,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Desulfurococcus,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Ignicoccus,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Staphylothermus,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Thermogladius,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Desulfurococcaceae|Thermosphaera,Archaea|Crenarchaeota|Thermoprotei|Desulfurococcales|Pyrodictiaceae|Hyperthermus,...,Viruses||||Geminiviridae|Begomovirus,Viruses||||Inoviridae|Inovirus,Viruses||||Iridoviridae|Iridovirus,Viruses||||Microviridae|Phix174microvirus,Viruses||||Mimiviridae|Hokovirus,Viruses||||Mimiviridae|Klosneuvirus,Viruses||||Mimiviridae|Tupanvirus,Viruses||||Partitiviridae|Alphapartitivirus,Viruses||||Polydnaviridae|Bracovirus,Viruses|||||Pandoravirus
sample_id,site_id,species,V2,collect_date,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
CMB10,CMB,syriaca,0.999777121,6/29/2021,0.0,0.0,0.0,0.00068,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMB11,CMB,syriaca,0.999999983,6/29/2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMB13,CMB,syriaca,0.989130596,6/29/2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMB15,CMB,syriaca,0.998547653,6/29/2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMB16,CMB,syriaca,0.999243483,6/29/2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00308,0.0,0.0,0.0,0.0


In [39]:
# (number of samples, number of genera columns) after pivoting
pivoted_df.shape

(156, 2526)

In [50]:
# List the distinct collection sites.
# site_id is an index level until the reset_index() cell runs, so a plain
# pivoted_df['site_id'] raises KeyError when the notebook is run top to bottom.
# reset_index() returns a temporary frame (pivoted_df is not modified), which
# makes this lookup work both before and after that cell.
pivoted_df.reset_index()['site_id'].unique()

array(['CMB', 'FRW', 'HR', 'LFS', 'LM', 'MKP', 'MMP', 'PNR', 'PTW', 'RF',
       'RGT', 'RRL', 'SLG'], dtype=object)

In [40]:
# Total abundance per sample. This has to happen before reset_index(): while the
# sample metadata still lives in the index, sum(axis=1) only adds genera columns.
row_sum = pivoted_df.sum(axis=1)
# Turn the index levels (sample metadata) back into ordinary columns.
# NOTE(review): re-running this cell resets the index a second time — run it once per fresh pivot.
pivoted_df = pivoted_df.reset_index()

In [43]:
# Every column that is not sample metadata is a genus abundance column.
metadata_cols = ['sample_id','site_id', 'species', 'V2', 'collect_date']
genera_col = pivoted_df.columns.drop(metadata_cols)

# Independent float-typed copy holding only the abundance values.
sub_pivot = pivoted_df.loc[:, genera_col].astype(float).copy()

In [44]:
# Make every row (sample) sum to 1.
# One vectorized division by the per-row totals replaces the former row-by-row
# loop, which looked up `row_sum[row]` with integer keys even though `row_sum`
# was labelled by the sample MultiIndex (positional fallback, deprecated in pandas).
genera_totals = sub_pivot.sum(axis=1)
sub_pivot = sub_pivot.div(genera_totals, axis=0)

# sum of each row after normalization (kept for inspection)
new_sum = sub_pivot.sum(axis=1)

# recombine the sample metadata with the normalized ("oned") abundances
pivoted_df[genera_col] = sub_pivot

# sanity check: every sample should now total 1.0
row_sum = pivoted_df[genera_col].sum(axis=1)
print(row_sum)

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
151    1.0
152    1.0
153    1.0
154    1.0
155    1.0
Length: 156, dtype: float64


In [None]:
### PCA ALL DATA

In [9]:
# Work on a copy so the normalized pivot table stays untouched.
pc1_df = pivoted_df.copy()

In [11]:
# PCA on every genus column (no feature reduction, no scaling), colored by site.
X = pc1_df[genera_col]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Axis labels carry the share of variance explained by each component.
pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
x_label = f"PC1 {round(100*pc1_share, 1)}%"
y_label = f"PC2 {round(100*pc2_share, 1)}%"

fig = px.scatter(components, x=0, y=1, color=pc1_df['site_id'])
fig.update_layout(
    title="Soil Microbes by Site without Reduction",
    xaxis_title=x_label,
    yaxis_title=y_label,
)
fig.show()

In [12]:
### Keeping top 10 most abundant genera (matches nlargest(10) below)

In [58]:
# Fresh copy of the normalized pivot table for the reduced-genera PCA.
pc2_df = pivoted_df.copy()

In [59]:
# Rank genera by their total relative abundance across all samples and keep the
# most abundant ones.
# NOTE(review): the name `top_100` is historical — nlargest(10) keeps only 10 genera.
sum_col = pc2_df[genera_col].sum()
top_100 = sum_col.nlargest(10)

# Sample metadata first, then the selected genera.
metadata_columns = ['collect_date', 'species', 'site_id', 'sample_id', 'V2']
select_columns = metadata_columns + top_100.index.tolist()

# .copy() makes pc2_df an independent frame, so later column assignments
# (scaling, site relabelling) do not trigger SettingWithCopyWarning.
pc2_df = pc2_df[select_columns].copy()

In [60]:
# Preview: metadata columns followed by the selected genera
pc2_df.head()

genera,collect_date,species,site_id,sample_id,V2,Bacteria|Proteobacteria|Alphaproteobacteria|Rhizobiales|Bradyrhizobiaceae|Bradyrhizobium,Bacteria|Actinobacteria|Actinobacteria|Streptomycetales|Streptomycetaceae|Streptomyces,Bacteria|Cyanobacteria||Synechococcales|Synechococcaceae|Synechococcus,Bacteria|Proteobacteria|Gammaproteobacteria|Pseudomonadales|Pseudomonadaceae|Pseudomonas,Bacteria|Cyanobacteria||Synechococcales|Synechococcaceae|Thermosynechococcus,Viruses|||Caudovirales|Podoviridae|T7virus,Bacteria|Actinobacteria|Actinobacteria|Corynebacteriales|Mycobacteriaceae|Mycobacterium,Bacteria|Proteobacteria|Alphaproteobacteria|Sphingomonadales|Sphingomonadaceae|Sphingomonas,Bacteria|Actinobacteria|Actinobacteria|Propionibacteriales|Nocardioidaceae|Nocardioides,Bacteria|Proteobacteria|Gammaproteobacteria|Xanthomonadales|Xanthomonadaceae|Stenotrophomonas
0,6/29/2021,syriaca,CMB,CMB10,0.999777,0.093727,0.080597,0.00051,0.024729,0.0,0.0,0.031655,0.022974,0.011929,0.015269
1,6/29/2021,syriaca,CMB,CMB11,1.0,0.097675,0.082029,0.000599,0.033588,4.9e-05,0.0,0.023893,0.023239,0.009437,0.012375
2,6/29/2021,syriaca,CMB,CMB13,0.989131,0.122075,0.100412,0.000539,0.03124,0.0,0.0,0.027619,0.018817,0.009399,0.006745
3,6/29/2021,syriaca,CMB,CMB15,0.998548,0.149311,0.080603,0.0,0.067397,0.0,0.0,0.049229,0.026631,0.009515,0.00497
4,6/29/2021,syriaca,CMB,CMB16,0.999243,0.098761,0.092399,0.000494,0.049438,0.0,0.0,0.025583,0.018831,0.008655,0.008429


In [61]:
# Standardize the selected genera columns with StandardScaler before running PCA.
pc2_df[top_100.index] = StandardScaler().fit_transform(pc2_df[top_100.index])  

In [65]:
# Collapse individual collection sites into broad geographic regions.
site_to_region = {
    'LM': 'Wintergreen', 'PTW': 'Wintergreen', 'RGT': 'Wintergreen',
    'RRL': 'Wintergreen', 'SGC': 'Wintergreen',
    'CMB': 'Cole Mountain', 'FRW': 'Cole Mountain', 'LFS': 'Cole Mountain',
    'MMP': 'Cole Mountain', 'SLG': 'Cole Mountain',
    'MKP': 'Blacksburg', 'PNR': 'Blacksburg', 'RF': 'Blacksburg',
    'HR': 'HR',
}
region_names = set(site_to_region.values())

# Labels that are already region names are kept as-is, so re-running this cell
# (or a duplicate of it) no longer discards every non-HR sample.
site_ids = pc2_df['site_id']
regions = site_ids.where(site_ids.isin(region_names), site_ids.map(site_to_region))

# Drop samples from unmapped sites and assign the labels on a new frame instead
# of using a chained in-place replace, which does not update pc2_df under
# pandas Copy-on-Write.
pc2_df = pc2_df.loc[regions.notna()].assign(site_id=regions)

pc2_df['site_id'].unique()

In [66]:
# PCA of the standardized top genera, colored by the current site_id labels.
# NOTE(review): if the regrouping cell has already run, site_id holds broad
# regions rather than individual sites — confirm the intended cell order.
X = pc2_df[top_100.index]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Axis labels carry the share of variance explained by each component.
x_label, y_label = (
    f"PC{i} {round(100*ratio, 1)}%"
    for i, ratio in enumerate(pca.explained_variance_ratio_, start=1)
)

fig = px.scatter(components, x=0, y=1, color=pc2_df['site_id'])
fig.update_layout(
    # The title is built from the number of genera actually plotted (the old
    # hard-coded "100" did not match) and names the soil dataset loaded from
    # tSoilMicrobeGenera instead of "Leaf".
    title=f"{len(top_100)} Most Abundant Soil Microbe Genera by Site PCA",
    xaxis_title=x_label,
    yaxis_title=y_label,
)
fig.show()

In [21]:
# Collapse individual collection sites into broad geographic regions.
site_to_region = {
    'LM': 'Wintergreen', 'PTW': 'Wintergreen', 'RGT': 'Wintergreen',
    'RRL': 'Wintergreen', 'SGC': 'Wintergreen',
    'CMB': 'Cole Mountain', 'FRW': 'Cole Mountain', 'LFS': 'Cole Mountain',
    'MMP': 'Cole Mountain', 'SLG': 'Cole Mountain',
    'MKP': 'Blacksburg', 'PNR': 'Blacksburg', 'RF': 'Blacksburg',
    'HR': 'HR',
}
region_names = set(site_to_region.values())

# Labels that are already region names are kept as-is. Without this, running the
# regrouping a second time mapped only 'HR' and filtered out every other sample.
site_ids = pc2_df['site_id']
regions = site_ids.where(site_ids.isin(region_names), site_ids.map(site_to_region))

# Drop samples from unmapped sites and assign the labels on a new frame instead
# of using a chained in-place replace, which does not update pc2_df under
# pandas Copy-on-Write.
pc2_df = pc2_df.loc[regions.notna()].assign(site_id=regions)

pc2_df['site_id'].unique()

In [22]:
# PCA of the standardized top genera, colored by broad geographic region.
X = pc2_df[top_100.index]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Axis labels carry the share of variance explained by each component.
x_label, y_label = (
    f"PC{i} {round(100*ratio, 1)}%"
    for i, ratio in enumerate(pca.explained_variance_ratio_, start=1)
)

fig = px.scatter(components, x=0, y=1, color=pc2_df['site_id'])
fig.update_layout(
    # The title is built from the number of genera actually plotted (the old
    # hard-coded "100" did not match) and names the soil dataset loaded from
    # tSoilMicrobeGenera instead of "Leaf".
    title=f"{len(top_100)} Most Abundant Soil Microbe Genera by Broad Geographic Location PCA",
    xaxis_title=x_label,
    yaxis_title=y_label,
)
fig.show()

In [23]:
### Keeping top 100 most abundant genera

In [28]:
# Fresh copy of the normalized pivot table for the second reduced-genera PCA.
pc3_df = pivoted_df.copy()

In [29]:
# Rank genera by their total relative abundance across all samples and keep the
# most abundant ones.
# NOTE(review): the name `top_50` is historical — nlargest(100) keeps 100 genera.
sum_col = pc3_df[genera_col].sum()
top_50 = sum_col.nlargest(100)

# Sample metadata first, then the selected genera.
metadata_columns = ['species', 'site_id', 'sample_id', 'V2']
select_columns = metadata_columns + top_50.index.tolist()

# .copy() makes pc3_df an independent frame, so later column assignments
# (scaling, site relabelling) do not trigger SettingWithCopyWarning.
pc3_df = pc3_df[select_columns].copy()

In [30]:
# Standardize the selected genera columns with StandardScaler before running PCA.
pc3_df[top_50.index] = StandardScaler().fit_transform(pc3_df[top_50.index])  

In [31]:
# PCA of the standardized top genera, colored by collection site.
X = pc3_df[top_50.index]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Axis labels carry the share of variance explained by each component.
x_label, y_label = (
    f"PC{i} {round(100*ratio, 1)}%"
    for i, ratio in enumerate(pca.explained_variance_ratio_, start=1)
)

fig = px.scatter(components, x=0, y=1, color=pc3_df['site_id'])
fig.update_layout(
    # The title is built from the number of genera actually plotted (the old
    # hard-coded "50" did not match) and names the soil dataset loaded from
    # tSoilMicrobeGenera instead of "Leaf".
    title=f"{len(top_50)} Most Abundant Soil Microbe Genera by Site PCA",
    xaxis_title=x_label,
    yaxis_title=y_label,
)
fig.show()

In [32]:
# Collapse individual collection sites into broad geographic regions.
site_to_region = {
    'LM': 'Wintergreen', 'PTW': 'Wintergreen', 'RGT': 'Wintergreen',
    'RRL': 'Wintergreen', 'SGC': 'Wintergreen',
    'CMB': 'Cole Mountain', 'FRW': 'Cole Mountain', 'LFS': 'Cole Mountain',
    'MMP': 'Cole Mountain', 'SLG': 'Cole Mountain',
    'MKP': 'Blacksburg', 'PNR': 'Blacksburg', 'RF': 'Blacksburg',
    'HR': 'HR',
}
region_names = set(site_to_region.values())

# Labels that are already region names are kept as-is, so re-running this cell
# does not map only 'HR' and discard every other sample.
site_ids = pc3_df['site_id']
regions = site_ids.where(site_ids.isin(region_names), site_ids.map(site_to_region))

# Drop samples from unmapped sites and assign the labels on a new frame instead
# of using a chained in-place replace, which does not update pc3_df under
# pandas Copy-on-Write.
pc3_df = pc3_df.loc[regions.notna()].assign(site_id=regions)

pc3_df['site_id'].unique()

In [33]:
# PCA of the standardized top genera, colored by broad geographic region.
X = pc3_df[top_50.index]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Axis labels carry the share of variance explained by each component.
x_label, y_label = (
    f"PC{i} {round(100*ratio, 1)}%"
    for i, ratio in enumerate(pca.explained_variance_ratio_, start=1)
)

fig = px.scatter(components, x=0, y=1, color=pc3_df['site_id'])
fig.update_layout(
    # The title is built from the number of genera actually plotted (the old
    # hard-coded "50" did not match) and names the soil dataset loaded from
    # tSoilMicrobeGenera instead of "Leaf".
    title=f"{len(top_50)} Most Abundant Soil Microbe Genera by Broad Geographic Location PCA",
    xaxis_title=x_label,
    yaxis_title=y_label,
)
fig.show()