In [32]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import statsmodels
import statsmodels.api as sm
from statsmodels.multivariate.manova import MANOVA
from scipy import stats

from useful_functions import *

## Import Data

In [33]:
# Source tables and the key column they share. `import_all_data` comes from
# the star-import of useful_functions; presumably it joins the listed tables
# on the common column -- confirm against that module.
common_column = "sample_id"
tables_to_join = [
    "tPlantMicrobeGenera",
    "tSample",
    "tPlantDNA",
]
full_df = import_all_data(tables_to_join, common_column)
full_df.shape

(277695, 15)

In [34]:
# Preview the first five rows of the joined table.
full_df.head()

Unnamed: 0,sample_id,genera,relative_abundance,sample_id.1,site_id,collect_date,sample_id.2,V1,V2,species,PC1,PC2,PC3,PC4,PC5
0,RRL1,Archaea|Euryarchaeota|Halobacteria|Halobacteri...,0.0,RRL1,RRL,6/25/2021,RRL1,0.753244,0.246756,exaltata,0.194769,0.048276,-0.02388,-0.017138,-0.039842
1,RRL2,Archaea|Euryarchaeota|Halobacteria|Halobacteri...,0.01333,RRL2,RRL,6/25/2021,RRL2,0.708412,0.291588,exaltata,0.154924,0.071277,0.000923,-0.02583,0.005592
2,RRL3,Archaea|Euryarchaeota|Halobacteria|Halobacteri...,0.0,RRL3,RRL,6/25/2021,RRL3,0.778046,0.221954,exaltata,0.18775,0.035009,-0.016006,-0.012529,-0.011169
3,RRL4,Archaea|Euryarchaeota|Halobacteria|Halobacteri...,0.00943,RRL4,RRL,6/25/2021,RRL4,0.720367,0.279633,exaltata,0.167167,0.046053,-0.032754,-0.016192,-0.028988
4,RRL5,Archaea|Euryarchaeota|Halobacteria|Halobacteri...,0.00581,RRL5,RRL,6/25/2021,RRL5,0.572487,0.427513,exaltata,0.099801,0.0898,0.000613,-0.004979,0.017524


## Clean Data

In [112]:
# Drop duplicate columns (the preview of full_df shows the sample identifier
# repeated three times). Columns are compared by value and the first
# occurrence of each distinct column is kept.
# A boolean mask over the columns is used instead of transposing the whole
# frame there and back: the round-trip transpose coerces every column to
# `object` dtype, whereas masking preserves the original dtypes.
# `.to_numpy()` makes the mask positional, so repeated column labels cannot
# interfere with alignment.
df = full_df.loc[:, ~full_df.T.duplicated().to_numpy()].copy()
df.shape

(277695, 13)

In [113]:
# Alias only (no copy is made): `subdf` and `df` refer to the same DataFrame.
subdf = df

In [130]:
# Reshape long -> wide: one row per (sample_id, site_id, species) combination
# and one column per `genera` value, filled with `relative_abundance`.
pivoted_df = subdf.pivot(
    index=['sample_id', 'site_id', 'species'],
    columns='genera',
    values='relative_abundance',
)
pivoted_df.shape

(165, 1683)

In [131]:
# Preview the wide table (identifiers are still in the row index here).
pivoted_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,genera,Archaea|Euryarchaeota|Halobacteria|Halobacteriales|Haloarculaceae|Halapricum,Archaea|Euryarchaeota|Halobacteria|Halobacteriales|Halococcaceae|Halococcus,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Natrinema,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Natronococcus,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Saliphagus,Archaea|Euryarchaeota|Methanobacteria|Methanobacteriales|Methanobacteriaceae|Methanobacterium,Archaea|Euryarchaeota|Methanobacteria|Methanobacteriales|Methanobacteriaceae|Methanobrevibacter,Archaea|Euryarchaeota|Thermoplasmata|Methanomassiliicoccales|Methanomassiliicoccaceae|Methanomassiliicoccus,Archaea|Thaumarchaeota|Nitrososphaeria|Nitrososphaerales|Nitrososphaeraceae|Candidatus Nitrosocosmicus,Bacteria|Abditibacteriota|Abditibacteria|Abditibacteriales|Abitibacteriaceae|Abditibacterium,...,Viruses|||Caudovirales|Siphoviridae|Np1virus,Viruses|||Caudovirales|Siphoviridae|Pa6virus,Viruses|||Caudovirales|Siphoviridae|Sextaecvirus,Viruses|||Caudovirales|Siphoviridae|Sk1virus,Viruses|||Herpesvirales|Herpesviridae|Varicellovirus,Viruses||||Baculoviridae|Alphabaculovirus,Viruses||||Baculoviridae|Betabaculovirus,Viruses||||Microviridae|Phix174microvirus,Viruses||||Mimiviridae|Cafeteriavirus,Viruses||||Partitiviridae|Betapartitivirus
sample_id,site_id,species,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
CMB10,CMB,syriaca,0.0,0.0765,0.0,0.0,0.00256,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.33369,0.0,0.0,0.28122,0.0,0.0
CMB11,CMB,syriaca,0.0,0.07147,0.0,0.0,0.00267,0.0,0.01641,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.69867,0.0,0.0,0.56396,0.0,0.0
CMB13,CMB,syriaca,0.0,0.09585,0.0,0.0,0.00428,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CMB14,CMB,syriaca,0.0,0.1262,0.0,0.0,0.00329,0.0,0.0136,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.81434,0.0,0.0,0.39777,0.0,0.0
CMB15,CMB,syriaca,0.02993,0.08792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5256,0.01059,0.0,0.0,0.0,0.0


In [132]:
# Per-row total across all genera columns, taken while the identifier fields
# are still in the index (so only abundance columns are summed).
row_sum = pivoted_df.sum(axis="columns")

In [133]:
# Promote the (sample_id, site_id, species) index levels back to ordinary
# columns. Guarded so that re-running this cell is a no-op: an unconditional
# second reset_index() would insert a spurious integer `index` column, which
# the genera-column selection that follows would treat as if it were a genus.
if 'sample_id' in pivoted_df.index.names:
    pivoted_df = pivoted_df.reset_index()

### Normalize each row so its values sum to 1

In [134]:
# Every column other than the three identifier columns is a genus column.
genera_col = pivoted_df.columns.drop(['sample_id', 'site_id', 'species'])

In [135]:
# Numeric working copy of just the genera columns, cast to 64-bit floats.
sub_pivot = pivoted_df.loc[:, genera_col].astype("float64")

In [136]:
# Scale every row so that its genera abundances sum to 1.
# The divisor is computed from `sub_pivot` itself rather than from the earlier
# `row_sum` variable: that Series is indexed by (sample_id, site_id, species),
# so integer lookups into it rely on deprecated positional fallback, and the
# name is rebound further down the notebook. Computing it here keeps the cell
# self-contained and safe to re-run, and replaces the per-row Python loop with
# a single vectorized division.
# A row whose total is 0 yields NaN (0/0), as it did in the loop version.
sub_pivot = sub_pivot.div(sub_pivot.sum(axis=1), axis=0)

In [137]:
# Sanity check: each row of the normalized table should now total 1.
new_sum = sub_pivot.sum(axis="columns")
new_sum

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
160    1.0
161    1.0
162    1.0
163    1.0
164    1.0
Length: 165, dtype: float64

In [138]:
# Write the row-normalized values back over the genera columns; the
# sample_id / site_id / species columns are not part of genera_col and are
# left untouched.
pivoted_df[genera_col] = sub_pivot

In [139]:
# Preview the table after normalization (identifiers are regular columns now).
pivoted_df.head()

genera,sample_id,site_id,species,Archaea|Euryarchaeota|Halobacteria|Halobacteriales|Haloarculaceae|Halapricum,Archaea|Euryarchaeota|Halobacteria|Halobacteriales|Halococcaceae|Halococcus,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Natrinema,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Natronococcus,Archaea|Euryarchaeota|Halobacteria|Natrialbales|Natrialbaceae|Saliphagus,Archaea|Euryarchaeota|Methanobacteria|Methanobacteriales|Methanobacteriaceae|Methanobacterium,Archaea|Euryarchaeota|Methanobacteria|Methanobacteriales|Methanobacteriaceae|Methanobrevibacter,...,Viruses|||Caudovirales|Siphoviridae|Np1virus,Viruses|||Caudovirales|Siphoviridae|Pa6virus,Viruses|||Caudovirales|Siphoviridae|Sextaecvirus,Viruses|||Caudovirales|Siphoviridae|Sk1virus,Viruses|||Herpesvirales|Herpesviridae|Varicellovirus,Viruses||||Baculoviridae|Alphabaculovirus,Viruses||||Baculoviridae|Betabaculovirus,Viruses||||Microviridae|Phix174microvirus,Viruses||||Mimiviridae|Cafeteriavirus,Viruses||||Partitiviridae|Betapartitivirus
0,CMB10,CMB,syriaca,0.0,0.002321,0.0,0.0,7.8e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.010126,0.0,0.0,0.008534,0.0,0.0
1,CMB11,CMB,syriaca,0.0,0.002065,0.0,0.0,7.7e-05,0.0,0.000474,...,0.0,0.0,0.0,0.0,0.020183,0.0,0.0,0.016292,0.0,0.0
2,CMB13,CMB,syriaca,0.0,0.003318,0.0,0.0,0.000148,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CMB14,CMB,syriaca,0.0,0.004577,0.0,0.0,0.000119,0.0,0.000493,...,0.0,0.0,0.0,0.0,0.029538,0.0,0.0,0.014428,0.0,0.0
4,CMB15,CMB,syriaca,0.000963,0.002829,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016913,0.000341,0.0,0.0,0.0,0.0


In [140]:
# Re-check the per-row totals on the updated frame.
# NOTE: this rebinds `row_sum`, which until now held the totals computed
# before normalization; after this cell it holds the post-normalization sums.
row_sum = pivoted_df.loc[:, genera_col].sum(axis="columns")
print(row_sum)

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
160    1.0
161    1.0
162    1.0
163    1.0
164    1.0
Length: 165, dtype: float64


### PCA by Site (without further modification)

In [141]:
# Project the row-normalized genera abundances onto their first two principal
# components and colour each sample by its collection site.
X = pivoted_df[genera_col]

# The matrix is 165 samples x 1683 genera; with only 2 components requested,
# scikit-learn's default svd_solver="auto" may choose the randomized solver,
# so the seed is pinned to make the coordinates reproducible across re-runs.
pca = PCA(n_components=2, random_state=0)
components = pca.fit_transform(X)

fig = px.scatter(components, x=0, y=1, color=pivoted_df['site_id'])
fig.update_layout(
    # The features are microbial genera abundances; the previous title was
    # left over from a plant-nutrient analysis.
    title="Microbial Genera Relative Abundance PCA by Site",
    xaxis_title=f"PC1 {round(100 * pca.explained_variance_ratio_[0], 1)}%",
    yaxis_title=f"PC2 {round(100 * pca.explained_variance_ratio_[1], 1)}%")
fig.show()

### PCA by Species (without further modification)

In [142]:
# Project the row-normalized genera abundances onto their first two principal
# components and colour each sample by plant species.
X = pivoted_df[genera_col]

# The matrix is 165 samples x 1683 genera; with only 2 components requested,
# scikit-learn's default svd_solver="auto" may choose the randomized solver,
# so the seed is pinned to make the coordinates reproducible across re-runs.
pca = PCA(n_components=2, random_state=0)
components = pca.fit_transform(X)

fig = px.scatter(components, x=0, y=1, color=pivoted_df['species'])
fig.update_layout(
    # Points are coloured by species, and the features are microbial genera
    # abundances; the previous title said "by Site" and "Plant Nutrients".
    title="Microbial Genera Relative Abundance PCA by Species",
    xaxis_title=f"PC1 {round(100 * pca.explained_variance_ratio_[0], 1)}%",
    yaxis_title=f"PC2 {round(100 * pca.explained_variance_ratio_[1], 1)}%")
fig.show()

### Check column sums 

In [None]:
# Displays the genera columns of the normalized table.
# NOTE(review): the section heading says column sums are checked, but no sum
# is computed in the visible part of this cell -- confirm whether something
# like `.sum(axis=0)` was intended here.
pivoted_df[genera_col]