In [67]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [68]:
# Open the project's SQLite database.
# NOTE: sqlite3.connect() silently creates an empty database file when the path
# does not exist, so a wrong path shows up later as a "no such table" error from
# the query below rather than as a failure here.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere without editing it.
conn = sqlite3.connect(r'C:\Users\15404\Documents\GitHub\research_project\sql_db\test3.db')
try:
    # The cursor is only used to issue the PRAGMA; the query itself goes through pandas.
    curs = conn.cursor()

    ### IMPORTANT!!! ###
    # By default, sqlite does not enforce foreign key constraints.
    # According to the documentation, this is for backwards compatibility. You have to turn them on yourself.
    curs.execute('PRAGMA foreign_keys=ON;')

    # Both tables carry a sample_id column, so SELECT * over the join returns it twice.
    finaldf = pd.read_sql("""SELECT * FROM tSoilNutrients
                      JOIN tSample ON tSoilNutrients.sample_id = tSample.sample_id;""", conn)
finally:
    # Release the database file even if the PRAGMA or the query raises;
    # otherwise the handle stays open for the lifetime of the kernel.
    conn.close()

In [69]:
# (rows, columns) of the joined frame, before duplicate columns are removed.
finaldf.shape

(368, 20)

In [70]:
# Drop repeated columns by *name*, keeping the first occurrence.
# Transposing and calling drop_duplicates() would upcast every column of this
# mixed-type frame to object dtype, and would also drop distinct columns that
# merely happen to hold identical values.
# .copy() yields an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
finaldf = finaldf.loc[:, ~finaldf.columns.duplicated()].copy()
finaldf.shape

(368, 19)

In [71]:
# Work on a copy: a plain `df = finaldf` only aliases the frame, so the in-place
# scaling performed on `df` later would also overwrite the raw values in `finaldf`.
df = finaldf.copy()

In [72]:
# Every column except the identifier/metadata ones is treated as a nutrient variable.
nutrients = df.columns.drop(['sample_id', 'site_id', 'collect_date'])
print(nutrients)

Index(['OM', 'ENR', 'CEC', 'pH', 'BpH', 'Phosphorus', 'Potassium', 'Calcium',
       'Magnesium', 'Sulfur', 'Sodium', 'Zinc', 'Manganese', 'Iron', 'Copper',
       'Boron'],
      dtype='object')


## Unscaled Data

In [73]:
# Two-component PCA on the nutrient columns as they currently stand in `df`.
pca = PCA(n_components=2)
X = df[nutrients]
components = pca.fit_transform(X)

# Columns 0 and 1 of `components` are the PC1/PC2 scores; points are coloured by site.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=df['site_id'],
)
fig.show()

In [74]:
# Fraction of the total variance captured by each of the two fitted components.
pca.explained_variance_ratio_ 

array([0.9757503 , 0.01028704])

In [75]:
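## The variance ratio is way off: PC1 alone accounts for ~98% of the variance.
## Most likely this is because the nutrients are on very different scales
## (e.g. Calcium in the thousands vs. Boron below 1), so the columns with the
## largest raw values dominate. The data should be standardized before PCA.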

In [76]:
# Preview the first rows of the frame (head() shows five by default).
df.head()

Unnamed: 0,sample_id,OM,ENR,CEC,pH,BpH,Phosphorus,Potassium,Calcium,Magnesium,Sulfur,Sodium,Zinc,Manganese,Iron,Copper,Boron,site_id,collect_date
0,BLDTT0,4.3,124,8.1,6.8,6.91,18,180,1193,178,3,7,2.3,321,78,1.5,0.5,BLD,6/29/2020
1,BLDTT1,4.8,132,9.5,6.8,6.9,14,158,1375,226,4,9,2.5,310,89,1.5,0.7,BLD,6/29/2020
2,BLDTT2,4.9,135,9.0,6.8,6.9,15,177,1335,188,3,7,3.0,308,68,1.4,0.6,BLD,6/29/2020
3,BLDTT3,5.6,147,10.2,7.0,6.93,16,249,1498,247,5,8,3.1,328,70,1.5,0.8,BLD,6/29/2020
4,BLDTT4,5.4,145,9.1,6.9,6.92,13,143,1284,266,8,9,3.0,324,84,1.8,0.7,BLD,6/29/2020


In [77]:
# Standardise the nutrient columns (StandardScaler with its default settings)
# and write the scaled values back into the same columns of `df`.
scaler = StandardScaler()
df[nutrients] = scaler.fit_transform(df[nutrients])
df.head()

Unnamed: 0,sample_id,OM,ENR,CEC,pH,BpH,Phosphorus,Potassium,Calcium,Magnesium,Sulfur,Sodium,Zinc,Manganese,Iron,Copper,Boron,site_id,collect_date
0,BLDTT0,-0.767038,-0.762875,-0.495412,1.2372,1.0942,-0.630036,0.545165,-0.217367,0.0916988,-1.2226,-1.48006,-0.584115,4.23305,-0.850068,0.0933555,0.038933,BLD,6/29/2020
1,BLDTT1,-0.717554,-0.373569,-0.210168,1.2372,1.02414,-0.723866,0.242216,-0.0142619,0.581498,-1.06952,-1.03104,-0.552467,4.04794,-0.534633,0.0933555,0.690176,BLD,6/29/2020
2,BLDTT2,-0.707657,-0.22758,-0.312041,1.2372,1.02414,-0.700409,0.503853,-0.0589003,0.19374,-1.2226,-1.48006,-0.473347,4.01428,-1.13683,0.00616052,0.364554,BLD,6/29/2020
3,BLDTT3,-0.638378,0.356378,-0.0675461,1.52436,1.2343,-0.676951,1.49532,0.123001,0.795786,-0.916432,-1.25555,-0.457523,4.35084,-1.07947,0.0933555,1.0158,BLD,6/29/2020
4,BLDTT4,-0.658172,0.259052,-0.291666,1.38078,1.16425,-0.747324,0.0356608,-0.115814,0.989665,-0.457176,-1.03104,-0.473347,4.28353,-0.678013,0.35494,0.690176,BLD,6/29/2020


In [78]:
# Re-fit the two-component PCA on the nutrient columns as they now stand in `df`
# (standardised, when the cells are run in order).
pca = PCA(n_components=2)
X = df[nutrients]
components = pca.fit_transform(X)

# Scatter of the PC1/PC2 scores, coloured by site.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=df['site_id'],
)
fig.show()

In [79]:
# Fraction of the total variance captured by each of the two components of the latest fit.
pca.explained_variance_ratio_ 

array([0.28824126, 0.2194462 ])

In [80]:
# Keep only the rows whose site_id is one of the listed sites.
df = df.loc[
    df["site_id"].isin([
        'RRL', 'CMB', 'SLG', 'LFS', 'PTW', 'SGC', 'RF',
        'FRW', 'MMP', 'RGT', 'LM', 'MKP', 'PNR',
    ])
]

In [81]:
# Two-component PCA on the nutrient columns of the site-filtered frame.
pca = PCA(n_components=2)
X = df[nutrients]
components = pca.fit_transform(X)

# Share of variance per component as a percentage, rounded to one decimal,
# for use in the axis titles.
pc1_pct, pc2_pct = (round(100*ratio, 1) for ratio in pca.explained_variance_ratio_)

fig = px.scatter(components, x=0, y=1, color=df['site_id'])
fig.update_layout(
    xaxis_title=f"PC1 {pc1_pct!s}%",
    yaxis_title=f"PC2 {pc2_pct!s}%",
)
fig.show()

In [82]:
# Fraction of the total variance captured by each component for the site-filtered fit.
pca.explained_variance_ratio_

array([0.31256698, 0.24997927])

In [83]:
# Load a second dataset from CSV. The path is relative, so it resolves against
# the kernel's current working directory.
newdf = pd.read_csv('./Casey_Data_Analysis/batch_soil_csv - PCA.csv')

In [84]:
# Work on a copy: a plain `df = newdf` only aliases the frame, so the in-place
# scaling performed on `df` below would also overwrite the raw CSV values in `newdf`.
df = newdf.copy()

In [85]:
# Preview the first rows of the CSV-backed frame (head() shows five by default).
df.head()

Unnamed: 0,Species,Site,Organic_Matter,pH,Phosphorus,Potassium,Calcium,Magnesium,Sulfur,Sodium,Zinc,Manganese,Iron,Copper,Boron
0,exaltata,RRL,14.2,6.2,7,138,788,133,6,17,2.4,97,62,1.6,0.3
1,exaltata,RRL,13.8,5.9,8,132,935,146,8,19,3.1,77,94,2.8,0.2
2,exaltata,RRL,12.4,5.4,7,184,646,102,11,17,1.9,74,75,2.7,0.2
3,exaltata,RRL,14.7,6.0,9,172,824,133,7,16,2.6,87,76,1.8,0.2
4,exaltata,RRL,12.6,6.2,7,134,1028,151,8,19,2.7,94,90,1.9,0.3


In [86]:
# Every column other than the two label columns is treated as a nutrient variable.
nutrients = df.columns.drop(['Species','Site'])

In [87]:
# Standardise the nutrient columns (StandardScaler with its default settings)
# and write the scaled values back into `df`.
scaler = StandardScaler()
df[nutrients] = scaler.fit_transform(df[nutrients])

# Two-component PCA on the standardised nutrient columns.
pca = PCA(n_components=2)
X = df[nutrients]
components = pca.fit_transform(X)

# Share of variance per component as a percentage, rounded to one decimal,
# for use in the axis titles.
pc1_pct, pc2_pct = (round(100*ratio, 1) for ratio in pca.explained_variance_ratio_)

fig = px.scatter(components, x=0, y=1, color=df['Site'])
fig.update_layout(
    xaxis_title=f"PC1 {pc1_pct!s}%",
    yaxis_title=f"PC2 {pc2_pct!s}%",
)
fig.show()