# Use CCA to find shared variance between ontogeny and longtogeny datasets

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSCanonical, CCA

In [4]:
# Dataset version number; it is zero-padded to two digits in the folder name below.
version = 8
# Directory from which the parquet tables used in this notebook are read.
folder = Path(f"/n/groups/datta/win/longtogeny/data/ontogeny/version_{version:02d}")

In [5]:
# Read the ontogeny males table from the versioned data folder.
usages = pd.read_parquet(folder.joinpath("ontogeny_males_clean_v2.parquet"))

In [6]:
# Read the longtogeny males table, replace missing values with 0 and cast
# every column to float32.
long_usages = (
    pd.read_parquet(folder / "longtogeny_males_clean_v1.parquet")
    .fillna(0)
    .astype("float32")
)

In [7]:
# Canonical PLS model with 5 components.
# NOTE(review): the notebook title refers to CCA, and CCA is imported alongside
# PLSCanonical, but only PLSCanonical is instantiated here -- confirm which
# estimator is intended.
model = PLSCanonical(n_components=5)

In [None]:
## TODO: map longtogeny ages onto ontogeny ages

In [8]:
# Fit the model with the ontogeny table as X and the longtogeny table as Y.
# Y is re-indexed to the columns of `usages`, and any NaN that results is
# replaced by 0.
# NOTE(review): as recorded in this cell's output, the call raises ValueError
# because the two inputs have different numbers of rows (393 vs 1014). fit()
# needs X and Y with matching rows, so the samples must be paired first --
# presumably via the age mapping described in the TODO above; confirm.
model.fit(usages, long_usages.reindex(columns=usages.columns).fillna(0))

ValueError: Found input variables with inconsistent numbers of samples: [393, 1014]

## PCA analysis: fit on one dataset and project the other onto its components

In [13]:
def col_reindex(in_df, ref_df):
    """Return `in_df` with its columns conformed to those of `ref_df`.

    The result has exactly the columns of `ref_df`, in the same order.
    Columns of `ref_df` that `in_df` lacks are created and filled with 0;
    columns of `in_df` that are not in `ref_df` are dropped. The rows are
    left as they are, and `in_df` itself is not modified.
    """
    target_columns = ref_df.columns
    aligned = in_df.reindex(columns=target_columns, fill_value=0)
    return aligned

In [None]:
# Fit a 2-component PCA on the ontogeny table, then project the longtogeny
# table (columns aligned to the ontogeny columns) into the same space.
pca = PCA(n_components=2)
og = pca.fit_transform(usages)
long_aligned = col_reindex(long_usages, usages)
proj = pca.transform(long_aligned)

# Overlay both point clouds on a single set of axes.
fig, ax = plt.subplots()
ax.scatter(og[:, 0], og[:, 1], s=10, label="Fit to ontogeny")
ax.scatter(proj[:, 0], proj[:, 1], s=4, label="Longtogeny projection")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.legend()

In [None]:
# Reverse direction: fit a 2-component PCA on the longtogeny table (columns
# aligned to the ontogeny columns), then project the ontogeny table into it.
pca = PCA(n_components=2)
long_aligned = col_reindex(long_usages, usages)
og = pca.fit_transform(long_aligned)
proj = pca.transform(usages)

# Overlay both point clouds on a single set of axes.
fig, ax = plt.subplots()
ax.scatter(og[:, 0], og[:, 1], s=4, label="Fit to longtogeny")
ax.scatter(proj[:, 0], proj[:, 1], s=10, label="Ontogeny projection")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.legend()