In [113]:
import pandas as pd
from pacmap import PaCMAP
from sklearn.decomposition import PCA
from umap import UMAP
import plotly.express as px

In [8]:
genotypePathMap = {
    "aalsCaucasianCases": "projects/NUPs60-AALS-rsID-rareBinned-0.005MAF/case_embedding.csv",
    "aalsCaucasianControls": "projects/NUPs60-AALS-rsID-rareBinned-0.005MAF/control_embedding.csv",
    "aalsNonCaucCases": "projects/NUPs60-AALS-rsID-rareBinned-0.005MAF/holdout/AnswerALS Cases vs. Controls (Ethnically-Variable)/AnswerALS Cases vs. Controls (Ethnically-Variable)_holdout_case_embedding.csv",
    "aalsNonCaucControls": "projects/NUPs60-AALS-rsID-rareBinned-0.005MAF/holdout/AnswerALS Cases vs. Controls (Ethnically-Variable)/AnswerALS Cases vs. Controls (Ethnically-Variable)_holdout_control_embedding.csv",
}

In [114]:
genotypeIndexCols = ['chrom', 'position', 'rsID', 'Gene']

def read_and_index_genotype_csv(path):
    return pd.read_csv(path, index_col=genotypeIndexCols)

def create_genotype_dataframe(filter_keyword):
    return pd.concat(
        axis=1,
        objs=[
            read_and_index_genotype_csv(path)
            for setName, path in genotypePathMap.items()
            if filter_keyword in setName.lower()
        ]
    ).T
    
def assign_ethnicity(df, reference_df):
    df["caucasian"] = ["caucasian" if id in reference_df.index else "non-caucasian" for id in df.index]
    return df

def perform_dimensional_reduction_and_plot(df, method_name, title, color_map=None, shape_map=None):
    if method_name == 'PCA':
        embedding = PCA(n_components=2).fit_transform(df)
    elif method_name == 'PaCMAP':
        embedding = PaCMAP(n_components=2).fit_transform(df)
    elif method_name == 'UMAP':
        embedding = UMAP(n_components=2).fit_transform(df)
    
    fig = px.scatter(
        x=embedding[:, 0], 
        y=embedding[:, 1], 
        hover_data={'id': df.index},
        color=color_map,
        symbol=shape_map,
        opacity=0.5, 
        title=f"{title}"
    )
    fig.show()

## AnswerALS

In [92]:
caucasianAnswerAlsDataFrame = create_genotype_dataframe('caucasian')
nonCaucasianAnswerAlsDataFrame = create_genotype_dataframe('noncauc')

allAnswerAlsCaseDataFrame = create_genotype_dataframe('case')
allAnswerAlsCaseDataFrame = assign_ethnicity(allAnswerAlsCaseDataFrame, caucasianAnswerAlsDataFrame)

allAnswerAlsControlDataFrame = create_genotype_dataframe('control')
allAnswerAlsControlDataFrame = assign_ethnicity(allAnswerAlsControlDataFrame, caucasianAnswerAlsDataFrame)


In [118]:
perform_dimensional_reduction_and_plot(
    allAnswerAlsCaseDataFrame.loc[
        :, 
        allAnswerAlsCaseDataFrame.columns != ('caucasian', '', '', '')],
    'PaCMAP', 
    "AnswerALS Case Embedding, Caucasian vs. Non-Caucasian (PaCMAP)",
    color_map=allAnswerAlsCaseDataFrame['caucasian'])

perform_dimensional_reduction_and_plot(
    allAnswerAlsCaseDataFrame.loc[
        :, 
        allAnswerAlsCaseDataFrame.columns != ('caucasian', '', '', '')], 
    'PCA', 
    "AnswerALS Case Embedding, Caucasian vs. Non-Caucasian (PCA)",
    color_map=allAnswerAlsCaseDataFrame['caucasian'])

perform_dimensional_reduction_and_plot(
    allAnswerAlsCaseDataFrame.loc[
        :, 
        allAnswerAlsCaseDataFrame.columns != ('caucasian', '', '', '')], 
    'UMAP', 
    "AnswerALS Case Embedding, Caucasian vs. Non-Caucasian (UMAP)",
    color_map=allAnswerAlsCaseDataFrame['caucasian'])













In [117]:
perform_dimensional_reduction_and_plot(
    allAnswerAlsControlDataFrame.loc[
        :, 
        allAnswerAlsControlDataFrame.columns != ('caucasian', '', '', '')], 
    'PaCMAP', 
    "AnswerALS Control Embedding, Caucasian vs. Non-Caucasian (PaCMAP)",
    color_map=allAnswerAlsControlDataFrame['caucasian'])

perform_dimensional_reduction_and_plot(
    allAnswerAlsControlDataFrame.loc[
        :, 
        allAnswerAlsControlDataFrame.columns != ('caucasian', '', '', '')], 
    'PCA', 
    "AnswerALS Control Embedding, Caucasian vs. Non-Caucasian (PCA)",
    color_map=allAnswerAlsControlDataFrame['caucasian'])

perform_dimensional_reduction_and_plot(
    allAnswerAlsControlDataFrame.loc[
        :, 
        allAnswerAlsControlDataFrame.columns != ('caucasian', '', '', '')], 
    'UMAP', 
    "AnswerALS Control Embedding, Caucasian vs. Non-Caucasian (UMAP)",
    color_map=allAnswerAlsControlDataFrame['caucasian'])













In [107]:
answerAlsDataFrame = pd.concat([allAnswerAlsCaseDataFrame, allAnswerAlsControlDataFrame])
answerAlsDataFrame['label'] = ['case' if id in allAnswerAlsCaseDataFrame.index else 'control' for id in answerAlsDataFrame.index]

In [119]:
perform_dimensional_reduction_and_plot(
    answerAlsDataFrame.drop(['caucasian', 'label',], axis=1), 
    'PaCMAP', 
    "All AnswerALS Samples Embedding, Caucasian vs. Non-Caucasian (PaCMAP)",
    color_map=answerAlsDataFrame['caucasian'],
    #shape_map=answerAlsDataFrame['label']
    )

perform_dimensional_reduction_and_plot(
    answerAlsDataFrame.drop(['caucasian', 'label'], axis=1), 
    'PCA', 
    "All AnswerALS Samples Embedding, Caucasian vs. Non-Caucasian (PCA)",
    color_map=answerAlsDataFrame['caucasian'],
    #shape_map=answerAlsDataFrame['label']
    )

perform_dimensional_reduction_and_plot(
    answerAlsDataFrame.drop(['caucasian', 'label'], axis=1), 
    'UMAP', 
    "All AnswerALS Samples Embedding, Caucasian vs. Non-Caucasian (UMAP)",
    color_map=answerAlsDataFrame['caucasian'],
    #shape_map=answerAlsDataFrame['label']
    )



dropping on a non-lexsorted multi-index without a level parameter may impact performance.






dropping on a non-lexsorted multi-index without a level parameter may impact performance.






dropping on a non-lexsorted multi-index without a level parameter may impact performance.



