# Determine confounding sources of variance in genotypes
- Sequencing platform
- Ethnicity

In [78]:

config = {
    "vcfLike": {
        "path": "../adhoc analysis/Variant_report_NUPs_fixed_2022-03-28.xlsx",  # variant call table with annotations
        "sheet": "all cases vs all controls",  # sheet name if Excel spreadsheet
        "indexColumn": [
            "chrom",
            "position",
            "Gene",
        ],  # header that indexes variants (set as list with multiple columns)
        "compoundSampleIdDelimiter": "__",  # delimiter for compound sample IDs in column names
        "compoundSampleIdStartIndex": 1,  # index of first sample ID in compound sample ID
        "binarize": True,  # binarize variants to 0/1, or sum to weigh allele frequency
        "minAlleleFrequency": 0.0,  # filter out variants with allele frequency less than this
        # 'alleleModel': ['dominant', 'recessive', 'overDominant'],  # biallelic allele models to test on gene sets
        "filters": {},
    },  # TODO handle genotypes from related individuals
    "geneSets": {},  # TODO gene sets
    "tracking": {
        "name": "Nucleoporin genes",  # name of the experiment
        "entity": "ejmockler",
        "project": "ALS-NUPS-50",
        "plotAllSampleImportances": True,  # if calculating Shapely explanations, plot each sample in Neptune
    },
    "clinicalTable": {
        "path": "../adhoc analysis/ACWM.xlsx",  # clinical data as Excel spreadsheet
        "idColumn": "ExternalSampleId",  # genotype ID header
        "subjectIdColumn": "ExternalSubjectId",  # unique ID for each patient
        "labelColumn": "Subject Group",  # header that has case/control labels
        "controlLabels": [
            "Non-Neurological Control"
        ],  # these labels include external sample IDs (like 1000 Genomes)
        "caseLabels": ["ALS Spectrum MND"],  # "ALS Spectrum MND"
        "controlAlias": "control",
        "caseAlias": "case",
        "filters": "" # "pct_european>=0.85",  # filter out nonhomogenous samples with less than 85% European ancestry
    },
    "externalTables": {
        "path": [
            "../adhoc analysis/igsr-1000 genomes phase 3 release.tsv",
            # "../adhoc analysis/ALS-NUPS-2000__accurateSamples_>=97.5%.csv",
            "../adhoc analysis/ACWM_ethnicallyVariable.tsv",
            "../adhoc analysis/ACWM_ethnicallyVariable.tsv",
            #"../adhoc analysis/igsr-1000 genomes phase 3 release.tsv",
        ],  # external sample table
        "label": [
            "control",
            # "case",
            "case",
            "control",
            #"control",
        ],  # case | control
        "setType": [
            "crossval",
            # "crossval",
            "holdout",
            "holdout",
            #"holdout",
        ],
        "idColumn": [
            "Sample name",
            # "id",
            "ExternalSubjectId",
            "ExternalSubjectId",
            #"Sample name",
        ],  # sample ID header
        "filters": [
            "", # `Superpopulation code`=='EUR' & `Population name`!='Finnish'  # remove finnish samples due to unusual homogeneity (verify w/ PCA)
            #"`Subject Group`=='ALS Spectrum MND' ", # & `pct_european`<0.85
            "`Subject Group`=='none'",#`Subject Group`=='Non-Neurological Control' ", # & `pct_european`<0.85
            "`Subject Group`=='none'", # `Superpopulation code`!='EUR'
        ],
    },
    "sampling": {
        "bootstrapIterations": 50,
        "crossValIterations": 10,  # number of validations per bootstrap iteration
        "holdoutSplit": 0.1,
        "lastIteration": 50,
        "sequesteredIDs": [],
    },
    "model": {
        "hyperparameterOptimization": True,
        "calculateShapelyExplanations": False,
    },
}


In [79]:
workingProject = 'ALS-NUPS-50__1'

In [80]:
from tasks.input import processInputFiles
(
    caseGenotypes,
    caseIDs,
    holdoutCaseGenotypes,
    holdoutCaseIDs,
    controlGenotypes,
    controlIDs,
    holdoutControlGenotypes,
    holdoutControlIDs,
    clinicalData,
) = processInputFiles(config)


Unknown extension is not supported and will be removed



100%|██████████| 5417/5417 [04:19<00:00, 20.90id/s] 


100%|██████████| 3352/3352 [04:37<00:00, 12.09id/s]


## Use [PaCMAP](https://github.com/YingfanWang/PaCMAP) + PCA to visualize genotypes clusters on a 2D manifold

In [81]:
import pacmap
from sklearn.decomposition import PCA
import plotly.express as px

In [82]:
# Add a new column 'predominant_ethnicity' that takes the index of the max across the ethnicity columns
ethnicity_cols = ['pct_african', 'pct_south_asian', 'pct_east_asian', 'pct_european', 'pct_americas']

clinicalData['predominant_ethnicity'] = clinicalData[ethnicity_cols].idxmax(axis=1).str.split('pct_', n=1).str.get(1)

# keep samples with defined ethnicity
clinicalData = clinicalData.dropna(subset=ethnicity_cols)

## Cases


In [150]:
caseIndexClinical = [clinical_id 
                for clinical_id in clinicalData.index 
                if any(clinical_id in geno_id 
                       for geno_id in caseGenotypes.T.index)]
caseClinicalData = clinicalData.loc[caseIndexClinical]

caseIndexGenotypes = [case_id 
                      for case_id in caseGenotypes.T.index
                      if any(clinical_id in case_id 
                             for clinical_id in caseClinicalData.index)] 

caseGenotypesBySample = caseGenotypes.T.loc[caseIndexGenotypes].dropna(how="any", axis=1)

clinicalCaseIndexMap = {}
genoCaseIndexMap = {}
for clinical_id in caseIndexClinical:
    for geno_id in caseIndexGenotypes:
        if clinical_id in geno_id:
            clinicalCaseIndexMap[clinical_id] = geno_id
            genoCaseIndexMap[geno_id] = clinical_id

In [269]:
casePacVectors = pacmap.PaCMAP(n_components=2).fit_transform(caseGenotypesBySample)
casePcaEmbedding = PCA(n_components=2)
casePcaVectors = casePcaEmbedding.fit_transform(caseGenotypesBySample)


In [270]:
# Create a dictionary to map ethnicity to a specific color
superpopulation_color_dict = {
    'AFR': 'yellow',
    'SAS': 'green',
    'EAS': 'blue',
    'EUR': 'purple',
    'AMR': 'red',
    'FIN': 'pink',
    'EUR,AFR': 'orange',
    'african': 'yellow',
    'south_asian': 'green',
    'east_asian': 'blue',
    'european': 'purple',
    'americas': 'red'
}

# Create a list of colors based on 'predominant_ethnicity'
ethnicityColors = [superpopulation_color_dict[ethnicity] for ethnicity in clinicalData.loc[caseIndexClinical, 'predominant_ethnicity']]

In [271]:
import plotly.graph_objects as go
import pandas as pd

# Map these to the corresponding genotype embedding
# This assumes that the case_id in both dataframes are sorted in the same way
casePacVectors_df = pd.DataFrame(casePacVectors, columns=['Dimension 1', 'Dimension 2'])
casePacVectors_df.index = caseGenotypesBySample.index
casePacVectors_df['Predominant ethnicity'] = caseClinicalData["predominant_ethnicity"].values

casePcaVectors_df = pd.DataFrame(casePcaVectors, columns=['Dimension 1', 'Dimension 2'])
casePcaVectors_df.index = caseGenotypesBySample.index
casePcaVectors_df['Predominant ethnicity'] = caseClinicalData["predominant_ethnicity"].values



### Well-predicted cases

In [272]:
sampleResults = pd.read_csv('projects/ALS-NUPS-50__1/sampleResults.csv', index_col=0)
wellPredictedCaseIDs = sampleResults.query('`label` == 1 & `accuracy` >= 0.85').index.tolist()

In [273]:
casePacVectors_df['wellPredicted'] = False
casePacVectors_df.loc[wellPredictedCaseIDs, 'wellPredicted'] = True
casePacVectors_df['Sample ID'] = casePacVectors_df.index
casePacVectors_df = pd.merge(casePacVectors_df, sampleResults.loc[caseIndexGenotypes], left_on='Sample ID', right_index=True)

casePcaVectors_df['wellPredicted'] = False
casePcaVectors_df.loc[wellPredictedCaseIDs, 'wellPredicted'] = True
casePcaVectors_df['Sample ID'] = casePcaVectors_df.index
casePcaVectors_df = pd.merge(casePcaVectors_df, sampleResults.loc[caseIndexGenotypes], left_on='Sample ID', right_index=True)

In [274]:
europeanCaseIDs = casePacVectors_df.query("`Predominant ethnicity`=='european'")['Sample ID']

In [275]:
casePacVectors_df

Unnamed: 0,Dimension 1,Dimension 2,Predominant ethnicity,wellPredicted,Sample ID,label,probability,accuracy,meanProbability
ALS__CGND-HDA-04091__NEUHF998PCY,-7.355873,3.905262,european,False,ALS__CGND-HDA-04091__NEUHF998PCY,1,"[0.5 ,0.50540168,0.5307475 ,0.51351659,0...",0.663462,0.496133
aals-ALS__CGND-HDA-04089__NEUEU419NMF,-5.804260,-3.741101,european,False,aals-ALS__CGND-HDA-04089__NEUEU419NMF,1,"[0.5 ,0.53023345,0.51179315,0.5135635 ,0...",0.778689,0.553507
aals-ALS__CGND-HDA-04087__NEUAN588EBF,-5.535074,1.460904,south_asian,False,aals-ALS__CGND-HDA-04087__NEUAN588EBF,1,"[5.00000000e-01,5.00000000e-01,5.00000000e-01,...",0.506000,0.504650
aals-ALS__CGND-HDA-04086__NEUDH813DE6,15.020708,6.647500,european,False,aals-ALS__CGND-HDA-04086__NEUDH813DE6,1,"[0.5 ,0.53372823,0.50569316,0.50907434,0...",0.410714,0.474640
aals-ALS__CGND-HDA-04085__NEUXZ486GG5,1.369268,-6.335928,european,False,aals-ALS__CGND-HDA-04085__NEUXZ486GG5,1,"[0.5 ,0.51480762,0.53483574,0.48948841,0...",0.228346,0.474599
...,...,...,...,...,...,...,...,...,...
ALS__CGND-HDA-00013__UP-WGS-196,15.187097,7.181742,european,False,ALS__CGND-HDA-00013__UP-WGS-196,1,"[0.52228489,0.51301123,0.5 ,0.5 ,0...",0.453125,0.503224
ALS__CGND-HDA-00012__UP-WGS-195,-7.984699,4.678456,european,True,ALS__CGND-HDA-00012__UP-WGS-195,1,"[0.5262368 ,0.62616158,0.5 ,0.51986523,0...",0.890000,0.577834
ALS__CGND-HDA-00008__UP-WGS-191,-1.485436,-0.847084,european,False,ALS__CGND-HDA-00008__UP-WGS-191,1,"[0.48443883,0.471704 ,0.48529474,0.5 ,0...",0.324786,0.481618
ALS__CGND-HDA-00004__UP-WGS-187,-2.550874,-6.123450,european,False,ALS__CGND-HDA-00004__UP-WGS-187,1,"[0.5 ,0.50556019,0.5 ,0.5 ,0...",0.791304,0.555932


In [276]:
casePcaVectors_df

Unnamed: 0,Dimension 1,Dimension 2,Predominant ethnicity,wellPredicted,Sample ID,label,probability,accuracy,meanProbability
ALS__CGND-HDA-04091__NEUHF998PCY,-0.104743,-0.005087,european,False,ALS__CGND-HDA-04091__NEUHF998PCY,1,"[0.5 ,0.50540168,0.5307475 ,0.51351659,0...",0.663462,0.496133
aals-ALS__CGND-HDA-04089__NEUEU419NMF,-0.695960,-1.123738,european,False,aals-ALS__CGND-HDA-04089__NEUEU419NMF,1,"[0.5 ,0.53023345,0.51179315,0.5135635 ,0...",0.778689,0.553507
aals-ALS__CGND-HDA-04087__NEUAN588EBF,-0.199515,1.045066,south_asian,False,aals-ALS__CGND-HDA-04087__NEUAN588EBF,1,"[5.00000000e-01,5.00000000e-01,5.00000000e-01,...",0.506000,0.504650
aals-ALS__CGND-HDA-04086__NEUDH813DE6,1.705826,-0.628009,european,False,aals-ALS__CGND-HDA-04086__NEUDH813DE6,1,"[0.5 ,0.53372823,0.50569316,0.50907434,0...",0.410714,0.474640
aals-ALS__CGND-HDA-04085__NEUXZ486GG5,-0.551301,0.308988,european,False,aals-ALS__CGND-HDA-04085__NEUXZ486GG5,1,"[0.5 ,0.51480762,0.53483574,0.48948841,0...",0.228346,0.474599
...,...,...,...,...,...,...,...,...,...
ALS__CGND-HDA-00013__UP-WGS-196,1.642338,-0.703293,european,False,ALS__CGND-HDA-00013__UP-WGS-196,1,"[0.52228489,0.51301123,0.5 ,0.5 ,0...",0.453125,0.503224
ALS__CGND-HDA-00012__UP-WGS-195,-0.272403,0.045471,european,True,ALS__CGND-HDA-00012__UP-WGS-195,1,"[0.5262368 ,0.62616158,0.5 ,0.51986523,0...",0.890000,0.577834
ALS__CGND-HDA-00008__UP-WGS-191,-0.815837,0.176464,european,False,ALS__CGND-HDA-00008__UP-WGS-191,1,"[0.48443883,0.471704 ,0.48529474,0.5 ,0...",0.324786,0.481618
ALS__CGND-HDA-00004__UP-WGS-187,-0.659011,-1.120163,european,False,ALS__CGND-HDA-00004__UP-WGS-187,1,"[0.5 ,0.50556019,0.5 ,0.5 ,0...",0.791304,0.555932


In [277]:
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.express as px

# Create initial scatter plots with 'wellPredicted' and 'Predominant ethnicity' as color column
variantThreshold = f'{config["vcfLike"]["minAlleleFrequency"]:.1%}'
fig1 = px.scatter(casePacVectors_df.query("`Predominant ethnicity` == 'european'"), 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='accuracy',
                 hover_data=['Predominant ethnicity', 'accuracy', 'Sample ID'],
                 color_discrete_map={True: 'blue', False: 'red'},
                 title=f'PaCMAP of ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by accuracy')
fig1.show()
fig2 = px.scatter(casePacVectors_df.query("`Predominant ethnicity` == 'european'"), 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='Predominant ethnicity',
                 hover_data=['wellPredicted', 'accuracy', 'Sample ID'],
                 color_discrete_map={'european': 'purple', 'african': 'yellow', 'east_asian': 'blue', 'south_asian': 'green'},
                 title=f'PaCMAP of ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by ethnicity')
fig2.show()


In [278]:
import plotly.express as px
from plotly.subplots import make_subplots

# Create initial scatter plots with 'wellPredicted' and 'Predominant ethnicity' as color column
variantThreshold = f'{config["vcfLike"]["minAlleleFrequency"]:.1%}'
fig1 = px.scatter(casePcaVectors_df, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='accuracy',
                 symbol='Predominant ethnicity',
                 hover_data=['Predominant ethnicity', 'accuracy', 'Sample ID'],
                 color_discrete_map={True: 'blue', False: 'red'},
                 title=f'PCA of ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by accuracy')
fig1.update_layout(coloraxis_colorbar=dict(y=0.5, x=1.2,
                                          ticks="outside"))

fig2 = px.scatter(casePcaVectors_df, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='Predominant ethnicity',
                 hover_data=['wellPredicted', 'accuracy', 'Sample ID'],
                 color_discrete_map={'european': 'purple', 'african': 'yellow', 'east_asian': 'blue', 'south_asian': 'green'},
                 title=f'PCA of ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by ethnicity')


# Create subplot
fig = make_subplots(rows=2, cols=1)

# Add all traces from fig1 and fig2 to the subplot
for trace in fig1['data']:
    fig.add_trace(trace, row=1, col=1)
    
for trace in fig2['data']:
    fig.add_trace(trace, row=2, col=1)

# Update layout
fig.update_layout(coloraxis_colorbar=dict(y=0.5, x=1.2,
                                          ticks="outside"), title=f'PCA of ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF')

fig.write_html(f'./alsCasePCA.html')
# Show the figure
fig.show()


### Ethnicity-controlled cases

In [161]:
mostCommonEthnicity = casePcaVectors_df.mode()['Predominant ethnicity'][0]

In [183]:
ethnicCasePaCVectors = pacmap.PaCMAP(n_components=2).fit_transform(caseGenotypesBySample.loc[casePacVectors_df.query(f"`Predominant ethnicity` == '{mostCommonEthnicity}'").index])

ethnicCasePCAEmbedding = PCA(n_components=2)
ethnicaCasePCAVectors = ethnicCasePCAEmbedding.fit_transform(caseGenotypesBySample.loc[casePacVectors_df.query(f"`Predominant ethnicity` == '{mostCommonEthnicity}'").index])

In [189]:
import plotly.graph_objects as go
import pandas as pd

ethnicCaseIndex = [clinicalCaseIndexMap[clinicalID] for clinicalID in clinicalData.query(f"`predominant_ethnicity`=='{mostCommonEthnicity}'").index if clinicalID in clinicalCaseIndexMap]

# Map these to the corresponding genotype embedding
# This assumes that the case_id in both dataframes are sorted in the same way
ethnicPaCvectors_df = pd.DataFrame(ethnicCasePaCVectors, columns=['Dimension 1', 'Dimension 2'])
ethnicPaCvectors_df.index = caseGenotypesBySample.loc[ethnicCaseIndex].index
ethnicPaCvectors_df['Predominant ethnicity'] = caseClinicalData.loc[[genoCaseIndexMap[genoID] for genoID in ethnicPaCvectors_df.index]]["predominant_ethnicity"].values

ethnicPCAvectors_df = pd.DataFrame(ethnicaCasePCAVectors, columns=['Dimension 1', 'Dimension 2'])
ethnicPCAvectors_df.index = ethnicPaCvectors_df.index
ethnicPCAvectors_df['Predominant ethnicity'] = ethnicPaCvectors_df['Predominant ethnicity']

ethnicPaCvectors_df['Sample ID'] = ethnicPaCvectors_df.index
ethnicPaCvectors_df = pd.merge(ethnicPaCvectors_df, sampleResults.loc[ethnicCaseIndex], left_on='Sample ID', right_index=True)

ethnicPCAvectors_df['Sample ID'] = ethnicPCAvectors_df.index
ethnicPCAvectors_df = pd.merge(ethnicPCAvectors_df, sampleResults.loc[ethnicCaseIndex], left_on='Sample ID', right_index=True)

In [193]:
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.express as px

# Create initial scatter plots with 'wellPredicted' and 'Predominant ethnicity' as color column
variantThreshold = f'{config["vcfLike"]["minAlleleFrequency"]:.1%}'
fig1 = px.scatter(ethnicPCAvectors_df, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='accuracy',
                 hover_data=['Predominant ethnicity', 'accuracy', 'Sample ID'],
                 color_discrete_map={True: 'blue', False: 'red'},
                 title=f'PCA of European ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by accuracy')
fig1.show()
fig2 = px.scatter(ethnicPCAvectors_df, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='Predominant ethnicity',
                 hover_data=['accuracy', 'Sample ID'],
                 color_discrete_map={'european': 'purple', 'african': 'yellow', 'east_asian': 'blue', 'south_asian': 'green'},
                 title=f'PCA of European ALS Cases, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by ethnicity')
fig2.show()


#### Features that explain variance

In [191]:
# Dump components relations with features:
explainedPcaFeatures = pd.DataFrame(ethnicCasePCAEmbedding.components_,columns=caseGenotypesBySample.columns,index = ['PC-1','PC-2'])

In [203]:
# Compute absolute value of 'PC-2' row
abs_PC2 = explainedPcaFeatures.loc['PC-2'].abs()

# Sort DataFrame by absolute value of 'PC-2' row
explainedPcaFeatures.loc[:, abs_PC2.sort_values(ascending=False).index]


chrom,6,6,1,1,6,3,7,7,3,3,...,7,7,7,7,7,7,7,7,7,X
position,17665248,17675015,229487591,229495987,17632802,10311942,857859,849532,13379650,13358286,...,869482,873226,72925156,72925168,72925174,72925273,72925309,72925338,72925446,154380773
Gene,NUP153,NUP153,NUP133,NUP133,NUP153,SEC13,SUN1,SUN1,NUP210,NUP210,...,SUN1,SUN1,POM121,POM121,POM121,POM121,POM121,POM121,POM121,EMD
PC-1,0.091589,0.09461,-0.055145,-0.054593,-0.016292,0.006957,0.003044,0.004305,-0.471554,-0.455048,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
PC-2,-0.546527,-0.544465,0.397946,0.392512,0.167198,-0.080249,-0.079343,-0.077575,-0.070652,-0.065089,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
explainedPcaFeatures.to_csv('pcaFeatureExplanations.csv', sep='\t')

## Controls

In [92]:
controlMetadata = pd.read_csv('../adhoc analysis/igsr-1000 genomes phase 3 release.tsv', sep='\t', index_col=0)

In [93]:
controlIndexExternal = [clinical_id for clinical_id in controlMetadata.dropna(subset="Superpopulation code").index.tolist() + clinicalData.index.tolist() if any(clinical_id in geno_id for geno_id in controlGenotypes.T.index)]
controlIndexGenotype = [geno_id for geno_id in controlGenotypes.T.index if any(external_id in geno_id for external_id in controlIndexExternal)]

clinicalControlsOnly = clinicalData.loc[pd.Index(controlIndexExternal).intersection(clinicalData.index)]
controlGenotypesBySample = controlGenotypes.dropna(how="any", axis=0).T.loc[controlIndexGenotype]

clinicalControlIndexMap = {}
genoControlIndexMap = {}
for clinical_id in clinicalControlsOnly.index:
    for geno_id in controlIndexGenotype:
        if clinical_id in geno_id:
            clinicalControlIndexMap[clinical_id] = geno_id
            genoControlIndexMap[geno_id] = clinical_id
            

In [94]:
controlEmbedding = pacmap.PaCMAP(n_components=2).fit_transform(controlGenotypesBySample)

In [95]:
controlEmbedding_df = pd.DataFrame(controlEmbedding, columns=['Dimension 1', 'Dimension 2'])
controlEmbedding_df.index = controlGenotypesBySample.index
controlEmbedding_df["Predominant ethnicity"] = ''

In [199]:
controlEmbedding_df.mode()['Predominant ethnicity'][0]

'EUR'

In [200]:
ethnicNonNeuroIndex = controlEmbedding_df.query("`Predominant ethnicity` == 'EUR' | `Predominant ethnicity` == 'european'").index

In [201]:
ethnicNonNeuroIndex

Index(['CTR__CGND-HDA-04090__NEUME474REU', 'CTR__CGND-HDA-03986__NEUAN931EA2',
       'CTR__CGND-HDA-03979__NEUFH999HYR', 'CTR__CGND-HDA-03978__NEUCP958UX0',
       'CTR__CGND-HDA-03977__NEUXN602GMF', 'CTR__CGND-HDA-03967__TD-C-188',
       'CTR__CGND-HDA-03962__TD-C-183', 'CTR__CGND-HDA-03957__TD-C-178',
       'CTR__CGND-HDA-03951__TD-C-172', 'CTR__CGND-HDA-03950__TD-C-171',
       ...
       'NA20753', 'NA20758', 'NA20760', 'NA20765', 'NA20772', 'NA20796',
       'NA20804', 'NA20809', 'NA20811', 'NA20828'],
      dtype='object', length=794)

In [98]:
clinicalControlsIndex = []

In [99]:
predominant_ethnicity_mapping = {clinicalIndexMap[id]: ethnicity for id, ethnicity in clinicalControlsOnly.loc[clinicalIndexMap.keys()]["predominant_ethnicity"].to_dict().items()}

# Create a new column in controlEmbedding_df DataFrame where the index is mapped to the corresponding predominant ethnicity
controlEmbedding_df['Predominant ethnicity'] = pd.Series(predominant_ethnicity_mapping)
controlEmbedding_df.loc[pd.Index(controlIndexExternal).intersection(controlEmbedding_df.index),'Predominant ethnicity'] = controlMetadata.loc[pd.Index(controlIndexExternal).intersection(controlEmbedding_df.index), 'Superpopulation code']

In [100]:
controlEmbedding_df.dropna(how="any", inplace=True)

In [101]:
fig = px.scatter(controlEmbedding_df, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='Predominant ethnicity',
                 color_discrete_map=superpopulation_color_dict,
                 title=f"PaCMAP of Non-Neurological Controls, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by ethnicity")

fig.show()

### 1000 Genomes European subpopulations

In [102]:
europeanControlEmbedding = controlEmbedding_df.query("`Predominant ethnicity`=='EUR'")
europeanControlEmbedding['Population code'] = controlMetadata.loc[europeanControlEmbedding.index, 'Population code']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [103]:
fig = px.scatter(europeanControlEmbedding, 
                 x='Dimension 1', 
                 y='Dimension 2', 
                 color='Population code',
                 title=f"PaCMAP of 1kG European Controls, {len(caseGenotypesBySample.columns)} variants >={variantThreshold} MAF, labeled by population")

fig.show()

#### Well-predicted controls

In [337]:
sampleResults = pd.read_csv('projects/ALS-NUPS-50__1/sampleResults.csv', index_col=0)
wellPredictedControlIDs = sampleResults.query('`label` == 0 & `accuracy` >= 0.85').index.tolist()

In [348]:
europeanControlEmbedding['wellPredicted'] = False
europeanControlEmbedding.loc[pd.Index(wellPredictedControlIDs).intersection(europeanControlEmbedding.index), 'wellPredicted'] = True



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [350]:
europeanControlEmbedding

Unnamed: 0,Dimension 1,Dimension 2,Predominant ethnicity,Population code,wellPredicted
HG00271,-1.524933,-0.447940,EUR,FIN,False
HG00276,-3.086132,-0.035662,EUR,FIN,False
HG00288,-3.176813,3.019901,EUR,FIN,False
HG00290,-7.507800,2.088684,EUR,FIN,False
HG00308,-2.301198,0.268026,EUR,FIN,False
...,...,...,...,...,...
NA20796,-7.558999,-1.642017,EUR,TSI,False
NA20804,-1.045745,-0.661634,EUR,TSI,False
NA20809,-6.060828,-2.255911,EUR,TSI,False
NA20811,-6.578100,-2.491433,EUR,TSI,False


In [363]:
import plotly.express as px

# Separate dataframes for wellPredicted == True and False
df_true = europeanControlEmbedding[europeanControlEmbedding['wellPredicted'] == True]
df_false = europeanControlEmbedding[europeanControlEmbedding['wellPredicted'] == False]

# Create scatter plots for both dataframes separately
fig_true = px.scatter(df_true, 
                      x='Dimension 1', 
                      y='Dimension 2', 
                      color='Population code',
                      symbol="wellPredicted",
                      color_discrete_map=superpopulation_color_dict)

fig_false = px.scatter(df_false, 
                      x='Dimension 1', 
                      y='Dimension 2', 
                      color='Population code',
                      symbol="wellPredicted",
                      color_discrete_map=superpopulation_color_dict)

# Set opacity for wellPredicted == False scatter plot
fig_false.update_traces(marker=dict(opacity=0.25))

# Add traces from fig_false to fig_true
for trace in fig_false.data:
    fig_true.add_trace(trace)

# Set the title
fig_true.update_layout(title=f'PaCMAP of Well-Classified European Controls (>= 85% accuracy), {len(caseGenotypes)} variants, labeled by ethnicity')

# Show the plot
fig_true.show()


## Sequencing platform

In [22]:
fullVariantPath = '../adhoc analysis/als_1kg.exon.num.txt'
fullVariants_df = pd.DataFrame()
for chunk in pd.read_csv(fullVariantPath, sep='\t', header=None, chunksize=1000):
    fullVariants_df = pd.concat([fullVariants_df, chunk], ignore_index=True)


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (32) have mi

KeyboardInterrupt: 