In [210]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import MDAnalysis as mda
from utils import utils
import sbmlcore
import json
import sys
import os
import gumpy
import numpy as np

from catomatic.CatalogueBuilder import BuildCatalogue
import matplotlib.pyplot as plt

sys.path.append(os.path.abspath('ml-models'))
from mlmodels.Models import Models


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


This notebook generates the figures in the paper 'Predicting rifampicin resistance in M. tuberculosis using machine learning informed by protein structural and chemical features.'

Authors:
- Charlotte Lynch
- Dylan Adlard
- Philip Fowler

# Data Preperation - positions and labels

Need to create a dataset of mutations with associated phenotypes from CRypTIC tables

#### Data import

In [211]:
phenos = pd.read_pickle('data/tables/cryptic-tables-v2.1.2/DST_MEASUREMENTS.pkl.gz').reset_index()
genomes = pd.read_pickle('data/tables/cryptic-tables-v2.1.2/GENOMES.pkl.gz').reset_index()

if not os.path.exists("./data/tables/cryptic-tables-v2.1.2/rpoB_mutations.csv"):
    mutations = pd.read_pickle('data/tables/cryptic-tables-v2.1.2/MUTATIONS.pkl.gz').reset_index()
    mutations = mutations[mutations.GENE.isin(['rpoA', 'rpoB', 'rpoC'])]
    mutations.to_csv("./data/tables/cryptic-tables-v2.1.2/rpoB_mutations.csv")

mutations = pd.read_csv("./data/tables/cryptic-tables-v2.1.2/rpoB_mutations.csv")

### Prepare phenotypes df 

In [212]:
#filter for rifampicin phyenotypes
phenos = phenos[
        (phenos.DRUG == 'RIF') & (phenos.PHENOTYPE.isin(["R", "S"]))
    ]
# drop duplicates (keep R row, otherwise keep first)
phenos = phenos.groupby("UNIQUEID", group_keys=False).apply(utils.filter_multiple_phenos).reset_index(drop=True)

#generate phenotype dataframes for samples with sequences
samples = pd.merge(genomes, phenos, how='inner', on='UNIQUEID')
#filter out low quality phenotypes - we don't want to introduce false positives
samples = samples[(samples.QUALITY.isin(["HIGH", "MEDIUM"]))]



  phenos = phenos.groupby("UNIQUEID", group_keys=False).apply(utils.filter_multiple_phenos).reset_index(drop=True)


### Prepare mutations df

In [213]:
mutations["GENE_MUT"] = [
    f"{row['GENE']}@{row['MINOR_MUTATION'] if row['IS_MINOR_ALLELE'] else row['MUTATION']}"
    for _, row in mutations.iterrows()
]
mutations["IS_SYNONYMOUS"] = [
    row["MUTATION"][0] == row["MUTATION"][-1] for _, row in mutations.iterrows()
]
mutations["FRS"] = [
    1 if ~mutations["IS_MINOR_ALLELE"][i] else mutations["FRS"][i]
    for i in mutations.index
]
mutations["IS_NULL"] = mutations["MUTATION"].apply(
    lambda x: x.split("@")[-1][-1]  == 'Z'
)

mutations = mutations[(~mutations.IS_SYNONYMOUS)&(~mutations.IS_NULL)]

#define solos as samples with only one mutation in rpoB (but can have multiple across genes)
mutations = mutations[mutations.GENE=='rpoB']


### import wildcards

In [214]:
#these are the wildcard rules used by catomatic to generate a piezo-compatible catalogue, which we will use as our training data
with open('./data/temp/wildcards.json', 'r') as f:
    wildcards = json.load(f)

# Build dataframe for entire rpoB dataset

This will contain all genotype-phenotype matched samples and their mutations for rifampicin - its the 'wider dataset'.

In [215]:
all_matched = pd.merge(samples, mutations, how='left', on=['UNIQUEID'])
all_matched.to_csv('./data/tables/generated/all_matched.csv')

In [216]:
print ('total number of isolates with rif phenothpes:', all_matched.UNIQUEID.nunique())
print ('total number of isolates with rif phenotypes and mutations in rpoB:', all_matched[~all_matched.MUTATION.isna()].UNIQUEID.nunique())
print ('total number of isolates with R rif phenotypes and mutations in rpoB:', all_matched[(~all_matched.MUTATION.isna())&(all_matched.PHENOTYPE=='R')].UNIQUEID.nunique())
print ('total number of isolates with S rif phenotypes and mutations in rpoB:', all_matched[(~all_matched.MUTATION.isna())&(all_matched.PHENOTYPE=='S')].UNIQUEID.nunique())


total number of isolates with rif phenothpes: 30973
total number of isolates with rif phenotypes and mutations in rpoB: 14523
total number of isolates with R rif phenotypes and mutations in rpoB: 10058
total number of isolates with S rif phenotypes and mutations in rpoB: 4465


# Build dataframe of all solos

Helpful to reference all samples, not just the ML dataset which will contain unique mutations only.

In [217]:
all_solos = pd.merge(samples, mutations.drop_duplicates(subset=['UNIQUEID'], keep=False), how='inner', on=['UNIQUEID'])
all_solos['segid'] = 'C'
all_solos.to_csv('./data/tables/generated/rpoB_solos.csv')

# Build a catalogue using catomatic

In [218]:
catalogue = BuildCatalogue(
    samples=samples,
    mutations=mutations,
    FRS=0.1,
    record_ids=True,
    tails='one',
    test='Binomial',
    p=0.95,
    background=0.05,
    seed=['C66T', 'G86A', 'E639D'],
    strict_unlock=True
)

catalogue.to_piezo("NC_000962.3", f"RIF-2024.07", "1.1", "RIF", wildcards, './data/tables/generated/catalogue.csv')
catalogue = catalogue.build_piezo("NC_000962.3", f"RIF-2024.07", "1.1", "RIF", wildcards)


In [219]:
catalogue.PREDICTION.value_counts()

PREDICTION
S    266
R     74
U      5
Name: count, dtype: int64

In [220]:
# Convert evidence entries to dicts, and remove wildcard rows
evidence = catalogue['EVIDENCE'].apply(utils.str_to_dict)
#rename columns to match naming convention in `ml-models`
df = catalogue[['MUTATION', 'PREDICTION']].rename(columns={'MUTATION':'mutation', 'PREDICTION':'phenotype'})
#filter for point mutations only
df = df[~df['mutation'].str.contains(r'_|\!|\*')]
#generate resid and segid columns
df['resid'] = [int(i[1:-1]) for i in df.mutation]
df['segid'] = 'C'
#filter for crystallised pdb region
df = df[(df['resid']>=22) & (df['resid']<=1147)]

In [221]:
df.phenotype.value_counts()

phenotype
S    221
R     46
U      0
Name: count, dtype: int64

In [222]:
df

Unnamed: 0,mutation,phenotype,resid,segid
0,C66T,S,66,C
1,G86A,S,86,C
2,E639D,S,639,C
4,V113I,S,113,C
5,G642S,S,642,C
...,...,...,...,...
330,H445F,R,445,C
334,M434I,R,434,C
335,V262A,R,262,C
337,S582A,R,582,C


# Feature engineering

We have a table of mutations and their labels. Next step is to generate a global feature set for each mutation using sbmlcore.

In [223]:
features = sbmlcore.FeatureDataset(df, protein="RNAP", species="M. tuberculosis", gene="rpoB")

#### Generate amino acid features

In [224]:
#change in amino acid volume on mutation
volume = sbmlcore.AminoAcidVolumeChange()
#change in amino acid hydropathy on mutation
hydropathy = sbmlcore.AminoAcidHydropathyChangeKyteDoolittle()
#change in amino acid molecular weight on mutation
mw = sbmlcore.AminoAcidMWChange()
#change in amino acid isoelectric point on mutation
pi = sbmlcore.AminoAcidPiChange()
#change in residue-environment similarity
rogov = sbmlcore.AminoAcidRogovChange()

features.add_feature([volume, hydropathy, mw, pi, rogov])

#### Generate distance-based features

In [225]:
#distance from each residue to rifampicin
rif_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','resname RFP', 'Rif_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the magnesium ion
mg_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','resname MG', 'Mg_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the first zinc ion
zn1_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','index 26082 and resname ZN', 'Zn1_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the second zinc ion
zn2_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','index 26083 and resname ZN', 'Zn2_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the antisense DNA strand
antisense_p_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','segid G and name P', 'antisense_P_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the coding DNA strand
sense_p_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','segid H and name P', 'sense_P_distance', infer_masses=True, offsets = {"C": -6})
#distance from each residue to the crystallised mRNA molecule
rna_distance = sbmlcore.StructuralDistances('./data/pdb/5uh6.pdb','segid I', 'RNA_distance', infer_masses=True, offsets = {"C": -6})

features.add_feature([rif_distance, mg_distance, zn1_distance, zn2_distance, antisense_p_distance, sense_p_distance, rna_distance])

####  Generate structure-related features

In [226]:
# use stride to assign secondary structure to each residue
stride = sbmlcore.Stride("./data/pdb/5uh6-peptide-only.pdb", offsets={'A': 0, 'B': 0, 'C':-6, 'D':0, 'E':0, 'F':0})
# use freeSASA to calculate solvent accessible surface area of each residue
freesasa = sbmlcore.FreeSASA("./data/pdb/5uh6.pdb", offsets = {'C':-6})
# use SNAP2 to predict functional perturbation for each mutation (in reality just adding pre-generated feature to df)
snap2 = sbmlcore.SNAP2("./data/stride/5uh6-complete.csv", offsets={'A': 0, 'B': 0, 'C':-6, 'D':0, 'E':0, 'F':0})
# use deepddg to predict change in stability on mutation
deepddg = sbmlcore.DeepDDG("./data/ddg/5uh6.ddg", offsets={'A': 0, 'B': 0, 'C':-6, 'D':0, 'E':0, 'F':0})
# use rasp to predict change in stability on mutation
rasp = sbmlcore.RaSP("./data/rasp/cavity_pred_5uh6_C.csv", offsets = {'C':-6})
# extract temperature factors from pdb file and add to df
temp = sbmlcore.TempFactors("./data/pdb/5uh6.pdb", offsets={'A': 0, 'B': 0, 'C':-6, 'D':0, 'E':0, 'F':0})

features.add_feature([stride, freesasa, snap2, deepddg, rasp, temp])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  snap2_df['Expected Accuracy'].replace(to_replace='%', value='', regex=True, inplace=True)


### Generate dynamic distance features

In [227]:
#minimum distance to rifampicin (excluding percentile tails)
rif_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'resname RFP', 'Rif_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to the magnesium ion (excluding percentile tails)
mg_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'resname MG', 'Mg_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to the first zinc ion (excluding percentile tails)
zn1_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'resid 1283 and resname ZN', 'Zn1_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to the second zinc ion (excluding percentile tails)
zn2_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'resid 1284 and resname ZN', 'Zn2_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to the nearest antisensne base (excluding percentile tails)
antisense_P_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'index 50737:51209 and name P', 'antisense_P_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to nearest sense base (excluding percentile tails)
sense_P_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'index 51210:51946 and name P', 'sense_P_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#minimum distance to the mRNA molecule (excluding percentile tails)
rna_min_distance = sbmlcore.TrajectoryDistances('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'resname G or resname A', 'RNA_min_distance', distance_type='min', offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)

features.add_feature([rif_min_distance, mg_min_distance, zn1_min_distance, zn2_min_distance, antisense_P_min_distance, rna_min_distance])

### Generate dynamic angle features

In [228]:
#mean phi angle between each residue
phi_mean = sbmlcore.TrajectoryDihedrals('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc','./data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'phi', 'mean_phi', angle_type='mean', add_bonds=True, offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)
#mean psi angle between each residue
psi_mean = sbmlcore.TrajectoryDihedrals('./data/md_files/rpob-5uh6-3-warm.gro', ['./data/md_files/rpob-5uh6-3-md-1-50ns-dt1ns-nojump.xtc', './data/md_files/rpob-5uh6-3-md-2-50ns-dt1ns-nojump.xtc','./data/md_files/rpob-5uh6-3-md-3-50ns-dt1ns-nojump.xtc'], './data/pdb/5uh6.pdb', 'psi', 'mean_psi', angle_type='mean', add_bonds=True, offsets = {'A':0, 'B':0, 'C':-6, 'D':0, 'E':0, 'F':0}, percentile_exclusion=True)

features.add_feature([phi_mean, psi_mean])

#### Clean up

The pdb residue and reference sequence used by CRyPTIC do not match for 2 mutations in our dataset:


In [229]:
features.df[features.df['T'].isna()]

Unnamed: 0,segid,resid,mutation,phenotype,d_volume,d_hydropathy_KD,d_MW,d_Pi,d_rogov,Rif_distance,...,rasp_score_ml,temp_factor,Rif_min_distance,Mg_min_distance,Zn1_min_distance,Zn2_min_distance,antisense_P_min_distance,RNA_min_distance,mean_phi,mean_psi
0,C,66,C66T,S,7.6,-3.2,-2.1,0.53,-0.156,,...,,,,,,,,,,
1,C,86,G86A,S,28.5,2.2,14.0,0.03,-0.023,,...,,,,,,,,,,


These residues do not exsit at that position in the pdb - as the pdb sequence is the reference (as our entire feature set derives from the pdb), we need to filter these mutations out

In [230]:
features.df = features.df.dropna(subset=['T'])

In [231]:
#annotate secondary structures with integer codes
features.df['secondary_structure_codes'] = pd.Categorical(features.df.secondary_structure, categories=features.df.secondary_structure.unique()).codes
features.df = features.df.drop(columns=["secondary_structure","secondary_structure_long","B","C","E","G","H","T"])
features.df = features.df.drop(columns=["residue_sasa","snap2_accuracy","rasp_wt_nlf","rasp_mt_nlf","rasp_score_ml"])
#rename columns
features.df = features.df.rename(columns={"d_Pi":"d_pi", "rasp_score_ml_fermi":"rasp_score", "secondary_structure_codes":"secondary_structure"})

df = features.df.copy()

#replace R/S labels with binary 1/0
df['phenotype'] = [1 if i =='R' else 0 for i in df.phenotype]

df.to_csv('./data/tables/generated/features_dataset.csv')


df

Unnamed: 0,segid,resid,mutation,phenotype,d_volume,d_hydropathy_KD,d_MW,d_pi,d_rogov,Rif_distance,...,temp_factor,Rif_min_distance,Mg_min_distance,Zn1_min_distance,Zn2_min_distance,antisense_P_min_distance,RNA_min_distance,mean_phi,mean_psi,secondary_structure
2,C,639,E639D,0,-27.3,0.0,-14.0,-0.45,0.109,34.015193,...,41.400002,32.398593,46.038218,86.959443,72.279360,51.151947,42.275598,-93.369233,-20.024702,0
3,C,113,V113I,0,26.7,0.3,14.1,0.06,0.494,24.626817,...,36.919998,21.736677,39.482342,75.763734,53.560955,39.285029,33.699031,-60.411471,-33.459681,1
4,C,642,G642S,0,28.9,-0.4,30.0,-0.29,0.120,42.448172,...,32.490002,40.799796,51.833114,94.069735,78.792626,59.842361,49.009863,22.946413,-155.697465,2
5,C,751,I751V,0,-26.7,-0.3,-14.1,-0.06,0.494,27.986912,...,19.900000,24.936656,37.996169,64.407775,34.255279,35.106592,32.315447,-144.714133,148.649884,2
6,C,944,K944N,0,-54.5,0.4,-14.1,-4.33,0.197,59.941562,...,86.489998,54.022245,49.511935,88.429960,106.835526,69.617671,52.185349,55.856385,20.543887,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,C,445,H445F,1,36.7,6.0,10.0,-2.11,-0.016,9.683248,...,16.459999,7.818147,24.649392,61.053218,60.018633,25.919643,19.380805,-73.606848,-39.475197,1
263,C,434,M434I,1,3.8,2.6,-18.0,0.28,-0.452,9.843524,...,17.299999,6.959772,25.525949,61.429328,53.765220,25.613549,19.405567,-74.641870,137.553994,3
264,C,262,V262A,1,-51.4,-2.4,-28.0,0.04,0.232,56.595298,...,133.350006,56.504732,68.031156,62.596191,74.290668,45.068224,64.342506,-123.647188,-17.394655,3
265,C,582,S582A,1,-0.4,2.6,-16.0,0.32,0.249,18.638826,...,18.350000,15.642925,23.524223,61.807004,71.149683,30.284239,21.147248,-94.405096,114.374256,0


In [232]:
df.columns

Index(['segid', 'resid', 'mutation', 'phenotype', 'd_volume',
       'd_hydropathy_KD', 'd_MW', 'd_pi', 'd_rogov', 'Rif_distance',
       'Mg_distance', 'Zn1_distance', 'Zn2_distance', 'antisense_P_distance',
       'sense_P_distance', 'RNA_distance', 'phi', 'psi', 'n_hbond_acceptors',
       'n_hbond_donors', 'SASA', 'snap2_score', 'deep_ddG', 'rasp_score',
       'temp_factor', 'Rif_min_distance', 'Mg_min_distance',
       'Zn1_min_distance', 'Zn2_min_distance', 'antisense_P_min_distance',
       'RNA_min_distance', 'mean_phi', 'mean_psi', 'secondary_structure'],
      dtype='object')

In [233]:
catalogue[catalogue.MUTATION=='L731P']

Unnamed: 0,GENBANK_REFERENCE,CATALOGUE_NAME,CATALOGUE_VERSION,CATALOGUE_GRAMMAR,PREDICTION_VALUES,DRUG,MUTATION,PREDICTION,SOURCE,EVIDENCE,OTHER
324,NC_000962.3,RIF-2024.07,1.1,GARC1,RUS,RIF,L731P,R,{},"[{""proportion"": 1.0, ""confidence"": [0.34238022...",{}


In [234]:
### H1

# What have we missed?

We should determine what samples and variants our method has not been able to capture - these could potentially be used to construct an independent validation set.

### Which mutations were not captured?


In [235]:
all_matched = all_matched.dropna(subset=['MUTATION'])
print ('number of uncatalogued missense mutations', all_matched[(~all_matched.MUTATION.isin(df.mutation))&(~all_matched.MUTATION.str.contains(r'_|\!|\*|\-'))&(all_matched.AMINO_ACID_NUMBER>=22)&(all_matched.AMINO_ACID_NUMBER<=1147)].MUTATION.nunique())

number of uncatalogued missense mutations 611


### Which suitable samples were not captured?

In [236]:
all_matched = all_matched.dropna(subset=['MUTATION'])

# Filter out mutations that are in df where phenotype == 1, contain unwanted characters, and meet amino acid number criteria
filtered_mutations = all_matched[(~all_matched.MUTATION.isin(df[df.phenotype == 1].mutation)) & 
                                 (~all_matched.MUTATION.str.contains(r'_|\!|\*|\-')) & 
                                 (all_matched.AMINO_ACID_NUMBER >= 22) & 
                                 (all_matched.AMINO_ACID_NUMBER <= 1147)]

# Group by UNIQUEID and filter groups to ensure at least one mutation is not in df
def filter_samples(group):
    if len(group) > 1:
        return group[~group.MUTATION.isin(df[df.phenotype == 1].mutation)].shape[0] > 0
    return False

valid_samples = filtered_mutations.groupby('UNIQUEID').filter(filter_samples)

# Get the number of unique samples that meet the criteria
num_unique_samples = valid_samples['UNIQUEID'].nunique()

In [237]:
valid_samples.drop_duplicates('UNIQUEID', keep='first').PHENOTYPE.value_counts()

PHENOTYPE
R    101
S     63
Name: count, dtype: int64

After filtering out samples that contain lof mutations + mutations not in the pdb region + already catalogue R in the catalogue, we are leve with 164 samples.... can we use these for validation? Should try out.