In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
tqdm.pandas()

%matplotlib inline
import matplotlib.pyplot as plt


In [13]:
data = pd.read_csv('all_variants.csv') # read the .csv files with all covid submissions
data.head()

Unnamed: 0,drop,strainname,currCollectiondate,combineMuts,currCovClade,currLineage
0,EPI_ISL_4937520,hCoV-19/Germany/BY-RKI-I-271721/2021,2021-09-16,"NSP2_M609I,NSP3_T936N,NSP3_A488S,NSP3_P1469S,N...",GK,AY.5
1,EPI_ISL_2026506,hCoV-19/Denmark/DCGC-90247/2021,2021-04-26,"NSP3_T183I,NSP3_E177D,NSP3_A890D,NSP3_I1412T,N...",GRY,B.1.1.7
2,EPI_ISL_4937523,hCoV-19/Germany/BY-RKI-I-271724/2021,2021-09-10,"NSP2_A318V,NSP2_K81N,NSP3_T64I,NSP3_A488S,NSP3...",GK,AY.122
3,EPI_ISL_10305507,hCoV-19/Australia/QLD0x00C719/2022,2022-02-09,"NSP3_L1266I,NSP3_S1265del,NSP3_K38R,NSP3_A1892...",GRA,BA.1.17
4,EPI_ISL_8049553,hCoV-19/Netherlands/GE-RUMC-000279/2021,2021-12-03,"NSP3_A488S,NSP3_P1469S,NSP3_P1228L,NSP4_V167L,...",GK,B.1.617.2


In [15]:
data.columns

Index(['drop', 'strainname', 'currCollectiondate', 'combineMuts',
       'currCovClade', 'currLineage'],
      dtype='object')

In [28]:
def only_substs(s):
    """
    Leaves only substitutions out of all the mutations of the covid variant.
    Excludes deletions and insertions. Uses regular expressions.
    Please note that mutations become sorted in a string.
    :param s: string representing mutations in the original file (separeted by a comma)
    :return: the same string without the deletions and insertions
    """
    return ','.join(sorted(re.findall(r'\w+\d?_[ACDEFGHIKLMNPQRSTVWY]\d+[ACDEFGHIKLMNPQRSTVWY]',s)))

def extract_substs(df):
    """
    Extracts substitutions out of dataset as a part of pipeline.
    :param df: dataframe with covid submissions
    :return: same dataframe only with substitutions in .combineMuts column
    """
    df['combineMuts'] = df['combineMuts'].progress_apply(only_substs)
    return df

def assign_frequencies_le(df):
    """
    Gets frequencies of each UNIQUE submission .
    Dataframe MUST possess column .var_le - labels encodings for unique variants (submissions).
    :param df: dataframe with covid submissions
    :return: same dataframe with frequencies for unique submissions as a new column
    """
    unique, counts = np.unique(df.var_le, return_counts=True)
    freq_dict = {unique[i]:counts[i] for i in range(len(unique))}
    df['frequency_le'] = df.var_le.progress_apply(lambda x: freq_dict[x])
    return df

def assign_frequencies_clade(df):
    """
    Gets frequencies of each unique clade .
    :param df: dataframe with covid submissions
    :return: same dataframe with frequencies for unique clades as a new column
    """
    unique, counts = np.unique(df.currCovClade, return_counts=True)
    freq_dict = {unique[i]:counts[i] for i in range(len(unique))}
    df['frequency_clade'] = df.currCovClade.progress_apply(lambda x: freq_dict[x])
    return df

def assign_frequencies_lineage(df):
    """
    Gets frequencies of each unique lineage .
    :param df: dataframe with covid submissions
    :return: same dataframe with frequencies for unique lineages as a new column
    """
    unique, counts = np.unique(df.currLineage, return_counts=True)
    freq_dict = {unique[i]:counts[i] for i in range(len(unique))}
    df['frequency_lineage'] = df.currLineage.progress_apply(lambda x: freq_dict[x])
    return df

def drop_unassigned_lineages(df):
    """
    Drops submissions with unassigned lineages.
    :param df: dataframe with covid submissions
    :return: same dataframe without submissions with unassigned lineages
    """
    return df[df.currLineage != 'Unassigned']


In [30]:
df = (data
        [['drop','combineMuts', 'currCollectiondate','currCovClade', 'currLineage']] # choose specific columns
        .dropna() # prop rows with nan values
        #.loc[:1000,:]#[:8000000,0]
        .pipe(drop_unassigned_lineages)
        .sort_values(by='currCollectiondate') # sort df by collection date
        .pipe(assign_frequencies_clade)
        .pipe(assign_frequencies_lineage)
        .assign(n_muts=lambda df_: df_.combineMuts.apply(lambda x: x.count(','))) # count number of mutations for each submission
        # please note that for deletions and insertion actual number of mutations could be grater
        .pipe(extract_substs)
        .assign(n_substs=lambda df_: df_.combineMuts.apply(lambda x: x.count(','))) # count number of substitutions for each submission
        .assign(var_le=lambda df_: pd.factorize(df_.combineMuts)[0]) # assign label encodings for unique variants
        .pipe(assign_frequencies_le)
)
df

100%|██████████| 11651706/11651706 [00:14<00:00, 782556.08it/s] 
100%|██████████| 11651706/11651706 [00:14<00:00, 777479.34it/s] 
100%|██████████| 11651706/11651706 [13:33<00:00, 14318.47it/s]
100%|██████████| 11651706/11651706 [00:20<00:00, 554944.17it/s]


Unnamed: 0,drop,combineMuts,currCollectiondate,currCovClade,currLineage,frequency_clade,frequency_lineage,n_muts,n_substs,var_le,frequency_le
10593101,EPI_ISL_402123,"NSP3_N1890S,NSP4_F145I",2019-12-24,L,B,5182,9349,1,1,0,1
10592503,EPI_ISL_406798,"NSP3_L1417I,NSP6_N264K",2019-12-26,L,B,5182,9349,1,1,1,1
5968836,EPI_ISL_402130,"NSP3_D1761A,NSP4_T327I",2019-12-30,L,B,5182,9349,2,1,2,1
5968855,EPI_ISL_403930,NSP3_I1426T,2019-12-30,L,B,5182,9349,1,0,3,1
11815418,EPI_ISL_402132,Spike_F32I,2019-12-30,L,B,5182,9349,0,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...
11790435,EPI_ISL_13716820,"E_T9I,M_A63T,M_D3N,M_Q19E,NS3_T223I,NSP12_P323...",2022-07-08,GRA,BA.5.1,4517344,32367,57,45,4706071,84
11790434,EPI_ISL_13716823,"E_T9I,M_A63T,M_D3N,M_Q19E,NS3_T223I,NSP12_P323...",2022-07-08,GRA,BA.5.1,4517344,32367,57,45,4706071,84
11790433,EPI_ISL_13716822,"E_T9I,M_A63T,M_D3N,M_Q19E,NS3_T223I,NS3_V112F,...",2022-07-08,GRA,BA.5.1,4517344,32367,59,47,4984811,1
11790438,EPI_ISL_13716817,"E_T9I,M_A63T,M_D3N,M_Q19E,NS3_T223I,NS8_V5I,NS...",2022-07-08,GRA,BA.5.2.1,4517344,20593,59,47,4984812,1


In [31]:
df.to_csv('score_date_substs_freq.csv', index=False) # save new .csv

## Prep for FGAN

In [32]:
proteins = ['E', 'M', 'N', 'NS3', 'NS6', 'NS7a', 'NS7b', 'NS8', 'NSP1', 'NSP10',
            'NSP12', 'NSP13', 'NSP14', 'NSP15', 'NSP16', 'NSP2', 'NSP3', 'NSP4',
            'NSP5', 'NSP6', 'NSP7', 'NSP8', 'NSP9', 'Spike']

prot_le = {'E': 0, 'M': 1, 'N': 2, 'NS3': 3, 'NS6': 4, 'NS7a': 5, 'NS7b': 6, 'NS8': 7,
            'NSP1': 8, 'NSP10': 9, 'NSP12': 10, 'NSP13': 11, 'NSP14': 12, 'NSP15': 13,
            'NSP16': 14, 'NSP2': 15, 'NSP3': 16, 'NSP4': 17, 'NSP5': 18, 'NSP6': 19,
            'NSP7': 20, 'NSP8': 21, 'NSP9': 22, 'Spike': 23}

amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

aa_le = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11,
          'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}

prot_ldec = {v: k for k, v in prot_le.items()}
aa_ldec = {v: k for k, v in aa_le.items()}

def pre_encode(s, prot_le_, aa_le_):
    prot, subst = s.split('_')
    return np.array([prot_le_[prot], int(subst[1:-1]), aa_le_[subst[-1]]])


def load_numpy(arr_dir):
    with open(arr_dir, 'rb') as f:
        data = np.load(f)
    return data

def save_numpy(arr, arr_dir):
    with open(arr_dir, 'wb') as f:
        np.save(f, arr)

In [35]:
data = pd.read_csv('score_date_substs_freq.csv')
data.head()

Unnamed: 0,drop,combineMuts,currCollectiondate,currCovClade,currLineage,frequency_clade,frequency_lineage,n_muts,n_substs,var_le,frequency_le
0,EPI_ISL_402123,"NSP3_N1890S,NSP4_F145I",2019-12-24,L,B,5182,9349,1,1,0,1
1,EPI_ISL_406798,"NSP3_L1417I,NSP6_N264K",2019-12-26,L,B,5182,9349,1,1,1,1
2,EPI_ISL_402130,"NSP3_D1761A,NSP4_T327I",2019-12-30,L,B,5182,9349,2,1,2,1
3,EPI_ISL_403930,NSP3_I1426T,2019-12-30,L,B,5182,9349,1,0,3,1
4,EPI_ISL_402132,Spike_F32I,2019-12-30,L,B,5182,9349,0,0,4,1


In [74]:
df= (data
        .loc[:1000,:]#[:8000000,0]
        [data.currCollectiondate > '2020-06-00']
        [data.currCollectiondate < '2021-00-00']
        [data.n_substs < 50]
        [df.columns[:-1]]
        .pipe(assign_frequencies)
        .dropna()
)

  df= (data
100%|███████████████████████████████| 539784/539784 [00:02<00:00, 246549.10it/s]


In [81]:
df[df.columns[-3:]].to_csv('variants_1_meta.csv', index=False)

In [77]:
df_good = (df
           [['combineMuts']]
)

arr = np.zeros((len(df_good), 50, 3))
for i, variant in enumerate(df_good.values):
    for j, mut in enumerate(variant[0].split(',')):
        arr[i,j] = pre_encode(mut, prot_le, aa_le)
        
save_numpy(arr, 'variants_1.npy')