In [147]:
import pandas as pd
import itertools as it

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import ProteinAlphabet
from collections import Counter
from copy import deepcopy

%matplotlib inline

In [99]:
# Load the genotype-phenotype table, indexed by its SeqID column.
data = pd.read_csv('hiv-protease-data.csv', index_col='SeqID')
# Labels of the sequence columns: P1, P2, ..., P99.
seq_cols = list(map('P{0}'.format, range(1, 100)))
# The first eight columns of the table are kept as the drug columns.
drug_cols = data.columns[:8]
# Turn every '.' cell into '-' (same result as the in-place replace).
data = data.replace('.', '-')
data[seq_cols].head()

  mask = arr == x


Unnamed: 0_level_0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P90,P91,P92,P93,P94,P95,P96,P97,P98,P99
SeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2996,-,-,-,-,-,-,-,-,-,I,...,M,-,-,L,-,-,-,-,-,-
4387,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4426,-,-,-,-,-,-,-,-,-,I,...,-,-,-,L,-,-,-,-,-,-
4432,-,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4482,-,-,-,-,-,-,-,-,-,V,...,M,-,-,L,-,-,-,-,-,-


In [24]:
# Load the one record stored in the consensus FASTA file.
with open('hiv-protease-consensus.fasta') as fasta_handle:
    consensus = SeqIO.read(fasta_handle, 'fasta')
str(consensus.seq)

'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'

In [25]:
# Substitute each '-' cell with the consensus letter at the same position.
consensus_letters = str(consensus.seq)
for position, col in enumerate(seq_cols):
    data[col] = data[col].replace('-', consensus_letters[position])
data[seq_cols].head()

Unnamed: 0_level_0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P90,P91,P92,P93,P94,P95,P96,P97,P98,P99
SeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2996,P,Q,I,T,L,W,Q,R,P,I,...,M,T,Q,L,G,C,T,L,N,F
4387,P,Q,I,T,L,W,Q,R,P,L,...,L,T,Q,I,G,C,T,L,N,F
4426,P,Q,I,T,L,W,Q,R,P,I,...,L,T,Q,L,G,C,T,L,N,F
4432,P,Q,I,T,L,W,Q,R,P,L,...,L,T,Q,I,G,C,T,L,N,F
4482,P,Q,I,T,L,W,Q,R,P,V,...,M,T,Q,L,G,C,T,L,N,F


In [58]:
def number_of_combinations(row):
    """
    Multiply together the lengths of all entries in ``row``.

    Returns 1 for an empty row, since the running product starts at 1.
    """
    product = 1
    for entry_length in map(len, row):
        product *= entry_length
    return product

In [90]:
# Per row: the product of the entry lengths across the sequence columns.
data['num_combinations'] = data[seq_cols].apply(number_of_combinations, axis=1)
# Flag the rows whose product is greater than one.
data['has_multiple_mutations'] = data['num_combinations'].gt(1)
# Tally how many rows share each product value.
counts = Counter(data['num_combinations'])
data['num_combinations']

SeqID
2996       2
4387       4
4426       1
4432       1
4482       2
4486       2
4538       2
4664       1
4690       2
4698       4
5221       1
5279       1
5444       1
5462       1
5464       1
5640       4
5681       1
5707       2
6024       1
6028       1
7038       2
7042       1
7085       1
7103       1
7119       1
7235       4
7260       2
7393       4
7412       1
7414       8
          ..
258509     1
259173     1
259175     1
259177     2
259181     1
259183    24
259185     2
259187     8
259189     2
259191     1
259193     4
259195     1
259197     2
259199     1
259203    16
259207     2
259211     2
259215     1
259219     4
259223     4
259227     1
259233     2
259237     2
259241     2
259245    32
259249     4
259253     1
259257     1
259261     2
259265     2
Name: num_combinations, dtype: int64

In [61]:
# We will only consider sequences for which fewer than 10 possible combinations may occur.
# Sum every tally below the cutoff instead of a hand-picked list of values
# (the previous list skipped 0, 5, 7 and 9), so this fraction always agrees
# with a `num_combinations < 10` selection on the same data.
sum(tally for n_combos, tally in counts.items() if n_combos < 10) / len(data)
# This should cover 87% of all of the genomes represented in the dataset.

0.8766592920353983

In [62]:
# Keep only the rows with fewer than 10 combinations, under the name "filtered".
below_cutoff = data['num_combinations'] < 10
filtered = data[below_cutoff]
len(filtered)

1585

In [166]:
# Expansion of each row into its possible sequences is done with the custom functions below.
# Each function takes a single row of the dataframe, not the whole dataframe.
def iter_row(row):
    """
    Generator over the entries of ``row``.

    Each entry is passed through ``list`` before being yielded, so a string
    entry comes out as a list of its individual characters.
    """
    yield from map(list, row)

def expand_mutations(row):
    """
    Build every sequence that can be formed from the row's sequence columns.

    The ``seq_cols`` entries of ``row`` are each split into a list by
    ``iter_row``; the Cartesian product of those lists is returned as a list
    of tuples, one tuple per combination.
    """
    per_position_choices = iter_row(row[seq_cols])
    all_combinations = it.product(*per_position_choices)
    return list(all_combinations)

# Collate list of dictionaries to be used as the input to a new dataframe that contains all of the expanded mutations.
# The records live in their own list so that `expanded_data` only ever names the dataframe.
expanded_records = []
for seqid, row in filtered.iterrows():

    muts_to_consider = expand_mutations(row)
    if not muts_to_consider:
        # No combinations means nothing to emit for this row; skipping here
        # also keeps the weight computation below from dividing by zero.
        continue

    # Every expansion of the same row carries an equal share of the weight.
    weight = 1 / len(muts_to_consider)

    for i, sequence in enumerate(muts_to_consider):
        # Key order is kept as SeqID, drug columns, sequence, weight.
        new_data = {'SeqID': seqid}
        new_data.update((drug, row[drug]) for drug in drug_cols)
        new_data['sequence'] = SeqRecord(
            Seq(''.join(sequence), alphabet=ProteinAlphabet()),
            id='{0}-{1}'.format(seqid, i),
        )
        new_data['weight'] = weight
        expanded_records.append(new_data)
expanded_data = pd.DataFrame(expanded_records)
# NOTE(review): the 'sequence' column holds SeqRecord objects, so the CSV stores
# whatever text pandas renders for them — confirm the file is usable downstream.
expanded_data.to_csv('hiv-protease-data-expanded.csv')

In [168]:
# Preview the first ten rows of the expanded dataframe.
expanded_data.head(10)

Unnamed: 0,ATV,DRV,FPV,IDV,LPV,NFV,SQV,SeqID,TPV,sequence,weight
0,,,2.5,16.3,,38.6,16.1,2996,,"(P, Q, I, T, L, W, Q, R, P, I, V, T, I, K, I, ...",0.5
1,,,2.5,16.3,,38.6,16.1,2996,,"(P, Q, I, T, L, W, Q, R, P, I, V, T, I, K, I, ...",0.5
2,,,0.7,0.8,,0.8,1.1,4387,,"(P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, V, ...",0.25
3,,,0.7,0.8,,0.8,1.1,4387,,"(P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, V, ...",0.25
4,,,0.7,0.8,,0.8,1.1,4387,,"(P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, V, ...",0.25
5,,,0.7,0.8,,0.8,1.1,4387,,"(P, Q, I, T, L, W, Q, R, P, L, V, T, I, K, V, ...",0.25
6,32.0,,3.0,35.0,32.0,29.0,164.0,4426,,"(P, Q, I, T, L, W, Q, R, P, I, V, T, I, K, I, ...",1.0
7,,,1.5,1.0,,2.2,1.1,4432,,"(P, Q, I, T, L, W, Q, R, P, L, V, T, V, K, I, ...",1.0
8,,,3.9,20.2,,21.6,9.2,4482,,"(P, Q, I, T, L, W, Q, R, P, V, V, T, I, K, I, ...",0.5
9,,,3.9,20.2,,21.6,9.2,4482,,"(P, Q, I, T, L, W, Q, R, P, V, V, T, I, K, I, ...",0.5
