In [1]:
import numpy as np
import pandas as pd
import time
from collections import OrderedDict
from sklearn import preprocessing
from matplotlib import pyplot as plt
import faiss
from sklearn import neighbors
import annoy
from sklearn import cluster

# Data Preparation

In [2]:
%%time
#200000 compounds with 469 features
all_features = pd.read_csv("c2vpoint2m.txt.gz", sep="\t", header=None)

CPU times: user 33.8 s, sys: 9.64 s, total: 43.4 s
Wall time: 1min 12s


In [3]:
#make the compound names the index
all_features.set_index(0, inplace=True)

In [4]:
%%time
#725634 compounds with target and activity
cgan = pd.read_csv("activities.txt.gz", sep="\t")

CPU times: user 28.7 s, sys: 12.6 s, total: 41.2 s
Wall time: 38 s


In [5]:
#we are only interested in the compounds whose target is EGFR
egfr_activity = cgan[cgan['target'] == 'EGFR'].set_index('compound')

In [6]:
#separate into two dataframes, one with active compounds
egfr_active = egfr_activity[egfr_activity['activity'] >= 6]
egfr_active_compounds = egfr_active.index.values

In [7]:
#and one with inactive ones
egfr_inactive = egfr_activity[egfr_activity['activity']  < 6]
egfr_inactive_compounds = egfr_inactive.index.values

In [8]:
#5275 compounds with EGFR target and 469 features
egfr_features = pd.read_csv("egfr.c2v.txt", sep="\t", header=None).set_index(0)

In [9]:
#isolate the names for easier access
egfr_compounds = egfr_features.index.values

In [10]:
#make dataframes with features for the active and inactive compounds
egfr_active_features = egfr_features.loc[egfr_active_compounds]
egfr_inactive_features = egfr_features.loc[egfr_inactive_compounds]

In [11]:
#scale the data so that each feature is normalized and can be compared to other features
min_max_scaler = preprocessing.MinMaxScaler() #fits data between 0 and 1

In [12]:
#normalize the active compounds
egfr_active_features_norm = pd.DataFrame(min_max_scaler.fit_transform(egfr_active_features), 
                                         index=egfr_active_features.index)

In [13]:
#normalize the inactive compounds
egfr_inactive_features_norm = pd.DataFrame(min_max_scaler.fit_transform(egfr_inactive_features), 
                                           index=egfr_inactive_features.index)

In [14]:
#remove duplicates from all_features - can also use DataFrame.drop_duplicates()
all_features_cleaned = all_features.loc[list(set(all_features.index.values) - set(egfr_compounds))]

In [15]:
#normalize the database
all_features_cleaned_norm = pd.DataFrame(min_max_scaler.fit_transform(all_features_cleaned), index=all_features_cleaned.index)

# Clusters


In [16]:
#use sklearn's KMeans algorithm to split the data of the active compounds into similar clusters
kmeans = cluster.KMeans(n_clusters=10, random_state=0).fit(egfr_active_features)

In [17]:
temp = pd.DataFrame({'labels':kmeans.labels_})

In [22]:
temp['labels'].value_counts()

1    494
5    487
9    409
8    392
3    280
7    213
6    162
0    149
2     91
4      1
Name: labels, dtype: int64

In [19]:
#Mainly working with normalized data
kmeans2 = cluster.KMeans(n_clusters=10, random_state=0).fit(egfr_active_features_norm)

In [None]:
#create a list of lists of indices corresponding to the different clusters
indices = []
for i in range(10):
    indices.append([])
for i in range(len(kmeans2.labels_)):
    indices[kmeans2.labels_[i]].append(i)
indices = [[egfr_active_features_norm.index.values[j] for j in i] for i in indices]

In [36]:
indices

[['ACZLAVUSXSLQRQ-VEJCKLOMNA-N',
  'AKWVKIGCMCHYJT-ZFJUVRDGNA-N',
  'APBCLSCKMPCKMF-LQFNOIFHNA-N',
  'ARTXCXRRTXRAOJ-UHFFFAOYNA-N',
  'ASCVQXWPIHGQIB-HGKQKYKZNA-N',
  'AXCWTPZBNBDGJR-MRSUPTMINA-N',
  'BMGQWWVMWDBQGC-SAQRDVLPNA-N',
  'BMSYYYVQJGFDHW-BVUZABKHNA-N',
  'BRBGAJDUHBVSLV-UHFFFAOYNA-N',
  'BSFNGHCTZDCQBF-NSJMMFDCNA-N',
  'BUAIGJRXIAKFCN-UHFFFAOYNA-N',
  'BUUKIEKQOHCZPU-UHFFFAOYNA-N',
  'CCGBAJCQZPJWCS-FDCKXEGSNA-N',
  'CGGGWVZOXQYNNH-VPGSJBGJNA-N',
  'CWFIQHQECWPWQG-QGSRTERENA-N',
  'CXAHRFUHLAWAPM-USCOTLSHNA-N',
  'DEEISBGSUUROMS-UHFFFAOYNA-N',
  'DMZTWOCJNVKCBW-OKPOJWAQNA-N',
  'DOKIBWCVABZHOH-LELJVTLKNA-N',
  'DURYAAKIQVDWGP-TWSYTRIPNA-N',
  'FOLBKSQFUPUWKJ-UHFFFAOYNA-N',
  'GALVWTFFXPRMQM-FNUBFHOMNA-N',
  'GFTOODFOXIDLNF-RTNDPQTINA-N',
  'GPGZMTPPOHCJRE-XBSUQPIJNA-O',
  'HADVGAWGEIOCTD-MRSUPTMINA-N',
  'HANZKDSBPBFJQB-DMJBNCFNNA-N',
  'HCTDQRULARFYTB-CSKMVECVNA-N',
  'HHFBDROWDBDFBR-PINXXQJSNA-N',
  'HJEPFPDXSFKWQH-WYJXGVAINA-N',
  'HKSZLNNOFSGOKW-PULLEYNNNA-N',
  'HKSZLNN

In [25]:
#split the dataframe into 10 different ones (one for each cluster) and keep them together in a list
clusters = []
for i in range(10):
    clusters.append(egfr_active_features_norm.loc[indices[i]])

In [40]:
for c in clusters:
    print(c.shape)

(149, 469)
(494, 469)
(91, 469)
(280, 469)
(1, 469)
(487, 469)
(162, 469)
(213, 469)
(392, 469)
(409, 469)


In [112]:
#choose one of the larger clusters as a query
query = clusters[1]

In [113]:
#make the rest of the data into the database
database = pd.concat(clusters[:1] + clusters[2:] + [all_features_cleaned_norm, egfr_inactive_features_norm])

In [114]:
#create the index
index = faiss.IndexFlatL2(469)

In [115]:
#add the training data from the database
index.add(np.ascontiguousarray(database, 'float32'))

In [128]:
#run the query
D, I = index.search(np.ascontiguousarray(query, 'float32'), 200)

In [130]:
#calculate quality
sum = 0
for i in I:
    count = 0
    for j in i:
        if database.index.values[j] in egfr_active.index.values:
            count += 1
    sum += count/float(200)
average = sum/query.shape[0]

KeyboardInterrupt: 

In [None]:
average

In [123]:
#calculate how many of the retrieved neighbors were in each cluster
sums = []
for i in range(10):
    sums.append(0)
    
for i in I:
    for j in i:
        for c in range(len(clusters)):
            if database.index.values[j] in clusters[c].index.values:
                sums[c] += 1

In [124]:
sums

[0, 0, 128, 8116, 0, 64607, 213, 36, 3601, 22099]