In [19]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import implicit
import scipy
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

df1 = pd.read_excel("./drugnsc.xlsx")
df2 = pd.read_excel("./nscquerylist.xlsx")

In [20]:
df1.sample(3)

Unnamed: 0,NSC # b,Drug name,FDA status,Mechanism of action c,PubChem SID,SMILES d,BR:MCF7,BR:MDA-MB-231,BR:HS 578T,BR:BT-549,...,RE:786-0,RE:A498,RE:ACHN,RE:CAKI-1,RE:RXF 393,RE:SN12C,RE:TK-10,RE:UO-31,Total experiments,Total after quality control
8934,658490,-,-,-,508320,CN1\C(=C/c2cccc(Cl)c2Cl)\C(=O)N(\C=C\3/C(=O)Oc...,-0.36,-0.5,1.37,-0.15,...,-0.62,1.2,0.21,-0.42,-0.55,-0.82,na,na,1,1
20864,763123,-,-,-,-,CCN(Cc1ccc(OC)cc1)c2ccc(CO)c(Cl)c2,1.51,-0.83,-0.9,-0.64,...,-0.54,2.26,-0.83,0.57,-0.35,-0.7,1.94,-0.28,2,2
7768,647050,-,-,-,503345,C1CCCC2=O[Nd+3]345(O=C(CCCCCCC6=O[Nd+3]78(O=C(...,na,na,na,na,...,-0.29,-0.29,-0.29,-0.29,na,-0.29,-0.29,-0.29,1,1


In [21]:
copy_df1 = df1.drop('Drug name', 1).drop('FDA status', 1).drop('Mechanism of action c', 1).drop('PubChem SID', 1).drop('SMILES d', 1).drop('Total experiments', 1).drop('Total after quality control', 1)


In [22]:
copy_df1 = copy_df1.set_index('NSC # b')
copy_df1 = copy_df1.replace('na', '0.00')
copy_df1 = copy_df1.stack().reset_index().rename(columns={'level_0':'drug','level_1':'cancer_cell_lines', 0:'z_score'})

In [23]:
copy_df1.sample(3)

Unnamed: 0,NSC # b,cancer_cell_lines,z_score
998641,717167,BR:MDA-MB-231,-1.84
100006,99282,OV:OVCAR-5,-0.42
822041,694256,LC:NCI-H460,-0.25


In [24]:
copy_df1['NSC # b'] = copy_df1['NSC # b'].astype("category")
copy_df1['cancer_cell_lines'] = copy_df1['cancer_cell_lines'].astype("category")

In [25]:
copy_df1.sample(3)

Unnamed: 0,NSC # b,cancer_cell_lines,z_score
678089,676553,ME:SK-MEL-5,0.98
547611,661096,PR:DU-145,-0.16
1219825,756152,ME:MALME-3M,-0.78


In [33]:
zscores = scipy.sparse.csr_matrix((copy_df1['z_score'].astype(float),
                       (copy_df1['cancer_cell_lines'].cat.codes.copy(),
                        copy_df1['NSC # b'].cat.codes.copy())))
zscores = bm25_weight(zscores, K1=100, B=0.8)
model = implicit.als.AlternatingLeastSquares(factors=40,
                                        regularization=20,
                                        use_native=False,
                                        use_cg=False,
                                        iterations=5)
model.fit(zscores)

In [34]:
# Recommend a cancer cell line for a drug - [0,21738]
recommended_cell_lines = model.recommend(10000, zscores2.T)
recommended_cell_lines

[(52, 9.8148261034252864e-59),
 (55, 3.7838474234825754e-59),
 (51, 8.3580766184834039e-60),
 (4, 3.7313418799020025e-60),
 (6, 3.7096232323349794e-60),
 (12, 1.8494135314887833e-60),
 (21, 1.8064830984479417e-60),
 (7, 1.717005784060618e-60),
 (1, 1.322375619206584e-60),
 (56, 1.0156411538467729e-60)]

In [35]:
# Similar cancer cell line - [0,60]
similar_cell_lines = model.similar_items(5)
similar_cell_lines

[(5, 1.0),
 (57, 0.99828772783059194),
 (48, 0.98579062777603643),
 (3, 0.96190837657887174),
 (20, 0.96152840884781665),
 (40, 0.88753468756304621),
 (50, 0.88684557873590242),
 (42, 0.82350697586415922),
 (45, 0.78840396726461581),
 (9, 0.77097874813317513)]

In [36]:
zscores.nnz
zscores

<60x21738 sparse matrix of type '<class 'numpy.float64'>'
	with 1304280 stored elements in COOrdinate format>