# Simple Matrix Factorization Collaborative Filtering for Drug Repositioning on Diseases

This notebook addresses the discovery of new biological interactions, such as interactions between drugs and diseases, using matrix-factorization collaborative filtering.

In [None]:
import pandas as pd
# Load the disease/drug table (presumably one row per disease-drug pair --
# confirm against the CSV). The file's first column is used as the index.
DISEASE_DRUG_CSV = 'data/email_attachments/diseases_drugs_th.csv'
dis_drg = pd.read_csv(DISEASE_DRUG_CSV, index_col=0)

# Other summary tables, left disabled here:
# cell_df = pd.read_csv(filepath_or_buffer='data/cell_summary.csv', index_col=0)
# comp_df = pd.read_csv(filepath_or_buffer='data/comp_summary.csv', index_col=0)


Next, we grid-search the SVD hyperparameters to find the best model, and then compare training performance with the test dataset (v27).

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import accuracy
import numpy as np

# Reproducibility. The Surprise FAQ recommends seeding NumPy's global RNG so
# that the cross-validation fold shuffling is repeatable; `random_state` in the
# grids below additionally pins every SVD candidate's own initialisation.
SEED = 42
np.random.seed(SEED)

# Every row of `dis_drg` is treated as an observed interaction, so each row
# gets an implicit rating of 1. Assigning (rather than appending) the column
# keeps this cell safe to re-run.
dis_drg['Interaction'] = np.ones(dis_drg.shape[0])

# Surprise reads the frame's columns positionally as (user, item, rating), so
# `dis_drg` must hold exactly three columns at this point.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(dis_drg, reader)

# Narrow grid: only `n_factors` varies; the other values are the ones noted as
# best in earlier runs. Kept under its own name instead of being silently
# overwritten, so switching grids is a one-line change below.
narrow_grid = {'n_factors': [10, 50, 100, 200, 300],
               # NOTE(review): the earlier note here said "Best is: 300" while
               # the value is 30 -- confirm which one is intended.
               'n_epochs': [30],
               'lr_all': [0.002],   # Best is .002
               'reg_all': [0.1],    # Best is .1
               'biased': [True],
               'random_state': [SEED],
               }

# Exhaustive grid: 10 * 7 * 6 * 8 * 2 = 6,720 combinations, i.e. 67,200 fits
# with cv=10. Expect a very long run.
full_grid = {'n_factors': [10, 50, 100, 200, 300, 400, 500, 1000, 2000, 5000],  # Best is: 10
             'n_epochs': [10, 50, 100, 200, 300, 400, 500],                     # Best is: 300
             'lr_all': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05],                 # Best is .002
             'reg_all': [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0],             # Best is .1
             'biased': [True, False],                                           # Best is True
             'random_state': [SEED],
             }

# The exhaustive grid is the one that was effectively in use before.
param_grid = full_grid

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse', 'mae'],
                           cv=10, n_jobs=-1, refit=True)
grid_search.fit(data)

# Candidate with the lowest cross-validated RMSE.
algo = grid_search.best_estimator['rmse']
print(algo)
# The estimator's repr does not show its settings, so report them explicitly.
print('Best RMSE: %.6f with params %s'
      % (grid_search.best_score['rmse'], grid_search.best_params['rmse']))

Now we fit the selected model on the full training set and generate predictions.

In [None]:
from surprise.model_selection import train_test_split

# Fit the selected model on every observed pair (no hold-out split).
trainset = data.build_full_trainset()
# trainset, testset = train_test_split(data, test_size=.2)
algo.fit(trainset)

# Score both classes:
#   * observed pairs, which carry the rating stored in the trainset, and
#   * unobserved pairs, which `build_anti_testset` labels with `fill` (0).
# Scoring only the anti-testset would give every prediction the same
# ground-truth label, and a ROC curve / AUC is undefined for a single class.
known_pairs = trainset.build_testset()
unknown_pairs = trainset.build_anti_testset(fill=0)
predictions = algo.test(known_pairs + unknown_pairs)
# predictions = algo.test(testset)
print('Biased accuracy on v24,', end='   ')

accuracy.rmse(predictions)


Finally, we plot the ROC curve and compute the AUC.

In [None]:
import os

import matplotlib.pyplot as plt
from sklearn import metrics

# Ground-truth labels and model estimates, one entry per prediction.
r = [p.r_ui for p in predictions]
est = [p.est for p in predictions]

# savefig fails if the target directory is missing.
os.makedirs('img', exist_ok=True)

# Scatter of estimate vs. ground truth.
# The figure is saved BEFORE plt.show(): once shown, inline/non-interactive
# backends discard the current figure and a later savefig writes a blank image.
fig, ax = plt.subplots()
ax.scatter(r, est)
ax.set_xlabel('True interaction (r_ui)')
ax.set_ylabel('Estimated interaction (est)')
fig.savefig('img/surprise_gridcv_v24v27.png')
plt.show()
plt.close(fig)

#ROC Curve
y = pd.DataFrame({'r': r, 'est': est})

# Negatives may come back labelled -1 instead of 0 (presumably a rating-scale
# offset artifact in some Surprise versions -- confirm); fold them into 0 so
# the labels are binary.
y.loc[y['r'] == -1, 'r'] = 0

# ROC/AUC need both classes; say so up front instead of leaving the reader
# with a cryptic scikit-learn error or a NaN.
if y['r'].nunique() < 2:
    print('Warning: predictions contain a single class; ROC/AUC are undefined.')

fpr, tpr, thresholds = metrics.roc_curve(y_true=y['r'], y_score=y['est'])
auc = metrics.roc_auc_score(y_true=y['r'], y_score=y['est'])
print('AUC: %.9f' % auc)

# ROC curve with the chance diagonal for reference; again saved before show.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='orange', label='ROC')
ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
fig.savefig('img/surprise_roc_v24v27.png')
plt.show()
plt.close(fig)