# Drug Mining using SVD Recommendation Strategy

## Import Dependencies

In [1]:
import pandas as pd
import numpy as np

## Read Data

- Drug Disease Relationship

In [2]:
# Load the known drug/disease relations; each row pairs a drug name with a
# disease name and the effect reported for that pair.
init_drug_disease_relation = pd.read_csv(
    filepath_or_buffer="RawData/KnownDrugDiseaseRelation.csv"
)
init_drug_disease_relation.head(n=5)

Unnamed: 0,Drug name,Disease name,Reported Effect,Unnamed: 3,Unnamed: 4
0,3-keto-desogestrel,Amnesia,side effect,,
1,3-keto-desogestrel,"Anemia, Hemolytic, Autoimmune",side effect,,
2,3-keto-desogestrel,"Arrhythmia, Sinus",side effect,,
3,3-keto-desogestrel,Bronchopneumonia,side effect,,
4,3-keto-desogestrel,Burkitt Lymphoma,side effect,,


- Drug Data

In [3]:
# Load the drug lookup table (drug name plus its DrugBank, MeSH and CAS ids).
drug = pd.read_csv(filepath_or_buffer="RawData/Drug.csv")
drug.head(n=5)

Unnamed: 0,Drug name,Drugbank id,Mesh id,CAS id,Unnamed: 4
0,3-keto-desogestrel,DB00294,C044815,54048-10-1,
1,4-Aminopyridine,DB06637,D015761,504-24-5,
2,4-hydroxybutyric acid,DB01440,C111420,591-81-1,
3,"5,6,7,8-tetrahydrobiopterin",DB00360,C003402,17528-72-2,
4,6-Mercaptopurine,DB01033,D015122,50-44-2,


- Disease Data

In [4]:
# Load the disease lookup table (disease name plus its MeSH and UMLS ids).
disease = pd.read_csv(filepath_or_buffer="RawData/Disease.csv")
disease.head(n=5)

Unnamed: 0,Disease name,Mesh id,UMLS id,Unnamed: 3,Unnamed: 4
0,"Abdomen, Acute",D000006,C0000727,,
1,Abdominal Neoplasms,D000008,C0000735,,
2,Abdominal Pain,D015746,C0000737,,
3,"Abnormalities, Drug-Induced",D000014,C0000771,,
4,"Abortion, Spontaneous",D000022,C0000786,,


## Preprocess Data

- The goal of the preprocessing is to build a relationship matrix indexed by drug id (rows) and disease id (columns)
- If the reported effect is therapeutic (positive), the relationship is 1
- If the reported effect is a side effect (negative), the relationship is -1
- If the effect is unknown, the relationship is 0

In [5]:
def getDrugbankId(drugName):
    """Look up the DrugBank id for a drug name in the global ``drug`` table.

    Args:
        drugName: Value compared for equality against the 'Drug name' column.

    Returns:
        The 'Drugbank id' of the first matching row, or the sentinel string
        "Drug Not Found" when no row matches.
    """
    matches = drug.loc[drug['Drug name'] == drugName, 'Drugbank id']
    try:
        return matches.values[0]
    except IndexError:
        # Only "no matching row" maps to the sentinel; any other failure
        # (for example a missing column) is allowed to surface.
        return "Drug Not Found"

In [6]:
def getDiseaseMeshId(diseaseName):
    """Look up the MeSH id for a disease name in the global ``disease`` table.

    Args:
        diseaseName: Value compared for equality against the 'Disease name'
            column.

    Returns:
        The 'Mesh id' of the first matching row, or the sentinel string
        "Disease Not Found" when no row matches.
    """
    matches = disease.loc[disease['Disease name'] == diseaseName, 'Mesh id']
    try:
        return matches.values[0]
    except IndexError:
        # Only "no matching row" maps to the sentinel; any other failure
        # (for example a missing column) is allowed to surface.
        return "Disease Not Found"

In [7]:
# Resolve every drug and disease name to its identifier (or to the
# corresponding "... Not Found" sentinel), then drop the name columns and the
# two unnamed columns.
init_drug_disease_relation['Drug bank Id'] = (
    init_drug_disease_relation['Drug name'].apply(getDrugbankId)
)
init_drug_disease_relation['Disease Mesh Id'] = (
    init_drug_disease_relation['Disease name'].apply(getDiseaseMeshId)
)
init_drug_disease_relation = init_drug_disease_relation.drop(
    columns=['Drug name', 'Disease name', 'Unnamed: 3', 'Unnamed: 4']
)

In [8]:
# Preview the relation table now that names have been replaced by ids.
init_drug_disease_relation.head()

Unnamed: 0,Reported Effect,Drug bank Id,Disease Mesh Id
0,side effect,DB00294,D000647
1,side effect,DB00294,D000744
2,side effect,DB00294,D001146
3,side effect,DB00294,D001996
4,side effect,DB00294,D002051


In [13]:
# Start from an empty frame; the fill loop below adds drug ids as row labels
# and disease ids as column labels one cell at a time.
init_drug_disease_relation_matrix = pd.DataFrame()

In [16]:
# Score assigned to each reported-effect label. Labels are compared after
# stripping surrounding whitespace, so 'therapeutic effect' is recognised with
# or without a trailing tab.
effect_scores = {'therapeutic effect': 1, 'side effect': -1}

relation_rows = zip(
    init_drug_disease_relation['Drug bank Id'],
    init_drug_disease_relation['Disease Mesh Id'],
    init_drug_disease_relation['Reported Effect'],
)
for drug_id, disease_id, value in relation_rows:
    # Skip rows whose drug or disease name could not be resolved to an id.
    if drug_id == 'Drug Not Found' or disease_id == 'Disease Not Found':
        continue
    score = effect_scores.get(str(value).strip())
    if score is not None:
        # .at enlarges the frame on demand: rows are drug ids, columns are
        # disease ids. A later row for the same pair overwrites the earlier one.
        init_drug_disease_relation_matrix.at[drug_id, disease_id] = score

In [28]:
# Pairs that were never assigned a score are missing; treat them as 0.
init_drug_disease_relation_matrix = init_drug_disease_relation_matrix.fillna(value=0)
init_drug_disease_relation_matrix.head(n=5)

Unnamed: 0,D000647,D000744,D001146,D001996,D002051,D002446,D002532,D002544,D002971,D003128,...,D000690,D003920,D007414,D014811,D019873,D010378,D009459,D016736,D006562,D020138
DB00294,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB01033,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB01267,0.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00284,0.0,-1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Persist the filled relation matrix (row labels included) as a CSV file.
init_drug_disease_relation_matrix.to_csv("init_drug_disease_relation_matrix.csv")

## Start SVD Analysis

In [30]:
# Full SVD of the relation matrix: u and vh are square, s holds the singular
# values.
u, s, vh = np.linalg.svd(init_drug_disease_relation_matrix, full_matrices=True)

In [31]:
# Shapes of the three SVD factors, in the order (u, s, vh).
tuple(factor.shape for factor in (u, s, vh))

((654, 654), (563,), (563, 563))

In [32]:
# Sanity check: using every singular value must reproduce the original matrix.
singular_value_number = len(s)
np.allclose(
    init_drug_disease_relation_matrix,
    (u[:, :singular_value_number] * s) @ vh,
)

True

In [33]:
# Keep only the leading fraction (keep_rate) of the singular values; the
# product is truncated towards zero to get a whole number of components.
singular_value_number = len(s)
keep_rate = 0.12
keep_number = int(singular_value_number * keep_rate)
keep_number

67

In [34]:
# Rank-`keep_number` approximation of the relation matrix, rebuilt from the
# leading singular triplets. The frame is constructed directly from the array
# (reusing the original row/column labels) instead of writing through
# `.values`, which is not guaranteed to be a writable view of the frame.
reconstruct_drug_disease_relation = pd.DataFrame(
    np.dot(u[:, :keep_number] * s[:keep_number], vh[:keep_number, :]),
    index=init_drug_disease_relation_matrix.index,
    columns=init_drug_disease_relation_matrix.columns,
)
reconstruct_drug_disease_relation.head()

Unnamed: 0,D000647,D000744,D001146,D001996,D002051,D002446,D002532,D002544,D002971,D003128,...,D000690,D003920,D007414,D014811,D019873,D010378,D009459,D016736,D006562,D020138
DB00294,-0.278317,-0.681843,-0.331201,-0.378043,-0.605217,-0.59033,-0.507194,-0.506194,-0.148257,-0.523923,...,-0.013416,-0.029666,-0.013416,-0.006398,-0.006398,-0.007973,-0.00583,0.013271,-0.000168,-0.003231
DB00360,-0.056903,0.16172,-0.013412,-0.129826,-0.098111,-0.063728,-0.026327,-0.541821,0.005956,-0.35178,...,0.002503,-0.030056,0.002503,0.005982,0.005982,0.008859,0.014114,0.002103,-0.003994,-0.01269
DB01033,0.030332,-0.820692,-0.246105,-0.15409,-0.165664,-0.745996,-0.026524,-0.05384,-0.14855,-0.448419,...,0.050883,0.088423,0.050883,-0.019104,-0.019104,0.004752,0.002638,-0.041895,0.002363,-0.025212
DB01267,-0.077903,-0.046394,-0.375154,0.065177,-0.606484,-0.185595,0.027812,-0.262429,-0.013846,0.096221,...,0.002919,0.002784,0.002919,-3.1e-05,-3.1e-05,-0.018208,-0.003885,0.00587,0.004736,-0.009811
DB00284,-0.386872,-0.468148,-0.215554,-0.622434,-0.090623,0.020544,-0.480147,0.085725,-0.018399,-0.214187,...,0.020377,0.017784,0.020377,0.013496,0.013496,-0.00798,-0.003913,0.018722,-0.007007,-0.000587


## Use Medical Genetics Predictions to Validate Our SVD Predictions

In [46]:
# Load the medical-genetics predictions, keep only the id columns and the
# predicted effect, and encode the effect labels as -1 / 1.
validation = (
    pd.read_csv("RawData/MedicalGeneticsPrediction.csv")
    .drop(columns=["Drug name", "Disease Name", "Drug action", "Function"])
    .replace("side effect", -1)
    .replace("therapeutic effect", 1)
)
validation.to_csv("validation.csv")
validation.head(n=5)

Unnamed: 0,Drug Drugbank id,Disease Mesh ID,Medical genetics based prediction
0,DB00106,C537919,-1
1,DB00106,D007006,-1
2,DB01193,D006333,-1
3,DB01418,C564741,-1
4,DB00819,D010022,-1


In [47]:
def check_svd_prediction(drug_id, disease_id):
    """Return the reconstructed SVD score for a (drug, disease) pair.

    Reads the global ``reconstruct_drug_disease_relation`` frame, whose columns
    are looked up by ``disease_id`` and whose rows are looked up by ``drug_id``.

    Args:
        drug_id: Row label to look up.
        disease_id: Column label to look up.

    Returns:
        The stored score, or the sentinel string "Prediction Not Found" when
        either label is absent from the frame.
    """
    try:
        return reconstruct_drug_disease_relation[disease_id][drug_id]
    except KeyError:
        # Only a missing label maps to the sentinel; other errors surface.
        return "Prediction Not Found"

In [50]:
# Look up the SVD score for every validation pair and assign the whole column
# in one pass. Pairs missing from the reconstructed matrix receive the
# "Prediction Not Found" sentinel, so the column holds both floats and strings.
validation['Raw SVD Predict'] = [
    check_svd_prediction(drug_id, disease_id)
    for drug_id, disease_id in zip(
        validation['Drug Drugbank id'], validation['Disease Mesh ID']
    )
]

In [51]:
# Keep only the pairs for which an SVD score exists. The explicit copy makes
# the result an independent frame, so columns can be added to it later without
# pandas treating it as a slice of the unfiltered frame.
validation = validation[validation['Raw SVD Predict'] != 'Prediction Not Found'].copy()
validation.head()

Unnamed: 0,Drug Drugbank id,Disease Mesh ID,Medical genetics based prediction,Raw SVD Predict
2,DB01193,D006333,-1,-0.0369814
7,DB00459,D015473,-1,-0.0433578
11,DB00210,D015473,-1,-0.164438
21,DB00404,D004832,1,0.332956
37,DB00182,D020734,-1,-0.27192


## Discretize SVD Predictions: Negative to -1, Zero to 0, Positive to 1

In [53]:
def process_raw_predict(m):
    """Collapse a raw score to -1, 0 or 1.

    Args:
        m: Numeric score.

    Returns:
        -1 when ``m`` is below zero, 0 when ``m`` equals zero, and 1 for
        every other value.
    """
    if m < 0:
        return -1
    return 0 if m == 0 else 1

In [54]:
# Discretise every raw SVD score into -1 / 0 / 1.
validation['SVD Predict'] = validation['Raw SVD Predict'].apply(process_raw_predict)
validation.head(n=5)

Unnamed: 0,Drug Drugbank id,Disease Mesh ID,Medical genetics based prediction,Raw SVD Predict,SVD Predict
2,DB01193,D006333,-1,-0.0369814,-1
7,DB00459,D015473,-1,-0.0433578,-1
11,DB00210,D015473,-1,-0.164438,-1
21,DB00404,D004832,1,0.332956,1
37,DB00182,D020734,-1,-0.27192,-1


## Agreement Rate Between SVD Predictions and Medical Genetics Predictions

In [56]:
# Flag each row with 1.0 when the discretised SVD prediction equals the
# medical-genetics prediction and 0.0 otherwise. The comparison is done on the
# whole columns at once; the float cast matches the dtype that the previous
# cell-by-cell assignment produced.
validation['Prediction Right'] = (
    validation['Medical genetics based prediction'] == validation['SVD Predict']
).astype(float)

In [60]:
# Show rows 10 through 19 by position (not by index label).
validation.iloc[10:20]

Unnamed: 0,Drug Drugbank id,Disease Mesh ID,Medical genetics based prediction,Raw SVD Predict,SVD Predict,Prediction Right
124,DB00564,D014693,-1,-0.106189,-1,1.0
130,DB00521,D006333,-1,-0.081127,-1,1.0
136,DB00475,D004832,1,0.248697,1,1.0
171,DB00882,D001943,-1,-0.2096,-1,1.0
173,DB00882,D008881,1,-0.166484,-1,0.0
185,DB00907,D014693,-1,0.0317429,1,0.0
187,DB00924,D012559,-1,-0.0196695,-1,1.0
189,DB00434,D012559,-1,-0.00505454,-1,1.0
204,DB01254,D046152,1,-0.013152,-1,0.0
205,DB01254,D009373,1,0.00820425,1,1.0


In [62]:
# Share of validated pairs on which the SVD prediction agrees with the
# medical-genetics prediction.
len_validation = len(validation)
len_predict_right = int((validation['Prediction Right'] == 1).sum())
# With no validated pairs the rate is undefined; report NaN instead of
# raising ZeroDivisionError.
right_rate = len_predict_right / len_validation if len_validation else float("nan")
print(f"{right_rate*100}% in SVD Prediction occcurs in Medical Genetics Prediction")

62.88659793814433% in SVD Prediction occcurs in Medical Genetics Prediction
