In [1]:
import copy
import csv
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             precision_score, recall_score, roc_auc_score,
                             roc_curve)


In [2]:

def read_csv_files(file_path: str):
    """Read the first two columns of a tab-separated file as a list of pairs.

    Parameters
    ----------
    file_path : str
        Path to a tab-delimited text file.

    Returns
    -------
    list of tuple
        One ``(first_column, second_column)`` tuple of strings per non-blank
        row, in file order. Any further columns are ignored.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two columns.
    """
    with open(file_path, newline="") as file:
        reader = csv.reader(file, delimiter="\t")
        # csv.reader yields an empty list for a blank line (for example a
        # stray empty line in a hand-edited file); skip those rows instead of
        # failing on row[0].
        return [(row[0], row[1]) for row in reader if row]


In [3]:
def create_pairs_order_uniq(iteration, seed=30):
    """Load the four pair lists that belong to one cross-validation fold.

    Parameters
    ----------
    iteration
        Fold identifier; its ``str()`` form is inserted into each file name.
    seed
        Accepted for call compatibility; this function does not read it.

    Returns
    -------
    tuple
        ``(positive_pairs, negative_pairs, all_pairs)`` where
        ``positive_pairs`` is training positives followed by test positives,
        ``negative_pairs`` is training negatives followed by test negatives,
        and ``all_pairs`` is the list
        ``[training positives, test positives, training negatives, test negatives]``.
    """
    # File-name stems in the order the lists are loaded and unpacked below.
    stems = (
        "./asymmetric_pairs/trainingpositive_idp_crossval",
        "./asymmetric_pairs/testpositive_idp_crossval",
        "./asymmetric_pairs/outbalanced_c2_16_negative_training_crossval",
        "./asymmetric_pairs/testnegative_crossval",
    )
    train_pos, test_pos, train_neg, test_neg = [
        read_csv_files(stem + str(iteration) + ".csv") for stem in stems
    ]

    return (
        train_pos + test_pos,
        train_neg + test_neg,
        [train_pos, test_pos, train_neg, test_neg],
    )

In [4]:

# %%

def preprocessing_features_orig(input_feature,operator,positive_pairs,negative_pairs):
    """Combine the feature rows of both members of every pair into one row per pair.

    Parameters
    ----------
    input_feature : pandas.DataFrame
        Feature table with a unique index; pair members are looked up by
        index label and every column is treated as one feature.
    operator : str
        How the two members' values are combined, feature by feature:
        ``'minus'`` (first member minus second member), ``'abs_minus'``
        (absolute difference), ``'sum'`` or ``'multiply'``.
    positive_pairs, negative_pairs : iterable of tuple
        Pairs of index labels. Each pair must be hashable because it is used
        as the label of its output row.

    Returns
    -------
    pandas.DataFrame
        Rows for ``positive_pairs`` followed by rows for ``negative_pairs``,
        labelled by the pair, with one column per feature. A pair occurring
        more than once within the same list produces a single row.

    Raises
    ------
    ValueError
        If ``operator`` is not one of the supported names.
    KeyError
        If a pair member is not present in the index of ``input_feature``.
    """
    # Each combiner receives (value of the first member, value of the second
    # member); operand order inside each expression is kept as it always was.
    combiners = {
        'abs_minus': lambda first, second: abs(second - first),
        'sum': lambda first, second: second + first,
        'minus': lambda first, second: first - second,
        'multiply': lambda first, second: second * first,
    }
    if operator not in combiners:
        # Previously an unknown operator surfaced as an unbound-variable error.
        raise ValueError(
            "Unknown operator {!r}; expected one of {}".format(operator, sorted(combiners))
        )
    combine = combiners[operator]

    # {index label: {feature name: value}}; every inner dict has the same keys
    # because they all come from the same frame's columns.
    features_by_id = input_feature.to_dict(orient="index")

    frames = []
    for pairs in (positive_pairs, negative_pairs):
        pair_features = {}
        for pair in pairs:
            first = features_by_id[pair[0]]
            second = features_by_id[pair[1]]
            pair_features[pair] = {
                name: combine(first[name], second[name]) for name in second
            }
        # The dict is keyed by pair, so pairs become columns; transpose to get
        # one row per pair.
        frames.append(pd.DataFrame(pair_features).transpose())

    return pd.concat(frames)


# %%

def preprocessing_features_single(input_feature,operator,given_pairs):
    """Combine the feature rows of both members of every pair in one pair list.

    Parameters
    ----------
    input_feature : pandas.DataFrame
        Feature table with a unique index; pair members are looked up by
        index label and every column is treated as one feature.
    operator : str
        How the two members' values are combined, feature by feature:
        ``'minus'`` (first member minus second member), ``'abs_minus'``
        (absolute difference), ``'sum'`` or ``'multiply'``.
    given_pairs : iterable of tuple
        Pairs of index labels. Each pair must be hashable because it is used
        as the label of its output row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pair, labelled by the pair and in first-seen
        order, with one column per feature.

    Raises
    ------
    ValueError
        If ``operator`` is not one of the supported names.
    KeyError
        If a pair member is not present in the index of ``input_feature``.
    """
    # Each combiner receives (value of the first member, value of the second
    # member). 'sum' was previously missing from this function.
    combiners = {
        'minus': lambda first, second: first - second,
        'abs_minus': lambda first, second: abs(second - first),
        'sum': lambda first, second: second + first,
        'multiply': lambda first, second: second * first,
    }
    if operator not in combiners:
        # Previously an unknown operator surfaced as an unbound-variable error.
        raise ValueError(
            "Unknown operator {!r}; expected one of {}".format(operator, sorted(combiners))
        )
    combine = combiners[operator]

    # {index label: {feature name: value}}; every inner dict has the same keys
    # because they all come from the same frame's columns.
    features_by_id = input_feature.to_dict(orient="index")

    pair_features = {}
    for pair in given_pairs:
        first = features_by_id[pair[0]]
        second = features_by_id[pair[1]]
        pair_features[pair] = {
            name: combine(first[name], second[name]) for name in second
        }

    # The dict is keyed by pair, so pairs become columns; transpose to get one
    # row per pair.
    return pd.DataFrame(pair_features).transpose()


# %%




# Small scoring helper: prints accuracy, recall, precision, confusion matrix, specificity and F1
def ScoreMe(model, y_test, preds):
    """Print a classification report for one set of predictions.

    Prints, in order: a header line, accuracy, recall, precision, the
    confusion matrix, specificity (``tn / (tn + fp)``) and the F1 score.

    Parameters
    ----------
    model
        Only interpolated into the header line of the report.
    y_test
        True labels.
    preds
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        Always the empty tuple ``()``.
    """
    print("{}: accuracy report".format(model))
    print("The accuracy is {}".format(accuracy_score(y_test,preds)))
    print("The recall is {}".format(recall_score(y_test, preds)))
    print("The precision is {}".format(precision_score(y_test, preds)))
    # Build the confusion matrix once and reuse it for printing and unpacking.
    matrix = confusion_matrix(y_test, preds)
    print(matrix)
    # Unpacking into four cells requires a 2x2 matrix, i.e. two classes.
    tn, fp, fn, tp = matrix.ravel()
    specificity = tn / (tn+fp)
    print("specificity is {}".format(specificity))
    # f1_score comes from sklearn.metrics; it must be part of the top-of-file import.
    print("The  F1 score  is {}".format(f1_score(y_test, preds)))
    return ()




# Small scoring helper: returns accuracy, recall, precision and F1 as a list instead of printing
def ScoreMe_2(model, y_test, preds):
    """Return accuracy, recall, precision and F1 for one set of predictions.

    Parameters
    ----------
    model
        Not used by this function; kept so the signature matches ``ScoreMe``.
    y_test
        True labels.
    preds
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    list
        ``[accuracy, recall, precision, f1]``.
    """
    # f1_score comes from sklearn.metrics; it must be part of the top-of-file import.
    return [
        accuracy_score(y_test, preds),
        recall_score(y_test, preds),
        precision_score(y_test, preds),
        f1_score(y_test, preds),
    ]


# %%


In [5]:

# Tab-separated feature tables consumed by the cross-validation loop.
files_toRead = ["./features/protR_IDRs_concenated_15.txt"]


In [7]:
# Load every feature table once up front: its contents do not depend on the
# fold, so re-reading it inside the fold loop was wasted work.
feature_tables = [
    pd.read_csv(path, sep="\t", header=0, index_col=0) for path in files_toRead
]

for q in range(1,11):
    # Walk over every configured feature file rather than a hard-coded count,
    # so extra entries in files_toRead are no longer silently ignored.
    for r, filetoRead in enumerate(files_toRead):

        print("Cross-validation - Fold: {} ".format(q))

        positive_pairs, negative_pairs, all_pairs = create_pairs_order_uniq(iteration=q, seed=30)

        df_ProtR = feature_tables[r]

        # Per-pair features: first member minus second member, for every pair
        # of the fold (training and test, positive and negative).
        result = preprocessing_features_orig(df_ProtR, 'minus', positive_pairs, negative_pairs)

        # Rows come back positives first, then negatives, so the labels are
        # laid out in the same order.
        # NOTE(review): this assignment requires one row per listed pair; a
        # pair repeated within one list collapses to a single row and makes
        # the lengths disagree.
        result['interaction'] = [1] * len(positive_pairs) + [0] * len(negative_pairs)

        # all_pairs = [training positives, test positives,
        #              training negatives, test negatives]
        trainingpositive_df = result.loc[all_pairs[0], :]
        trainingnegative_df = result.loc[all_pairs[2], :]

        testpositive_df = result.loc[all_pairs[1], :]
        testnegative_df = result.loc[all_pairs[3], :]

        df = pd.concat([trainingpositive_df, trainingnegative_df])
        y_train = df.interaction
        X_train = df.drop(['interaction'], axis=1)

        # NOTE(review): X_test / y_test are not read again in this cell; the
        # AUC below is computed on the pickled, pre-ordered test pairs. They
        # are kept in case later cells inspect them.
        df_result_test = pd.concat([testpositive_df, testnegative_df])
        y_test = df_result_test.interaction
        X_test = df_result_test.drop('interaction', axis=1)

        rfminus = RandomForestClassifier(random_state=42,n_jobs=-1,min_samples_split=0.1,max_depth=5)
        rfminus.fit(X_train,y_train)

        ## import test pairs that are already sorted based on the known node
        filename_1 = "./asymmetric_pairs/ordered_testpairs/test_file_ordered_x_"+str(q)+".pkl"
        filename_2 = "./asymmetric_pairs/ordered_testpairs/test_file_ordered_y_"+str(q)+".pkl"

        # SECURITY: pickle.load can execute arbitrary code from the file; only
        # load pickles produced by this project.
        with open(filename_1, 'rb') as f:
            testpairs_pickle = pickle.load(f)

        with open(filename_2, 'rb') as f:
            labels_pickle = pickle.load(f)

        X_test_updated = preprocessing_features_single(df_ProtR, 'minus', testpairs_pickle)
        # Present the columns in the order the forest was fitted on.
        X_test_updated = X_test_updated[X_train.columns]

        preds = rfminus.predict(X_test_updated)
        probs = rfminus.predict_proba(X_test_updated)

        # Column 1 of predict_proba is the second entry of rfminus.classes_,
        # which is label 1 when both labels are present in y_train.
        score = roc_auc_score(labels_pickle, probs[:, 1])
        print(f"ROC AUC : {score:.4f}")
        
        
        
       

Cross-validation - Fold: 1 
ROC AUC : 0.7356
Cross-validation - Fold: 2 
ROC AUC : 0.6807
Cross-validation - Fold: 3 
ROC AUC : 0.6878
Cross-validation - Fold: 4 
ROC AUC : 0.7589
Cross-validation - Fold: 5 
ROC AUC : 0.6625
Cross-validation - Fold: 6 
ROC AUC : 0.7890
Cross-validation - Fold: 7 
ROC AUC : 0.6835
Cross-validation - Fold: 8 
ROC AUC : 0.7095
Cross-validation - Fold: 9 
ROC AUC : 0.6543
Cross-validation - Fold: 10 
ROC AUC : 0.6898
