# Dual identification of novel phage receptor-binding proteins based on protein domains and machine learning

This notebook serves as the main document of code and analyses for the detection of phage RBPs in genomes and metagenomes.

## 0. Libraries

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer, roc_curve, confusion_matrix, f1_score
%matplotlib inline

In [5]:
# set directories
# The defaults are the original author's local paths. To run the notebook on
# another machine, set the RBP_DATA_DIR / RBP_RESULTS_DIR environment variables
# instead of editing this cell.
# os.path.join(path, '') guarantees a trailing separator, which matters because
# file paths are built by plain string concatenation (data_dir + 'file.csv').
data_dir = os.path.join(
    os.environ.get('RBP_DATA_DIR', '/Users/Dimi/GoogleDrive/PhD/3_PHAGEBASE/32_DATA/RBP_detection/'),
    '',
)
results_dir = os.path.join(
    os.environ.get('RBP_RESULTS_DIR', '/Users/Dimi/GoogleDrive/PhD/3_PHAGEBASE/33_RESULTS/RBP_detection/'),
    '',
)

## 1. Loading and processing data

- Get MillardLab data
- Apply filters to construct the RBP vs. non-RBP database + keep phage genome IDs!
- Transform into embeddings: cf. protein_embeddings_cloud (computed with GPU in Kaggle notebooks)

In [None]:
# collect raw data and filter
# TODO: the raw-data collection and filtering code is not included in this cell (placeholder)

In [9]:
# Load the processed, annotated protein tables (tab-separated despite the .csv
# extension): RBPs first, then non-RBPs.
# NOTE(review): the preview of `rbps` shows an 'Unnamed: 0' column, which
# suggests the files were written with their index; consider index_col=0 after
# confirming that nothing relies on that column.
rbps, nonrbps = (
    pd.read_csv(data_dir + table_file, sep='\t')
    for table_file in ('annotated_RBPs.csv', 'annotated_nonRBPs.csv')
)

In [25]:
# preview the first five rows of the RBP table
rbps.head(n=5)

Unnamed: 0,NCBI_id,UniProt_id,Tax_id,EMBL_id,Accession,Realm,Family,Organism,Host,ProteinName,ProteinSeq,DNASeq,From,ProteinType
0,QIW86450.1,-,-,-,MT241605,Duplodnaviria,Siphoviridae,Achromobacter phage AMA1,Achromobacter marplatensis,putative tail fiber protein,MATLLLIENAFRPDANPRQEPVRAGERISSMLRRIGYLKGRGAKAV...,ATGGCAACCCTGCTGCTGATTGAAAACGCTTTCCGCCCAGACGCGA...,Alex,RBP
1,QIG62429.1,-,-,-,MT002875,Duplodnaviria,Siphoviridae,Pseudoalteromonas phage AL,Pseudoalteromonas marina,putative tail fiber adhesion protein,MAFDTVKKTFTKEHMWIVELISGETTYRFCENRSPLPLELEAVPSL...,ATGGCTTTTGATACAGTTAAAAAAACATTCACTAAAGAACATATGT...,Alex,RBP
2,QNR53876.1,-,-,-,MT740307,Duplodnaviria,Myoviridae,Pseudomonas phage phiK7A1,Pseudomonas syringae pv. actinidiae K7A1,putative tail fiber protein,MVDITKLDMTNIWASGGDKVTPSAEKIAQGWVVEAVPRQTWNWFEN...,ATGGTGGACATTACTAAACTGGATATGACAAATATCTGGGCAAGCG...,Alex,RBP
3,QNR53879.1,-,-,-,MT740307,Duplodnaviria,Myoviridae,Pseudomonas phage phiK7A1,Pseudomonas syringae pv. actinidiae K7A1,putative tail fiber protein,MANLLKPTGVNNIWSINGTKTDPGLAKANTGWVVELPPYQTANWIE...,ATGGCTAACTTGCTAAAGCCTACAGGTGTGAACAACATCTGGAGTA...,Alex,RBP
4,QNR52067.1,-,-,-,MT833282,Duplodnaviria,Myoviridae,Escherichia coli phage vB_EcoM_Shy,Escherichia coli NCTC 12900,putative tail fiber protein,MAVGEIQISALPQAALPIDLSDIFHLKQGVEDKRCTLEQLLAPHAS...,ATGGCAGTAGGTGAAATTCAAATTAGTGCCTTGCCACAAGCGGCCT...,Alex,RBP


## 2. Learning patterns and evaluating models