In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib as jb
import pickle


In [11]:
# Protein targets that get a baseline model. The identifiers look like UniProt
# accessions and the name suggests these targets have balanced classes —
# TODO confirm both against the data preparation step.
targets_balanced_list = [
    'Q96GD4',
    'P49841',
    'Q13627',
    'P06239',
    'Q13464',
    'P11309',
]

In [12]:
# Train a simple logistic regression model on the data, save the fitted model
# to disk, and return its accuracy on the test data.
def train_log_reg(df_train, df_test, target, path, random_state=42):
    """Fit a logistic regression classifier, persist it, and score it.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with an ``'X'`` column (one feature vector per row; assumed to
        be equal-length sequences so they stack into a 2-D array — TODO
        confirm) and a ``'Y_binary'`` column holding the class labels.
    target : str
        Identifier embedded in the saved model's file name.
    path : str
        Directory the model file is written to. It is created if it does not
        exist yet; an empty string saves into the current working directory.
    random_state : int, default 42
        Seed forwarded to ``LogisticRegression`` and embedded in the file name.

    Returns
    -------
    float
        Accuracy of the fitted model on ``df_test``. Only the model is written
        to disk; predictions and the accuracy itself are not saved.
    """
    # Each cell of 'X' holds a whole feature vector, so stack them into one array.
    X_train = np.array(df_train['X'].tolist())
    y_train = df_train['Y_binary']

    X_test = np.array(df_test['X'].tolist())
    y_test = df_test['Y_binary']

    model = LogisticRegression(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    model_path = os.path.join(path, f'{target}_log_reg_{random_state}.joblib')
    # Make sure the destination exists so a finished training run is not lost
    # to a FileNotFoundError at save time. The guard keeps path='' working
    # (os.makedirs('') would raise).
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    jb.dump(model, model_path)

    return acc

In [19]:
# Baseline run: one logistic regression per target on the 'vs_Weak' task.
task = 'vs_Weak'
split = 42
random_state = 42

accs = []
for target in targets_balanced_list:
    # Directory for this target/task; train_log_reg writes the model file here.
    path_to_target = os.path.join('./data/processed', target)
    path_vs_task = os.path.join(path_to_target, task)

    # Train/test frames of the chosen split.
    df_train = pd.read_pickle(f'./data/processed/{target}/{task}/split_{split}/train.pkl')
    df_test = pd.read_pickle(f'./data/processed/{target}/{task}/split_{split}/test.pkl')

    acc = train_log_reg(
        df_train,
        df_test,
        target,
        path_vs_task,
        random_state=random_state,
    )
    accs.append(acc)

# One row per target with its test accuracy; shown as the cell output.
vs_weak_acc = pd.DataFrame({'Target': targets_balanced_list, 'Accuracy': accs})
vs_weak_acc

Unnamed: 0,Target,Accuracy
0,Q96GD4,0.503817
1,P49841,0.486667
2,Q13627,0.608
3,P06239,0.625
4,Q13464,0.575
5,P11309,0.552239


In [20]:
# Same baseline on the 'vs_Non-binder' task. `split` and `random_state` are
# reused from the previous cell, so that cell must have been run first.
accs = []
task = 'vs_Non-binder'

for target in targets_balanced_list:
    # Directory for this target/task; train_log_reg writes the model file here.
    path_to_target = os.path.join('./data/processed', target)
    path_vs_task = os.path.join(path_to_target, task)

    # Train/test frames of the chosen split.
    df_train = pd.read_pickle(f'./data/processed/{target}/{task}/split_{split}/train.pkl')
    df_test = pd.read_pickle(f'./data/processed/{target}/{task}/split_{split}/test.pkl')

    acc = train_log_reg(
        df_train,
        df_test,
        target,
        path_vs_task,
        random_state=random_state,
    )
    accs.append(acc)

# One row per target with its test accuracy; shown as the cell output.
vs_non_binder_acc = pd.DataFrame({'Target': targets_balanced_list, 'Accuracy': accs})
vs_non_binder_acc

Unnamed: 0,Target,Accuracy
0,Q96GD4,0.572519
1,P49841,0.633333
2,Q13627,0.592
3,P06239,0.698529
4,Q13464,0.641667
5,P11309,0.664179
