# Imports

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statistics import mean

from data_makers import *
import utils
from utils import *
import mean_model
from mean_model import meanModel

import sklearn
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler
from joblib import parallel_backend
from ray.util.joblib import register_ray

from xgboost import XGBRegressor

import heapq

import shap
from shap.explainers import Tree
from interpret.blackbox import ShapKernel

import interpret.glassbox
from interpret import show

# dataframe makers

## phospho

In [None]:
# Build every phospho frame used later in the notebook, starting from the two
# CSV tables on disk. The helpers below come from the project's star imports.

#read in the X dataframe (first CSV column becomes the index)
X_phos = pd.read_csv('data/X_phos', index_col=0)

#read in the y dataframe (first CSV column becomes the index)
y_phos = pd.read_csv('data/y_phos', index_col=0)

#dl maker 
# NOTE(review): presumably the list of drugs found in y_phos — confirm against dlMaker
dl_phos = dlMaker(y_phos)

#one hot representations of drugs from y
hotdrugsDF_phos = one_hot_maker(y_phos)

#produce X-main and y_main
# NOTE(review): clMaker receives both tables; presumably it yields the cell
# lines present in both X and y — confirm against its definition
cl_phos = clMaker(X_phos, y_phos)
# combines the feature table, the one-hot drug table and the responses for the
# cells in cl_phos; returns two X-side frames plus the matching y
x_all_phos, x_drug_phos, y_main_phos = create_all_drugs(x=X_phos, xd=hotdrugsDF_phos, y=y_phos, cells=cl_phos)
# builds the final model input from the two X-side frames (the effect of
# short=False is not visible in this notebook)
X_main_phos = X_main_maker(x_all_phos, x_drug_phos, short = False)

## proteomic

In [None]:
# Build every proteomic frame used later in the notebook; this mirrors the
# phospho cell step for step, only the input files and suffixes differ.

#read in the X dataframe (first CSV column becomes the index)
X_prot = pd.read_csv('data/X_prot', index_col=0)

#read in the y dataframe (first CSV column becomes the index)
y_prot = pd.read_csv('data/y_prot', index_col=0)

#dl maker 
# NOTE(review): presumably the list of drugs found in y_prot — confirm against dlMaker
dl_prot = dlMaker(y_prot)

#one hot representations of drugs from y
hotdrugsDF_prot = one_hot_maker(y_prot)

#produce X-main and y_main
# NOTE(review): clMaker receives both tables; presumably it yields the cell
# lines present in both X and y — confirm against its definition
cl_prot = clMaker(X_prot, y_prot)
# combines the feature table, the one-hot drug table and the responses for the
# cells in cl_prot; returns two X-side frames plus the matching y
x_all_prot, x_drug_prot, y_main_prot = create_all_drugs(x=X_prot, xd=hotdrugsDF_prot, y=y_prot, cells=cl_prot)
# builds the final model input from the two X-side frames (the effect of
# short=False is not visible in this notebook)
X_main_prot = X_main_maker(x_all_prot, x_drug_prot, short = False)

# InterpretML explainer

## feat selected

Read in the random forest `feature_importances_` ranking to check whether InterpretML produces a similar ordering of features.

In [None]:
# Read the random-forest feature-selection file and collect the feature names.
# Each line is expected to carry the name before the first ':'; whatever
# follows the colon is ignored here. Names are kept in file order.
feature_list = []
with open("feat_select_files/phospho/rfr_X_main/rfr_feat_select.txt", "r") as features:
    for line in features:
        # str methods return a new string. The original called
        # line.replace(" ", "") and dropped the result, so padding (and, on
        # lines without ':', the trailing newline) stayed attached to the name
        # and the name could not match a DataFrame column later on.
        # strip() removes only surrounding whitespace, so a name containing an
        # inner space is left intact.
        name = line.split(":")[0].strip()
        if name:  # skip blank lines instead of recording an empty name
            feature_list.append(name)

In [None]:
# Restrict the feature table to the first 1000 names in feature_list and
# rebuild the dependent frames from that reduced table.
#
# NOTE(review): the original cell used `X`, `y` and `cl`, none of which is
# defined in this notebook (only the *_phos / *_prot variants exist), so a
# top-to-bottom run stopped here with a NameError. The phospho frames are used
# because feature_list is read from the phospho feature-selection directory;
# confirm that this is the intended dataset.
#
# reindex() keeps the requested column order; any requested name that is not a
# column of X_phos shows up as an all-NaN column rather than raising.
X_features = X_phos.reindex(feature_list[:1000], axis="columns")

#produce the other required dataframes
hotdrugsDF_feats = one_hot_maker(y_phos)
x_all_feats, x_drug_feats, y_main_feats = create_all_drugs(
    x=X_features, xd=hotdrugsDF_feats, y=y_phos, cells=cl_phos
)
X_main_feats = X_main_maker(x_all_feats, x_drug_feats, short=False)

In [None]:
# split the feature-selected data into train/test parts (20% test, fixed seed)
# NOTE(review): cell_line_split is a project helper; the name suggests the
# split is made per cell line rather than per row — confirm.
# NOTE(review): none of the four *_feats outputs is used again in the cells
# shown in this notebook.
X_train_feats, X_test_feats, y_train_feats, y_test_feats = cell_line_split(X_main_feats, y_main_feats, test_size=0.2, random_state = 88 )

## non feat selected

Run InterpretML with all features, not just those selected by the random forest.

In [None]:
# Split the full (non feature-selected) data into train/test parts
# (20% test, fixed seed 88).
#
# NOTE(review): the original cell used `X_main` and `y_main`, which are not
# defined in this notebook (only X_main_phos / X_main_prot and the matching
# y frames exist), so a top-to-bottom run stopped here with a NameError. The
# proteomic frames are used because the plot and feature files written further
# down are all named "proteomic"; switch to the *_phos frames (and rename the
# outputs) if the phospho run is wanted instead.
X_train, X_test, y_train, y_test = cell_line_split(
    X_main_prot, y_main_prot, test_size=0.2, random_state=88
)

In [None]:
# Train the InterpretML explainable boosting regressor on the training split.
# interactions=0 and n_jobs=-1 are passed exactly as before.
ebm_regressor_cls = interpret.glassbox.ExplainableBoostingRegressor
ebm = ebm_regressor_cls(n_jobs=-1, interactions=0)
ebm.fit(X_train, y_train)

In [None]:
# Evaluate the fitted EBM on the held-out split: keep the raw predictions and
# the model's own score, then report both.
result, score = ebm.predict(X_test), ebm.score(X_test, y_test)
print(f'score: {score}, result: {result}')

In [None]:
#this function outputs the top x number of features and their scores for a model
# ebmFeatures is a project helper; it returns two parallel sequences (names,
# scores) that the following cells index position by position.
# NOTE(review): 10411 is hard-coded for both topX and N — presumably the total
# number of model features, so that every feature is returned; confirm, and
# consider deriving it from the training frame instead.

final_names, final_scores = ebmFeatures(model = ebm, topX = 10411, N = 10411)

In [None]:
# output path for the feature-score plot saved by the next cell
# (the "rs88" suffix matches the random_state used for the split)
plot_name = 'plots/ML_proteomic_rs88.png'

In [None]:
# Plot the first 50 (feature, score) pairs as a line with point markers and
# write the figure to plot_name.
plt.rcParams['figure.figsize'] = [20, 20]
head_names = final_names[:50]
head_scores = final_scores[:50]
plt.plot(
    head_names,
    head_scores,
    color='#009d9a',
    marker='.',
    linestyle='-',
    linewidth=1,
)
# feature names are long, so tilt the x tick labels
rot = plt.xticks(rotation=45)
plt.savefig(plot_name)

In [None]:
# Map each feature name to the score at the same position, so the ranking can
# be written out as name:score pairs. If a name occurs twice, the later score
# wins.
interpretMLdict = {
    name: final_scores[position]
    for position, name in enumerate(final_names)
}

In [None]:
# path of the text file the feature scores are written to, and later re-read from
file_name = "feat_select_files/proteomic/ML_feat_select_rs88.txt"

In [None]:
# Persist the feature scores: one "name:score" line per dictionary entry, in
# dictionary order.
with open(file_name, "w") as txt_file:
    txt_file.writelines(
        key + ':' + str(value) + "\n"
        for key, value in interpretMLdict.items()
    )

In [None]:
# Rebuild final_names / final_scores from the saved feature file, which holds
# one "name:score" line per feature.
final_names = []
final_scores = []
with open(file_name, "r") as features:
    for line in features:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line)
        # Split on the LAST ':' only: the score never contains a colon, but a
        # feature name might, and the original split(':')[1] then handed a
        # fragment of the name to float().
        name, _, raw_score = line.rpartition(":")
        final_names.append(name)
        # float() ignores surrounding whitespace, so the trailing newline
        # needs no separate handling. A dedicated local name is used so the
        # model `score` computed in the evaluation cell is not overwritten.
        final_scores.append(float(raw_score))

In [None]:
# Plot the first 100 (feature, score) pairs loaded from the text file and save
# the figure as my_plot.png.
plt.rcParams['figure.figsize'] = [20, 20]
head_names = final_names[:100]
head_scores = final_scores[:100]
plt.plot(
    head_names,
    head_scores,
    color='#009d9a',
    marker='.',
    linestyle='-',
    linewidth=1,
)
# feature names are long, so tilt the x tick labels
rot = plt.xticks(rotation=45)
plt.savefig('my_plot.png')