# packages and imports

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statistics import mean

from data_makers import *
import utils
from utils import *
import mean_model
from mean_model import meanModel

import sklearn
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler
from joblib import parallel_backend
from ray.util.joblib import register_ray

from xgboost import XGBRegressor

import heapq

import shap
from shap.explainers import Tree
from interpret.blackbox import ShapKernel

import interpret.glassbox
from interpret import show


# dataframe makers

## phospho

In [None]:
# Load the phospho X dataframe; the first CSV column becomes the index.
X_phos = pd.read_csv('data/X_phos', index_col=0)

# Load the matching y dataframe, indexed by its first column as well.
y_phos = pd.read_csv('data/y_phos', index_col=0)

# dlMaker is a project helper brought in by the star imports.
# NOTE(review): presumably builds the drug list from y -- confirm.
dl_phos = dlMaker(y_phos)

# One-hot representations of the drugs found in y.
hotdrugsDF_phos = one_hot_maker(y_phos)

# Build X_main and y_main from the pieces above.
# NOTE(review): clMaker presumably yields the cell lines shared by X and y
# (its result is passed on as `cells`) -- confirm against the helper.
cl_phos = clMaker(X_phos, y_phos)
x_all_phos, x_drug_phos, y_main_phos = create_all_drugs(
    x=X_phos,
    xd=hotdrugsDF_phos,
    y=y_phos,
    cells=cl_phos,
)
X_main_phos = X_main_maker(x_all_phos, x_drug_phos, short=False)

## proteomic

In [None]:
# Load the proteomic X dataframe; the first CSV column becomes the index.
X_prot = pd.read_csv('data/X_prot', index_col=0)

# Load the matching y dataframe, indexed by its first column as well.
y_prot = pd.read_csv('data/y_prot', index_col=0)

# dlMaker is a project helper brought in by the star imports.
# NOTE(review): presumably builds the drug list from y -- confirm.
dl_prot = dlMaker(y_prot)

# One-hot representations of the drugs found in y.
hotdrugsDF_prot = one_hot_maker(y_prot)

# Build X_main and y_main from the pieces above.
# NOTE(review): clMaker presumably yields the cell lines shared by X and y
# (its result is passed on as `cells`) -- confirm against the helper.
cl_prot = clMaker(X_prot, y_prot)
x_all_prot, x_drug_prot, y_main_prot = create_all_drugs(
    x=X_prot,
    xd=hotdrugsDF_prot,
    y=y_prot,
    cells=cl_prot,
)
X_main_prot = X_main_maker(x_all_prot, x_drug_prot, short=False)

# XGBoost model

In [None]:
# Train/test split via the project helper cell_line_split.
# Nothing visible in this notebook defines a bare X_main / y_main -- the
# data-prep cells only create the *_phos and *_prot variants -- so the
# dataset has to be named explicitly here. The proteomic frames are used
# because the plot and feature-file paths further down both name the
# proteomic run; pass X_main_phos / y_main_phos to train on phospho data.
# NOTE(review): test_size / random_state presumably follow the semantics of
# sklearn's train_test_split -- confirm in utils.cell_line_split.
X_train, X_test, y_train, y_test = cell_line_split(
    X_main_prot,
    y_main_prot,
    test_size=0.2,
    random_state=88,
)

In [None]:
# Hyper-parameters for the XGBoost regressor, kept in one mapping so the
# configuration can be read separately from the model construction.
xgb_params = {
    'max_depth': 75,            # maximum depth of each tree
    'n_estimators': 300,        # number of boosting rounds
    'seed': 42,                 # random seed for reproducible fits
    'min_child_weight': 3,      # minimum summed instance weight in a child
    'gamma': 0,                 # minimum loss reduction required to split
    'colsample_bytree': 0.3,    # fraction of columns sampled per tree
    'reg_alpha': 0.1,           # L1 regularisation on the weights
    'n_jobs': -1,               # parallel threads; -1 is the sklearn "all cores" convention
}

# Instantiate the model from the mapping above.
xgb = XGBRegressor(**xgb_params)

In [None]:
# fit the model on the training split

xgb.fit(X_train, y_train)
# per-feature importance scores of the fitted model; passed to xgbFeatures
# in a later cell to rank the features
feat_importance = xgb.feature_importances_

In [None]:
# evaluate on the held-out split and print the score
# NOTE: for a regressor, .score() follows the scikit-learn convention and
# returns the coefficient of determination (R^2), not an accuracy.

# NOTE(review): y_pred is not used by any later cell visible in this notebook.
y_pred = xgb.predict(X_test)
score = xgb.score(X_test, y_test)
print(score)

In [None]:
# xgbFeatures (project helper) returns the top-X feature names together
# with their scores for a model.
# NOTE(review): 10411 is hard-coded for both topX and N -- presumably the
# number of feature columns in X_train; confirm before reusing this cell
# with a different dataset.
xgb_largest_names, xgb_largest_scores = xgbFeatures(
    feat_importance,
    X_main=X_train,
    topX=10411,
    N=10411,
)

In [None]:
# Output path of the feature-importance figure written by plt.savefig in the
# plotting cell ("rs88" presumably mirrors random_state=88 of the split).
plot_name = 'plots/xgb_proteomic_rs88.png'

In [None]:
# Plot the first 50 entries of the ranked feature lists and save the figure.
plt.rcParams['figure.figsize'] = [20, 20]

top_names = xgb_largest_names[:50]
top_scores = xgb_largest_scores[:50]
plt.plot(
    top_names,
    top_scores,
    linestyle='-',
    marker='.',
    color='#009d9a',
    linewidth=1,
)

# Slant the tick labels; the return value is bound to a name, presumably to
# keep the notebook from echoing it as cell output.
rot = plt.xticks(rotation=45)
plt.savefig(plot_name)

In [None]:
# Map each feature name to its score so the pairs can be saved together.
# zip pairs the two sequences positionally, replacing the manual index loop.
xgbdict = dict(zip(xgb_largest_names, xgb_largest_scores))

In [None]:
# Output path of the text file that receives the feature/score pairs
# ("rs88" presumably mirrors random_state=88 of the split).
file_name = 'feat_select_files/proteomic/xgb_feat_select_rs88.txt'

In [None]:
# Save the features: one "name:score" line per entry of xgbdict.
from pathlib import Path

# Create the destination folder when it is missing, so open() does not fail
# with FileNotFoundError on a fresh checkout.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

with open(file_name, "w", encoding="utf-8") as txt_file:
    # !s applies str() to both parts: the score text stays exactly what
    # str(value) produced, and keys that are not strings no longer raise
    # TypeError the way `key + ':'` would.
    txt_file.writelines(
        f"{key!s}:{value!s}\n" for key, value in xgbdict.items()
    )