# PREDICTION USING LOGISTIC REGRESSION, RANDOM FOREST, GRADIENT BOOSTED TREES, SUPPORT VECTOR MACHINE, ARTIFICIAL NEURAL NETWORK

# IMPUTATION

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from numpy import isnan

In [None]:
df = pd.read_csv("/PATH/TO/FILE.csv")

In [None]:
pip install sklearn

In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer
# import normally from sklearn.impute
from sklearn.impute import IterativeImputer
# import BayesianRidge model
from sklearn.linear_model import BayesianRidge

In [None]:
# configure the iterative imputer: every incomplete feature is modelled from
# the other features with a Bayesian ridge regressor
imputer = IterativeImputer(
    initial_strategy='most_frequent',  # first fill uses the column mode (used for categorical data)
    imputation_order='ascending',      # uses features with fewest missing values first
    n_nearest_features=None,           # default
    estimator=BayesianRidge(),         # default
)

In [None]:
# fit the imputer on the dataset
imputer.fit(df)

In [None]:
# transform the dataset
transformed_df = imputer.transform(df)

In [None]:
# count the NaN cells that remain in the imputed array
remaining_nan = isnan(transformed_df).sum()
print('Missing: %d' % remaining_nan)

In [None]:
# create complete dataframe with imputed data
complete_df = pd.DataFrame(imputer.transform(df))

In [None]:
# the imputer returns a bare array, so the frame built from it is labelled
# 0..n-1; put the source column names back
# NOTE(review): assumes the imputer kept every column of df (a column that is
# entirely NaN may be dropped) - a length mismatch here raises ValueError
complete_df.columns = df.columns
print(complete_df)

In [None]:
completed_df = complete_df.round()
print(completed_df)

In [None]:
# evaluate the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# define the modeling pipeline: imputation step 'i' followed by classifier 'm'
# NOTE: this rebinds the name `imputer`, replacing the configured imputer
# created earlier in the notebook with a fresh default one
model = RandomForestClassifier()
imputer = IterativeImputer()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])

In [None]:
# split into input and output elements
data = df.values
target_ix = 14  # position of the outcome column, based on 'Variable' column
# every column except the outcome is a predictor; keeping the outcome inside X
# would hand the label to the classifier and inflate the accuracy
ix = [i for i in range(data.shape[1]) if i != target_ix]
X, y = data[:, ix], data[:, target_ix]

In [None]:
# define model evaluation: stratified 10-fold CV repeated 3 times (30 fits),
# seeded so the fold assignment is reproducible
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model on all cores; error_score='raise' surfaces a failing fit
# as an exception instead of recording a placeholder score
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report mean accuracy with its standard deviation in parentheses
print('Mean Accuracy: %.3f (%.3f)'% (scores.mean(), scores.std()))

In [None]:
# test imputation on one variable: compare its value frequencies before
# rounding (complete_df) and after rounding (completed_df)
grade_unrounded = complete_df['Variable'].value_counts() # insert variable name
# NOTE: both printouts carry the same heading; the first is the unrounded data
print("Frequency of Variable")
print(grade_unrounded)
grade_rounded = completed_df['Variable'].value_counts()
# second printout: the rounded data
print("Frequency of Variable")
print(grade_rounded)

# OVER SAMPLING

In [None]:
pip install imbalanced-learn

In [None]:
pip install delayed

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# separate the predictors from the outcome
outcome_col = 'Variable'  # enter outcome variable
# drop outcome variable and other variables not to be used in prediction
X = completed_df.drop(columns=[outcome_col])
y = completed_df[outcome_col]

In [None]:
# create balanced dataset by synthesising minority-class rows from each
# sample's 5 nearest neighbours (seeded for reproducibility)
# NOTE(review): this resamples the complete dataset; any hold-out set taken
# from X_smote / y_smote afterwards will contain synthetic rows
smote = SMOTE(random_state=1, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# check dataset is balanced
print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after SMOTE: {X_smote.shape}''')
print(f'''Shape of y before SMOTE: {y.shape}
Shape of y after SMOTE: {y_smote.shape}''')

print('\nBalance of positive and negative classes (%):')
y_smote.value_counts(normalize=True) * 100

# PREDICTION

In [None]:
# split the ORIGINAL (not yet oversampled) data into train and test set,
# 20% test (scikit-learn's own default would be 25%), stratified on the outcome
# set random_state = 0 for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0, test_size=0.2)
# oversample the training portion only: splitting an already-oversampled set
# puts synthetic points (interpolated from training neighbours) into the test
# set and makes every test metric optimistic
X_train, y_train = SMOTE(random_state=1, k_neighbors=5).fit_resample(X_train, y_train)

In [None]:
# import necessary libraries for gridsearch and metrics etc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from plotnine import *

# LINEAR MODEL

In [None]:
# fit a linear model
from sklearn.linear_model import LogisticRegression

In [None]:
# create logistic regression model
# the tuning grid searches both 'l1' and 'l2' penalties; the default solver
# (lbfgs) rejects 'l1', so every l1 candidate would fail to fit - liblinear
# supports both penalties
Log_Reg = LogisticRegression(random_state=0, solver='liblinear')

In [None]:
# use gridsearchCV to find the best parameters for the linear model
# define values for parameter tuning, change as needed
# (2 x 14 x 5 x 4 = 560 candidate combinations)
penalty = ['l1', 'l2']  # 'l1' needs a solver that supports it (liblinear or saga)
C = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.25, 0.5, 0.75, 1.0]  # inverse regularisation strength
class_weight = [{0:.1, 1:.9}, {0:.2, 1:.8}, {0:.3, 1:.7}, {0:.4, 1:.6}, {0:.5, 1:.5}]  # per-class weights for labels 0 and 1
max_iter = [50, 100, 200, 500]  # solver iteration caps

In [None]:
# collect the Log Reg candidate values under the estimator's argument names
Log_Reg_param_grid = {
    'penalty': penalty,
    'C': C,
    'class_weight': class_weight,
    'max_iter': max_iter,
}

In [None]:
# define the Log Reg gridsearchCV object with 5 CV
grid_search_Log_Reg = GridSearchCV(Log_Reg, param_grid=Log_Reg_param_grid, n_jobs=-1, cv=5, scoring='accuracy')

In [None]:
# fit the Log Reg gridsearchCV object
grid_search_Log_Reg.fit(X_train, y_train)

In [None]:
# print best Log Reg parameters and average accuracy score
print("Best parameters: {}".format(grid_search_Log_Reg.best_params_))
print("Best cross-validation average accuracy score: {:.2f}".format(grid_search_Log_Reg.best_score_))

In [None]:
# predict outcome using tuned log reg model on train set
y_pred_Log_Reg_grid_search_train= grid_search_Log_Reg.predict(X_train)

# predict outcome using tuned log reg model on test set
y_pred_Log_Reg_grid_search = grid_search_Log_Reg.predict(X_test)

In [None]:
# print the accuracy scores
print("Accuracy on training set: {:.2f}".format(accuracy_score(y_train, y_pred_Log_Reg_grid_search_train)))
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_pred_Log_Reg_grid_search)))

In [None]:
# evaluate performance metrics of the tuned linear model
# construct the confusion matrix (rows = true class, columns = predicted class)
cm_Log_Reg = confusion_matrix(y_true = y_test, y_pred = y_pred_Log_Reg_grid_search)

# print the raw counts, then plot them
print("Confusion matrix plot of the LogReg:")
print(cm_Log_Reg)
print()
print("Confusion matrix plot of the LogReg:")
disease_labels = ['Outcome Absent', 'Outcome Present']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm_Log_Reg)
plt.title('Confusion matrix of the LogReg:')
fig.colorbar(cax)
# pin one tick per class before labelling; labelling the automatic ticks
# (padded with a leading '') depends on where matplotlib happens to put them
tick_positions = list(range(len(disease_labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(disease_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(disease_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# print classification report
print("Precision, Recall, F1-score for class 0 (Outcome Absent) and class 1 (Outcome Present) classes of the test set:")
print("Using logistic regression (l2) model")
print()
print(classification_report(y_test, y_pred_Log_Reg_grid_search))

In [None]:
# plot area under curve for LogReg model
# column 1 of predict_proba is the probability of the second class (label 1 here)
y_pred_proba_Log_Reg = grid_search_Log_Reg.predict_proba(X_test)[:,1]
print("Area under curve for Log Reg:")
print(metrics.roc_auc_score(y_true = y_test, y_score = y_pred_proba_Log_Reg))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_Log_Reg)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve LogReg')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
# without a legend the label= given to plot() is never displayed
plt.legend(loc='lower right')
plt.show()

# TREE BASED MODEL

In [None]:
# fit a gradient boosted tree model
from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier(random_state=0)

In [None]:
# use gridsearchCV to find the best parameters for the gradient boosted tree model
# define values for parameter tuning, change as needed
# (7 x 5 x 5 x 3 = 525 candidate combinations)
learning_rate = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.5, 1]  # shrinkage applied to each tree
n_estimators = [100, 500, 1000, 2000, 5000]  # number of boosting stages
max_depth = [2, 4, 6, 8, 10]  # depth limit of the individual trees
min_samples_split = [2, 5, 10]  # samples required to split an internal node

In [None]:
# collect the gradient boosted tree candidate values under the estimator's argument names
gbt_param_grid = {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
}

In [None]:
# define the gradient boosted tree gridsearchCV object with 5 CV
grid_search_gbt = GridSearchCV(gbt, param_grid=gbt_param_grid, cv=5, n_jobs=-1, scoring = 'accuracy')

In [None]:
# fit the tree gridsearchCV object
grid_search_gbt.fit(X_train, y_train)

In [None]:
# print best gradient boosted tree parameters and average accuracy score
print("Best parameters: {}".format(grid_search_gbt.best_params_))
print("Best cross-validation average accuracy score: {:.2f}".format(grid_search_gbt.best_score_))

In [None]:
# predict outcome using tuned gradient boosted model on train set
y_pred_gbt_grid_search_train= grid_search_gbt.predict(X_train)

# predict outcome using tuned gradient boosted model on test set
y_pred_gbt_grid_search = grid_search_gbt.predict(X_test)

In [None]:
# print the accuracy scores
print("Accuracy on training set: {:.2f}".format(accuracy_score(y_train, y_pred_gbt_grid_search_train)))
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_pred_gbt_grid_search)))

In [None]:
# evaluate performance metrics of the tuned gbt model
# construct the confusion matrix (rows = true class, columns = predicted class)
cm_gbt = confusion_matrix(y_true = y_test, y_pred = y_pred_gbt_grid_search)

# print the raw counts, then plot them
print("Confusion matrix plot of the gbt:")
print(cm_gbt)
print()
print("Confusion matrix plot of the gbt:")
disease_labels = ['Outcome Absent', 'Outcome Present']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm_gbt)
plt.title('Confusion matrix of the gbt:')
fig.colorbar(cax)
# pin one tick per class before labelling; labelling the automatic ticks
# (padded with a leading '') depends on where matplotlib happens to put them
tick_positions = list(range(len(disease_labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(disease_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(disease_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# print classification report
print("Precision, Recall, F1-score for class 0 (Outcome Absent) and class 1 (Outcome Present) classes of the test set:")
print("Using gradient boosted tree model")
print()
print(classification_report(y_test, y_pred_gbt_grid_search))

In [None]:
# plot area under curve for gbt model
# column 1 of predict_proba is the probability of the second class (label 1 here)
y_pred_proba_gbt = grid_search_gbt.predict_proba(X_test)[:,1]
print("Area under curve for gradient boosted tree:")
print(metrics.roc_auc_score(y_true = y_test, y_score = y_pred_proba_gbt))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_gbt)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve gradient boosted tree')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
# without a legend the label= given to plot() is never displayed
plt.legend(loc='lower right')
plt.show()

In [None]:
# create a random forest model (seeded so the fitted trees are reproducible)
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state=0)

In [None]:
# use gridsearchCV to find the best parameters for the random forest tree model
# define values for parameter tuning, change as needed
# (10 x 4 x 5 x 5 = 1000 candidate combinations)
# NOTE(review): max_features values presumably must not exceed the number of
# predictor columns in X_train - confirm against the dataset
max_features = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]  # features considered at each split
n_estimators = [1000, 2000, 5000, 10000]  # trees in the forest
max_depth = [2, 4, 6, 8, 10]  # depth limit of each tree
class_weight = [{0:.1, 1:.9}, {0:.2, 1:.8}, {0:.3, 1:.7}, {0:.4, 1:.6}, {0:.5, 1:.5}]  # per-class weights for labels 0 and 1

In [None]:
# collect the random forest candidate values under the estimator's argument names
RF_param_grid = {
    'max_features': max_features,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'class_weight': class_weight,
}

In [None]:
# define the random forest tree gridsearchCV object with 5 CV
grid_search_RF = GridSearchCV(RF, param_grid=RF_param_grid, cv=5, n_jobs=-1, scoring = 'accuracy')

In [None]:
# fit the tree gridsearchCV object
grid_search_RF.fit(X_train, y_train)

In [None]:
# print best random forest tree parameters and average accuracy score
print("Best parameters: {}".format(grid_search_RF.best_params_))
print("Best cross-validation average accuracy score: {:.2f}".format(grid_search_RF.best_score_))

In [None]:
# predict outcome using tuned random forest model on train set
y_pred_RF_grid_search_train= grid_search_RF.predict(X_train)

# predict outcome using tuned RF model on test set
y_pred_RF_grid_search = grid_search_RF.predict(X_test)

In [None]:
# print the accuracy scores
print("Accuracy on training set: {:.2f}".format(accuracy_score(y_train, y_pred_RF_grid_search_train)))
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_pred_RF_grid_search)))

In [None]:
# evaluate performance metrics of the tuned random forest model
# construct the confusion matrix (rows = true class, columns = predicted class)
cm_RF = confusion_matrix(y_true = y_test, y_pred = y_pred_RF_grid_search)

# print the raw counts, then plot them
print("Confusion matrix plot of the RF:")
print(cm_RF)
print()
print("Confusion matrix plot of the RF:")
disease_labels = ['Outcome Absent', 'Outcome Present']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm_RF)
plt.title('Confusion matrix of the RF:')
fig.colorbar(cax)
# pin one tick per class before labelling; labelling the automatic ticks
# (padded with a leading '') depends on where matplotlib happens to put them
tick_positions = list(range(len(disease_labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(disease_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(disease_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# print classification report
print("Precision, Recall, F1-score for class 0 (Outcome Absent) and class 1 (Outcome Present) classes of the test set:")
print("Using random forest tree model")
print()
print(classification_report(y_test, y_pred_RF_grid_search))

In [None]:
# plot area under curve for RF model
# column 1 of predict_proba is the probability of the second class (label 1 here)
y_pred_proba_RF = grid_search_RF.predict_proba(X_test)[:,1]
print("Area under curve for random forest tree:")
print(metrics.roc_auc_score(y_true = y_test, y_score = y_pred_proba_RF))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_RF)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve random forest tree')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
# without a legend the label= given to plot() is never displayed
plt.legend(loc='lower right')
plt.show()

# SUPPORT VECTOR MACHINE MODEL 

In [None]:
# create an SVM classifier; probability=True enables predict_proba, which the
# ROC cell further down relies on
# NOTE(review): the original comment said "with scaled data", but no scaling
# step is visible in this notebook - X_train is passed to the SVM as-is; confirm
from sklearn.svm import *
svm = SVC(probability=True, random_state=0)

In [None]:
# use gridsearchCV to find the best parameters for the svm model
# define values for parameter tuning, change as needed
# (5 x 4 x 7 x 5 = 700 candidate combinations)
C = [1/100000, 1/10000, 1/1000, 1/100, 1/10]  # regularisation parameter, 1e-5 .. 1e-1
kernel = ['linear', 'sigmoid', 'poly', 'rbf']
gamma = [1, 1/2, 1/3, 1/5, 1/10, 1/50, 1/100]  # kernel coefficient
class_weight = [{0:.1, 1:.9}, {0:.2, 1:.8}, {0:.3, 1:.7}, {0:.4, 1:.6}, {0:.5, 1:.5}]  # per-class weights for labels 0 and 1

In [None]:
# collect the svm candidate values under the estimator's argument names
svm_param_grid = {
    'C': C,
    'kernel': kernel,
    'gamma': gamma,
    'class_weight': class_weight,
}

In [None]:
# define the svm gridsearchCV object with 5 CV with accuracy
grid_search_svm = GridSearchCV(svm, param_grid=svm_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

In [None]:
# fit the svm gridsearchCV object
grid_search_svm.fit(X_train, y_train)

In [None]:
# print best svm parameters and average accuracy score
print("Best parameters: {}".format(grid_search_svm.best_params_))
print("Best cross-validation average accuracy score: {:.2f}".format(grid_search_svm.best_score_))

In [None]:
# predict outcome using tuned svm model on train set
y_pred_svm_grid_search_train= grid_search_svm.predict(X_train)

# predict outcome using tuned svm model on test set
y_pred_svm_grid_search = grid_search_svm.predict(X_test)

In [None]:
# print the accuracy scores
print("Accuracy on training set: {:.2f}".format(accuracy_score(y_train, y_pred_svm_grid_search_train)))
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_pred_svm_grid_search)))

In [None]:
# evaluate performance metrics of the tuned svm model
# construct the confusion matrix (rows = true class, columns = predicted class)
cm_svm = confusion_matrix(y_true = y_test, y_pred = y_pred_svm_grid_search)
# print the raw counts, then plot them
print("Confusion matrix plot of the svm:")
print(cm_svm)
print()
print("Confusion matrix plot of the svm:")
disease_labels = ['Outcome Absent', 'Outcome Present']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm_svm)
plt.title('Confusion matrix of the svm:')
fig.colorbar(cax)
# pin one tick per class before labelling; labelling the automatic ticks
# (padded with a leading '') depends on where matplotlib happens to put them
tick_positions = list(range(len(disease_labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(disease_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(disease_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Print classification report
print("Precision, Recall, F1-score for class 0 (Outcome Absent) and class 1 (Outcome Present) classes of the test set:")
print("Using svm model")
print()
print(classification_report(y_test, y_pred_svm_grid_search))

In [None]:
# plot area under curve for svm model
# column 1 of predict_proba is the probability of the second class (label 1 here)
y_pred_proba_svm = grid_search_svm.predict_proba(X_test)[:,1]
print("Area under curve for svm:")
print(metrics.roc_auc_score(y_true = y_test, y_score = y_pred_proba_svm))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_svm)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve svm')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
# without a legend the label= given to plot() is never displayed
plt.legend(loc='lower right')
plt.show()

# ARTIFICIAL NEURAL NETWORK

In [None]:
pip install keras

In [None]:
pip install tensorflow --user

In [None]:
import keras
from keras import models
from keras import layers
from keras.models import Sequential
from keras.constraints import maxnorm
from keras.layers import Dense, Dropout, Activation

In [None]:
# fix random seeds to enable experiment to be reproducible, randomly selected 2
from numpy.random import seed
# call the seeding function; assigning to the name (`seed = 2`) only rebinds
# it to an int and leaves NumPy's generator unseeded
seed(2)
import tensorflow as tf
tf.random.set_seed(2)

In [None]:
# create and compile ANN model; the defaults are placeholders, parameters will be tuned later
def create_model(dropout_rate=0.1,
                 weight_constraint=1,
                 optimizer='sgd',
                 learn_rate=0.01,
                 momentum=0.1,
                 neurons=10,
                 activation='relu',
                 input_dim=23):
    """Build and compile a single-hidden-layer binary classifier.

    Every argument is now honoured. Previously the hidden activation was
    hard-coded to 'linear' and the optimizer to 'adam', so tuning
    ``activation``, ``optimizer``, ``learn_rate`` or ``momentum`` had no effect.

    Args:
        dropout_rate: Dropout fraction applied after the hidden layer.
        weight_constraint: Max-norm limit on the hidden layer's kernel.
        optimizer: Optimizer name (e.g. 'sgd', 'adam') or a ready-made
            Keras optimizer instance, which is used as given.
        learn_rate: Learning rate for a named optimizer.
        momentum: Momentum; applied only when the optimizer is SGD.
        neurons: Number of units in the hidden layer.
        activation: Activation of the hidden layer.
        input_dim: Number of input variables (columns of the feature matrix).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # hidden layer; input_dim = number of input variables
    model.add(Dense(neurons, input_dim=input_dim, kernel_initializer='uniform',
                    activation=activation, kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))
    # the output stays a single sigmoid unit: binary_crossentropy needs a probability
    model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

    if not isinstance(optimizer, str):
        # an already-configured optimizer instance is used untouched
        opt = optimizer
    elif optimizer.lower() == 'sgd':
        # momentum is an SGD-specific setting
        opt = keras.optimizers.SGD(learning_rate=learn_rate, momentum=momentum)
    else:
        # any other named optimizer: build it from a config carrying the learning rate
        opt = keras.optimizers.get({'class_name': optimizer,
                                    'config': {'learning_rate': learn_rate}})

    # compile model
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

In [None]:
# wrap Keras model so it can be used by scikit-learn 
from keras.wrappers.scikit_learn import KerasClassifier
ann = KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
# define values for each hyperparameter for tuning, change as needed
# (3 x 4 x 4 x 1 x 3 x 4 x 4 x 1 = 2304 candidate combinations)
dropout_rate = [0.005, 0.01, 0.05]
epochs = [5, 10, 15, 20]
batch_size = [50, 100, 500, 1000]
optimizer = ['SGD'] # select SGD as default for time cost reduction
learn_rate = [0.001, 0.01, 0.1]
momentum = [0.0001, 0.001, 0.01, 0.1]
neurons = [5, 10, 15, 20]
activation = ['sigmoid'] # select sigmoid for binary output

# keys match the keyword arguments of create_model, except batch_size and
# epochs, which create_model does not accept
# NOTE(review): batch_size / epochs are presumably forwarded to the model's fit() by the wrapper - confirm
ann_param_grid = dict(dropout_rate=dropout_rate, 
                  batch_size=batch_size, 
                  epochs=epochs,
                  optimizer=optimizer, 
                  learn_rate=learn_rate, 
                  momentum=momentum, 
                  neurons=neurons, 
                  activation=activation)

In [None]:
# define the ann gridsearchCV object with 5 CV with accuracy score
grid_search_ann = GridSearchCV(ann, param_grid=ann_param_grid, cv=5, n_jobs=-1, scoring='accuracy')

In [None]:
# fit grid search to training set
grid_search_ann.fit(X_train, y_train)

In [None]:
# print best ann parameters and average accuracy score
print("Best parameters: {}".format(grid_search_ann.best_params_))
print("Best cross-validation average accuracy score: {:.2f}".format(grid_search_ann.best_score_))

In [None]:
# predict outcome using tuned ann model on train set
y_pred_ann_grid_search_train= grid_search_ann.predict(X_train)

# predict outcome using tuned ann model on test set
y_pred_ann_grid_search = grid_search_ann.predict(X_test)

In [None]:
# print accuracy scores
print("Accuracy on training set: {:.2f}".format(accuracy_score(y_train, y_pred_ann_grid_search_train)))
print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test, y_pred_ann_grid_search)))

In [None]:
# evaluate performance metrics of the tuned ann model
# construct the confusion matrix (rows = true class, columns = predicted class)
cm_ann = confusion_matrix(y_true = y_test, y_pred = y_pred_ann_grid_search)
# print the raw counts, then plot them
print("Confusion matrix plot of the ann:")
print(cm_ann)
print()
print("Confusion matrix plot of the ann:")
disease_labels = ['Outcome Absent', 'Outcome Present']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm_ann)
plt.title('Confusion matrix of the ann:')
fig.colorbar(cax)
# pin one tick per class before labelling; labelling the automatic ticks
# (padded with a leading '') depends on where matplotlib happens to put them
tick_positions = list(range(len(disease_labels)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(disease_labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(disease_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# print classification report
print("Precision, Recall, F1-score for class 0 (Outcome Absent) and class 1 (Outcome Present) classes of the test set:")
print("Using ann model")
print()
print(classification_report(y_test, y_pred_ann_grid_search))

In [None]:
# plot area under curve for ann model
# column 1 of predict_proba is taken as the probability of the second class (label 1 here)
y_pred_proba_ann = grid_search_ann.predict_proba(X_test)[:,1]
print("Area under curve for ann:")
print(metrics.roc_auc_score(y_true = y_test, y_score = y_pred_proba_ann))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_ann)
plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve ann')
# dashed diagonal = chance-level classifier
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
# without a legend the label= given to plot() is never displayed
plt.legend(loc='lower right')
plt.show()

# VISUALISE PREDICTORS USING DECISION TREE

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)

In [None]:
print("Accuracy on training set: {:.3f}".format(decision_tree.score(X_train, y_train))) 
print("Accuracy on test set: {:.3f}".format(decision_tree.score(X_test, y_test)))

In [None]:
from sklearn.tree import export_graphviz
# write the fitted tree to tree.dot for rendering with graphviz
# class_names must be listed in ascending class order: 0 = Outcome Absent,
# 1 = Outcome Present (the labelling used by the reports and confusion
# matrices above); the reversed order mislabels every node of the tree
export_graphviz(decision_tree, out_file="tree.dot", class_names=["Outcome Absent", "Outcome Present"], feature_names=X.columns, impurity=False, filled=True)

In [None]:
# need to also install graphviz on system https://www.graphviz.org/download/ 

In [None]:
pip install graphviz

In [None]:
import graphviz
import os
# make the Graphviz executables discoverable by appending their folder to PATH
# NOTE: the folder below is a Windows install location - adjust it to where
# Graphviz is installed on this machine (e.g. 'C:/Users/............../graphviz-2.38/release/bin/')
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin'
# read the DOT description written by export_graphviz
with open("tree.dot") as f:
    dot_graph = f.read()
# as the last expression of the cell, the Source object is displayed by the notebook
graphviz.Source(dot_graph)

In [None]:
print("Feature importances:\n{}".format(decision_tree.feature_importances_))

In [None]:
df_feature_importance = pd.DataFrame(decision_tree.feature_importances_, index=X_smote.columns, columns=['feature importance']).sort_values('feature importance', ascending=True)
df_feature_importance

In [None]:
ax = df_feature_importance.plot.barh()