In [None]:
# setting the random seed for reproducibility
import random
random.seed(493)

# for manipulating dataframes
import pandas as pd
import numpy as np

# for modeling
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
import xgboost as xgb
from xgboost import XGBClassifier
import shap

# for visualizations
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

# Display the value of every expression statement in a cell, not only the
# last one, so multi-expression cells (e.g. head / info / shape) show all results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show dataframes in full: passing None removes the cap on the number of
# columns, the number of rows, and the width of each cell's text.
for display_option in ('display.max_columns',
                       'display.max_rows',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

# Load SHAP's JavaScript visualization code into the notebook; the interactive
# force plot drawn further down needs it in order to render.
shap.initjs()

In [None]:
# # Read a csv file
# df = pd.read_csv('../data/in/OnlineNewsPopularity.csv')

In [None]:
# df.head()
# df.info()
# df.shape

In [None]:
def show_missing(df):
    """
    Build a per-column summary of missing-like values in a dataframe.

    For every column of ``df`` three kinds of entries are reported, each as
    an absolute count and as a percentage of the number of rows:

    * ``num_missing`` / ``missing_percentage``: nulls as seen by ``df.isnull()``
    * ``num_empty`` / ``empty_percentage``: cells equal to ``' '`` or ``''``
    * ``nan_count`` / ``nan_percentage``: cells equal to the literal text
      ``'nan'`` or ``'NaN'``

    Returns a dataframe indexed by the column names of ``df`` with the six
    columns listed above, in that order.
    """
    total_rows = df.shape[0]

    def _count_matches(first, second):
        # per-column number of cells equal to either of the two literals
        return pd.Series(((df == first) | (df == second)).sum())

    summary = {}
    for count_name, pct_name, counts in (
        ('num_missing', 'missing_percentage', df.isnull().sum()),
        ('num_empty', 'empty_percentage', _count_matches(' ', '')),
        ('nan_count', 'nan_percentage', _count_matches('nan', 'NaN')),
    ):
        summary[count_name] = counts
        summary[pct_name] = (counts / total_rows) * 100
    return pd.DataFrame(summary)

# show_missing(df)

In [None]:
# df.columns

In [None]:
# for col in df.columns:
#     df = df.rename(columns={col:(col.strip(' '))})

# df = df.rename(columns={'self_reference_avg_sharess':'self_reference_avg_shares'})

# df = df.drop_duplicates(keep = False)

# # create the binary target column ('1' if shares > 1400, else '0') and drop the original 'shares' column
# df['target'] = np.where(df['shares'] > 1400, '1', '0')
# df = df.drop(columns=['shares'])

# df.to_csv('../data/out/online_news_popularity_clean.csv', index=False)

In [None]:
# Load the cleaned dataset (the file the commented-out preprocessing cell above writes out)
df = pd.read_csv(filepath_or_buffer='../data/out/online_news_popularity_clean.csv')

In [None]:
# Quick look at the loaded data: the first rows, then the per-column
# dtypes and non-null counts, then the (rows, columns) shape.
df.head()
df.info()
df.shape

In [None]:
# Bar chart of how many rows fall into each value of the target column
sns.countplot(data=df, x='target')

In [None]:
# Remove 'url' and 'timedelta' so they are not used as model features
# (presumably identifier/metadata columns; confirm against the dataset description).
final_df = df.drop(['url', 'timedelta'], axis='columns')

In [None]:
# Features are every column except 'target'; the label is kept as a
# single-column dataframe holding 'target'.
X, y = (
    final_df.loc[:, final_df.columns != 'target'],
    final_df.loc[:, final_df.columns == 'target'],
)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=493,
    test_size=0.3,
)

In [None]:
# Gradient-boosted tree classifier: 200 trees of depth 4, learning rate 0.1.
# tree_method='hist' replaces the original 'gpu_hist': 'gpu_hist' only runs on a
# CUDA-enabled XGBoost build and is deprecated since XGBoost 2.0, so the notebook
# could not be re-run on a CPU-only machine. To train on a GPU with
# XGBoost >= 2.0, additionally pass device='cuda'.
xgbc = XGBClassifier(colsample_bytree=1,   # consider every feature for each tree
                     gamma=0,              # no minimum loss reduction required to split
                     learning_rate=0.1,
                     max_depth=4,
                     n_estimators=200,
                     tree_method='hist'
                    )

In [None]:
# Train the classifier on the training split
xgbc.fit(X=X_train, y=y_train)

In [None]:
# Score the held-out set once with the predicted probability of the positive class.
# Both the AUC and the ROC curve must be computed from these probabilities:
# feeding hard 0/1 predictions to roc_auc_score (as the original did) collapses
# the curve to a single threshold, so the number shown in the legend would not
# be the area under the curve that is actually plotted.
xgbc_test_proba = xgbc.predict_proba(X_test)[:, 1]
xgbc_roc_auc = roc_auc_score(y_test, xgbc_test_proba)
fpr, tpr, thresholds = roc_curve(y_test, xgbc_test_proba)

plt.figure()
plt.plot(fpr, tpr, label='Gradient Boosting (area = %0.2f)' % xgbc_roc_auc)
# diagonal = performance of a random classifier, for reference
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
# Explain the fitted model with SHAP's tree explainer. SHAP values are computed
# for every row of X (training and test rows together), so this can take a while.
explainer = shap.TreeExplainer(xgbc)
shap_values = explainer.shap_values(X)

In [None]:
# Interactive force plot for the first 1000 explained rows.
# shap_values was computed on X, so the feature rows must come from X as well;
# X_train is a shuffled subset of X, and its first 1000 rows are different
# samples from the ones the SHAP values describe.
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X.iloc[:1000,:])

In [None]:
# Global feature importance (mean |SHAP value| per feature) as a bar chart.
# The features passed must be the frame the SHAP values were computed on (X);
# X_train has a different number of rows than shap_values.
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Summary plot of the SHAP values, using the rows of X as the feature values
shap.summary_plot(shap_values, features=X)

In [None]:
# One SHAP dependence plot per feature column
for feature in X_train.columns:
    shap.dependence_plot(ind=feature, shap_values=shap_values, features=X)

In [None]:
# set up plotting area
plt.figure(0).clf()
# Only the gradient-boosting model is fitted in this notebook. The original cell
# also drew a "Logistic Regression" curve, but it referenced `logit_roc_auc`,
# which is never defined (NameError on a fresh kernel), and it reused the
# gradient-boosting fpr/tpr, so both curves were the same line. To compare
# models, fit the second model and compute its own fpr/tpr/AUC first.
plt.plot(fpr, tpr, label="Gradient Boosting, AUC=" + str(xgbc_roc_auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()