# Customer Churn
__Author__: David O'Donnell

---

## Problem Statement and Summary 

Description of the problem and translation of business problem to machine learning problem

---

## Table of Contents

---

# Import Data and Packages

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn as default template for plots
sns.set()

# Expand pandas display output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [92]:
# Load dataset from CSV
path = '/Users/davidodonnell/Documents/Under Armour Working Sample/'
filename = 'train.csv'

df = pd.read_csv(path+filename, header=0, parse_dates=True).rename(columns={'apparell_spend': 'apparel_spend'})

---

## View Data

In [93]:
# View Columns and Memory Usage
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   last_purchase    2666 non-null   float64
 1   max_discount     2665 non-null   float64
 2   shoe_spend       2666 non-null   float64
 3   apparel_spend    2666 non-null   object 
 4   acc_spend        2666 non-null   int64  
 5   custserv_calls   2666 non-null   int64  
 6   churn            2666 non-null   int64  
 7   acc_purchasers   2666 non-null   int64  
 8   promo_purchaser  2666 non-null   int64  
 9   shoe_orders      2666 non-null   int64  
 10  apparel_orders   2666 non-null   int64  
 11  acc_orders       2666 non-null   int64  
 12  gender           2666 non-null   object 
 13  ecommShopper     2666 non-null   bool   
 14  bhShopper        2666 non-null   bool   
 15  state            2666 non-null   object 
 16  area_code        2666 non-null   int64  
 17  phone         

In [94]:
# View First Ten Lines of Data
df.head(10)

Unnamed: 0,last_purchase,max_discount,shoe_spend,apparel_spend,acc_spend,custserv_calls,churn,acc_purchasers,promo_purchaser,shoe_orders,apparel_orders,acc_orders,gender,ecommShopper,bhShopper,state,area_code,phone
0,56.5,0.26,322.2,194.3,126,1,0,0,1,3,2,4,Male,False,True,MS,510,402-5509
1,84.0,0.46,279.1,170.9,92,0,0,0,1,2,2,3,Male,False,False,OH,510,370-3021
2,96.0,0.0,294.7,306.0,96,1,1,0,0,2,3,3,Female,True,False,MI,415,373-1448
3,62.0,0.0,255.4,185.6,100,2,0,0,0,2,2,3,Male,False,False,VT,510,403-1769
4,45.0,0.28,300.6,197.9,154,0,0,0,1,3,2,5,Male,False,True,WV,408,405-9384
5,68.5,0.0,243.4,217.0,96,0,0,0,0,2,3,3,Male,True,False,FL,408,384-6654
6,58.0,0.0,241.9,137.9,77,1,1,1,0,2,2,3,Female,False,True,MA,408,371-9457
7,62.0,0.26,226.3,220.3,66,1,0,0,1,2,3,2,Female,False,False,WA,415,380-6631
8,101.5,0.0,180.7,97.8,93,1,0,0,0,1,2,3,Male,False,True,NY,510,379-2991
9,80.0,0.32,159.7,124.4,106,2,0,1,1,1,2,3,Female,False,True,ID,415,345-5980


In [95]:
# View Numerical Data
df.describe()

Unnamed: 0,last_purchase,max_discount,shoe_spend,acc_spend,custserv_calls,churn,acc_purchasers,promo_purchaser,shoe_orders,apparel_orders,acc_orders,area_code
count,2666.0,2665.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0
mean,70.961553,0.081418,245.565304,102.430983,1.556264,0.149287,0.097524,0.277944,2.044261,2.15979,3.251688,437.737059
std,19.978791,0.136785,54.53819,27.921276,1.312955,0.356438,0.296726,0.44807,0.534996,0.495355,0.81683,42.679743
min,0.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,408.0
25%,57.625,0.0,208.5,85.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,415.0
50%,71.5,0.0,245.35,103.0,1.0,0.0,0.0,0.0,2.0,2.0,3.0,415.0
75%,84.5,0.2,282.675,121.0,2.0,0.0,0.0,1.0,2.0,2.0,4.0,510.0
max,121.0,0.5,411.8,200.0,9.0,1.0,1.0,1.0,3.0,3.0,6.0,510.0


In [96]:
# Number of Unique Values per Column
for col in df.columns:
    print(F'{col}: {df[col].nunique()}')

last_purchase: 210
max_discount: 44
shoe_spend: 1507
apparel_spend: 1447
acc_spend: 160
custserv_calls: 10
churn: 2
acc_purchasers: 2
promo_purchaser: 2
shoe_orders: 3
apparel_orders: 3
acc_orders: 7
gender: 2
ecommShopper: 2
bhShopper: 2
state: 53
area_code: 3
phone: 2666


---

# Cleanse Data

## Clean Up Data (Types and Errors)

In [97]:
# Improper value in apparell_spend
df.loc[df['apparel_spend']=='a']

# Impute errant value with median apparell_spend/apparel_orders * apparel_orders
median_apparel_spend_order = df[['apparel_spend']].loc[df['apparel_spend']!='a'].median().values//df[['apparel_orders']].loc[df['apparel_spend']!='a'].median().values

df['apparel_spend'] = np.where(df['apparel_spend']!='a', df['apparel_spend'], median_apparell_spend_order*df['apparel_orders'])

### Adjust Column Datatypes

In [98]:
# Format Continuous Variables as Floats
float_cols = [
    'last_purchase',
    'max_discount',
    'shoe_spend',
    'apparel_spend',
    'acc_spend'
]

for col in float_cols:
    df[col] = df[col].astype(str).str.replace(',', '')
    df[col] = df[col].astype(str).str.replace('$', '')
    df[col] = df[col].astype(float, errors='ignore')


# Format Continuous Variables as Floats
int_cols = [
    'custserv_calls',
    'shoe_orders',
    'apparel_orders',
    'acc_orders'
]

for col in int_cols:
    df[col] = df[col].astype(str).str.replace(',', '')
    df[col] = df[col].astype(str).str.replace('$', '')
    df[col] = df[col].astype(float).round(0).astype(int)


# Format Continuous Variables as Floats
bool_cols = [
    'churn',
    'acc_purchasers',
    'promo_purchaser',
    'ecommShopper',
    'bhShopper'
]

for col in bool_cols:
    df[col] = df[col].astype(bool)

# Format Categorical Variables
ex_cols = [] # Other columns that shouldn't be converted to a specific datatype
exclude_columns = float_cols+int_cols+bool_cols+ex_cols


# Categorical Data Types
n = 100 # Maximum number of unique values in a column to be classified as "category"

for col in df.columns:
    if df[col].nunique() < n and col not in exclude_columns:
        df[col] = df[col].astype('category')

In [99]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   last_purchase    2666 non-null   float64 
 1   max_discount     2665 non-null   float64 
 2   shoe_spend       2666 non-null   float64 
 3   apparel_spend    2666 non-null   float64 
 4   acc_spend        2666 non-null   float64 
 5   custserv_calls   2666 non-null   int64   
 6   churn            2666 non-null   bool    
 7   acc_purchasers   2666 non-null   bool    
 8   promo_purchaser  2666 non-null   bool    
 9   shoe_orders      2666 non-null   int64   
 10  apparel_orders   2666 non-null   int64   
 11  acc_orders       2666 non-null   int64   
 12  gender           2666 non-null   category
 13  ecommShopper     2666 non-null   bool    
 14  bhShopper        2666 non-null   bool    
 15  state            2666 non-null   category
 16  area_code        2666 non-null   category


---

# Exploratory Data Analysis

## Frequency Table
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
table = pd.crosstab(index=df['col'], columns=df['col'], margins=True)

## Categorical Data Plots

## Factor Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.factorplot(x='col', y='col', data=df, kind='plot type')

## Count Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.countplot(x='col', data=df)

## Bar Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.barplot(x='col', y='col', data=df)

## Box Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
# Example
sns.boxplot(x='col', y='col', data=df)

## Violin Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.violinplot(x='col', y='col', data=df)

## Strip Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.stripplot(x='col', y='col', data=df)

## Swarm Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.swarmplot(x='col', y='col', data=df)

## Distribution Plots

## Distribution Plot/Histogram
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.distplot(df['col'], kde=False, color='red', bins=30)

## Join Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.joinplot(x='col', y='col', data=df)

## Density Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.kdeplot(df['col'])

## Pair Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.pairplot(df, hue='col')

## Rug Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.rugplot(df['col'])

## Relational Plots

## Relational Plots
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.relplot(x='col', y='col', data=df, kind='plot type') #default plot type is a scatterplot

## Scatterplot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.scatterplot(x='col', y='col', data=df)

## Linear Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.lmplot(x='col', y='col', data=df, hue='col')

## Correlation Matrix
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.heatmap(df.corr())

## Line Plot
### Description:
* Insert description here

### Use Cases:
* Use Case #1, Use Case #2, etc.

### Example:

In [None]:
sns.lineplot(x='col', y='col', data=df)

---

# Data Manipulation and Additional Cleaning

### Handling Missing Data and Null Values

In [None]:
# Examine Missing and Null Values
df.isnull().sum()

In [None]:
# Drop Observations with Null Values (based on Column/Column Type)
df = df.dropna(how='all')
df = df.dropna(subset=[col])

### Outlier Treatment

In [None]:
def RemoveOutliers(data, col):
    '''
    Removes outliers from the specified column
    data: DataFrame
    col: Column to filter data
    '''
    u = np.median(data[col])
    s = np.std(data[col])
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3-q1
    
    filtered = data.loc[(data[col]>q1-iqr)
                        &
                        (data[col]<q3+iqr)
                       ]
    return filtered


def ImputeOutliers(data, col):
    '''
    Imputes outliers with the median of the specified column
    data: DataFrame
    col: Column to filter data
    '''
    u = np.median(data[col])
    s = np.std(data[col])
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3-q1

    data[col] = np.where((data[col]>q1-iqr)|(data[col]>q1-iqr),
                         u,
                         data[col])
    filtered = data
    return filtered


def FilterData(data, col, minimum=0, maximum=float('inf')):
    '''
    data: DataFrame
    col: Column to filter data
    minimum: Defaults to 0, but any number can be input
    maximum: Defaults to inf, but any number can be input
    '''
    filtered = data.loc[data[col]>=minimum]
    filtered = filtered.loc[filtered[col]<=maximum]
    return filtered

---

# Find Additional Insights, Visualize Data, and Determine Appropriate Features

---

# Build and Evaluate Model

In [100]:
# Import Machine Learning Packages
# Preprocessing:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Model Selection/Hyperparameter Tuning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score, roc_auc_score, classification_report, accuracy_score, f1_score
from bayes_opt import BayesianOptimization

## Define Variables

In [105]:
# Determine dependent and independent variables based on dataset
y = df['churn']
X = df.drop(['churn','area_code','phone'], axis=1)

## Normalize/Standardize Data (Set-Up Pipeline Steps and Hyperparameter Space)


In [106]:
# Preprocessing Pipeline
numeric_features = list(X.select_dtypes(include=['number']))
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

categorical_features = list(X.select_dtypes(include=['category']))
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)]
)

## Build Initial Model (Classification or Regression)

### Model Types

#### Classification
    Logistic Regression - description

    Random Forest Classifier - description

    Gradient Boosting Classifier - description

    XGBoost Classifier - description


#### Regression
    Linear Regression - description

    Ridge Regression - description

    Lasso Regression - description

    Elastic Net Regression - description

    Random Forest Regressor - description

    Gradient Boosting Regressor - description

    XGBoost Regressor - description

### Hyperparameters for Tuning

#### Classification
    Logistic Regression - description

    Random Forest Classifier - description

    Gradient Boosting Classifier - description

    XGBoost Classifier - description


#### Regression
    Linear Regression - description

    Ridge Regression - description

    Lasso Regression - description

    Elastic Net Regression - description

    Random Forest Regressor - description

    Gradient Boosting Regressor - description

    XGBoost Regressor - description

In [107]:
# Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Instantiate Baseline Classification Models
classifiers = [LogisticRegression(), RandomForestClassifier(n_estimators=100), GradientBoostingClassifier(), 
               XGBClassifier(objective='binary:logistic', eval_metric='auc')]

# Fit to the training set
for clf in classifiers:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    
    # Scoring Metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    roc_score = roc_auc_score(y_test, predictions)
    acc = accuracy_score(y_test, predictions)
    class_report = classification_report(y_test, predictions)

    roc_score = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc').mean()
    precision = cross_val_score(clf, X_train, y_train, cv=5, scoring='precision').mean()
    recall = cross_val_score(clf, X_train, y_train, cv=5, scoring='recall').mean()
    f1 = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1').mean()

    print(pipeline.named_steps['classifier'].__class__.__name__)
    print(' Root Mean Squared Error: {}'.format(rmse))
    print(' Mean Absolute Error: {}'.format(mae))
    print(' AUC Score: {}'.format(roc_score))
    print(' Accuracy: {}'.format(acc))
    print(' Classification Report:\n{}'.format(class_report))
    print('\n')

TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [None]:
# Instantiate Baseline Regression Models
regressors = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), RandomForestRegressor(n_estimators=100), 
              GradientBoostingRegressor(), xgb.XGBRegressor(objective='reg:squarederror', eval_metric='mae')]

# Split Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit to the training set
for reg in regressors:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', reg)
    ])
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    
    # Scoring Metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)
    roc_score = roc_auc_score(y_test, predictions)
    acc = accuracy_score(y_test, predictions)
    class_report = classification_report(y_test, predictions)

    print(pipeline.named_steps['classifier'].__class__.__name__)
    print(' Root Mean Squared Error: {}'.format(rmse))
    print(' Mean Absolute Error: {}'.format(mae))
    print(' AUC Score: {}'.format(roc_score))
    print(' Accuracy: {}'.format(acc))
    print(' Classification Report:\n{}'.format(class_report))
    print('\n')

---

## Hyperparameter Tuning

### RandomizedSearchCV
#### Use RandomizedSearchCV to define a grid of hyperparameter ranges

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

classifier = RandomForestClassifier()
regressor = RandomForestRegressor()

# Create Pipeline Object
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
    ('classifier', classifier),
    ('regressor', regressor)
])

# Use RandomizedSearchCV for Initial Hyperparameter Examination
parameters = {
    'classifier__learning_rate': np.linspace(0,1,11),
    'classifier__max_depth': range(2,20,2),
    'classifier__n_estimators': range(50,500,50),
    'classifier__min_samples_split': np.linspace(0.1,1,10),
    'classifier__min_samples_leaf': np.linspace(0.1,0.5,9),
    'classifier__max_features': ['log2', 'sqrt'],
    'classifier__bootstrap': [True, False],
}

parameters = {
    'regressor__learning_rate': np.linspace(0,1,11),
    'regressor__max_depth': range(2,20,2),
    'regressor__n_estimators': range(50,500,50),
    'regressor__min_samples_split': np.linspace(0.1,1,10),
    'regressor__min_samples_leaf': np.linspace(0.1,0.5,9),
    'regressor__max_features': ['log2', 'sqrt'],
    'regressor__bootstrap': [True, False],
}

model = RandomizedSearchCV(pipeline, parameters, n_iter=100, cv=3, random_state=42, n_jobs=-1)

model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.best_params_)

### GridSearchCV
#### Use GridSearchCV to determine the optimal hyperparameters based on the RandomSearchCV results

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

classifier = RandomForestClassifier()
regressor = RandomForestRegressor()

# Create Pipeline Object
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
    ('classifier', classifier),
    ('regressor', regressor)
])

# Use GridSearchCV for Hyperparameter Tuning
parameters = {
    'classifier__learning_rate': [],
    'classifier__max_depth': [],
    'classifier__n_estimators': [],
    'classifier__min_samples_split': [],
    'classifier__min_samples_leaf': [],
    'classifier__max_features': [],
    'classifier__bootstrap': [],
}

parameters = {
    'regressor__learning_rate': [],
    'regressor__max_depth': [],
    'regressor__n_estimators': [],
    'regressor__min_samples_split': [],
    'regressor__min_samples_leaf': [],
    'regressor__max_features': [],
    'regressor__bootstrap': [],
}

model = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1)

model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.best_params_)

In [None]:
# Extracting the best parameters from model
parameters = model.best_params_
model = model.best_estimator_
print(model)

# Fit Model to Training Set
model.fit(X_train, y_train)

# Evaluate Model Performance
print(model)

predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
mae = mean_absolute_error(y_test, predictions)
roc_score = roc_auc_score(y_test, predictions)
acc = accuracy_score(y_test, predictions)
class_report = classification_report(y_test, predictions)

print("Root Mean Squared Error: {}".format(rmse))
print("Mean Absolute Error: {}".format(mae))
print("AUC Score: {}".format(roc_score))
print("Accuracy: {}".format(acc))
print("Classification Report:\n{}".format(class_report))
print('\n')

---

## Examine Feature Importances

In [None]:
# Determine Variables (One Hot Encoded Categorical Variables and Numeric Variables)
onehot_columns = model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_features)

feature_importance = pd.Series(data=model.named_steps['classifier'].feature_importances_, index = np.array(numeric_features + list(onehot_columns)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Examine Feature Importance/Correlation Coefficients
columns = np.array(numeric_features + list(onehot_columns))

categories = [i.split('_', 1)[0] for i in columns]

importance_df = pd.DataFrame({
    'variable': columns, 
    'importance': model.named_steps['classifier'].feature_importances_,
    'category': categories
})

# Feature Importances DataFrame
cat_importance_df = importance_df.groupby('category').sum().sort_values(by='importance', ascending=False)
print("Feature Importances:\n{}".format(cat_importance_df))

# Plot Feature Importances
sns.barplot(x='importance', y=cat_importance_df.index, data=cat_importance_df)
plt.title('Feature Importances')
plt.show()

---

## Create Machine Learning Pipeline Library

In [None]:
import pickle

# Dump the Model into a Machine Learning Pipeline Library
filename = 'model.sav'

pickle.dump(model, open(filename, 'wb'))

---

## Apply Model to New Data

In [None]:
# Apply Model to New Data
new_df = pd.DataFrame()
predicted_df = pd.DataFrame(model.predict_proba(new_df))

# Concatenate New Dataframe with Predictions
predicted_df = pd.concat([new_df.reset_index(), predicted_df], axis=1)

---

# Appendix

### Bayesian Optimization for XGBoost

In [None]:
data_dmatrix = xgb.DMatrix(data=X, label=y)

#Bayesian Optimization function for xgboost
#specify parameters to tune as keyword arguments
def bo_tune_xgb(max_depth, gamma, n_estimators ,learning_rate):
    parameters = {'max_depth': int(max_depth),
                  'gamma': gamma,
                  'n_estimators': int(n_estimators),
                  'learning_rate':learning_rate,
                  'subsample': 0.8,
                  'eta': 0.1,
                  'eval_metric': 'auc'}
    #Cross validating with the specified parameters in 3 folds and 20 iterations
    cv_result = xgb.cv(parameters, data_dmatrix, num_boost_round=20, nfold=3)
    #Return the AUC Score
    return cv_result['test-auc-mean'].iloc[-1]

#Invoking the Bayesian Optimizer with the specified parameters to tune
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (2, 10),
                                            'gamma': (0, 1),
                                            'learning_rate':(0,1),
                                            'n_estimators':(100,250)})

# Perform Bayesian optimization (20 iterations with 5 steps of random exploration)
xgb_bo.maximize(n_iter=20, init_points=5, acq='ei')

In [None]:
# Extracting the best parameters
parameters = xgb_bo.max['params']
print(parameters)

# Converting the max_depth and n_estimator values from float to int
parameters['max_depth']= int(parameters['max_depth'])
parameters['n_estimators']= int(parameters['n_estimators'])

In [None]:
# Fit Model to Training Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9999)

#Initialize an XGBClassifier with the tuned parameters and fit the training data
model = XGBClassifier(**parameters).fit(X_train, y_train)


---

## Examine Feature Importances

In [None]:
# Examine Feature Importance/Correlation Coefficients
categories = [i.split('__', 1)[0] for i in columns]

importance_df = pd.DataFrame({'variable': columns, 
                              'importance': model.feature_importances_,
                              'category': categories})

print(importance_df.groupby('category').sum().sort_values(by='importance', ascending=False))

---

# References

In [None]:
scaler = MinMaxScaler(feature_range=(0,1))

scaler = scaler.fit(train_df[metrics])

train_df.loc[:,metrics] = scaler.transform(train_df[metrics])
test_df.loc[:,metrics] = scaler.transform(test_df[metrics])

In [None]:
import random

# Set Random Seed
random.seed(42)

# Train/Test Split Thresholds
train_pct = 0.9

# Shuffle Devices
n_devices = nn_df.index.get_level_values('device').nunique()
shuffled = random.sample(list(nn_df.index.get_level_values('device').unique()), n_devices)

# Bucket Devices into Train/Test Sets
train_devices = shuffled[:int(n_devices*train_pct)]
test_devices = shuffled[int(n_devices*train_pct):]

# Create Train/Test Dataframes
train_df = nn_df.loc[train_devices]
test_df = nn_df.loc[test_devices]

print(f'Devices in Training Set: {len(train_devices)}')
print(f'Devices in Test Set: {len(test_devices)}')

In [None]:
def dataset_generator(predictors, target, time_steps=1, step=1):
        X, y = [], []
        for i in range(0, len(predictors) - time_steps, step):
            v = predictors.iloc[i:(i+time_steps)].values
            z = target.iloc[i:i+time_steps]
            X.append(v)
            y.append(z[0])
        return np.array(X), np.array(y)

In [None]:
time_steps = 5
step = 1

X_train, y_train = dataset_generator(
    train_df[metrics],
    train_df['failure'],
    time_steps,
    step
)

X_test, y_test = dataset_generator(
    test_df[metrics],
    test_df['failure'],
    time_steps,
    step
)