# CASE STUDY - unsupervised learning


In [1]:
!pip install joblib



In [2]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /opt/conda/envs/Python36/lib/python3.6/site-packages (0.6.2)


In [3]:
import os
import joblib
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import BayesianGaussianMixture
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import imblearn.pipeline as pl
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE
    
plt.style.use('seaborn')
%matplotlib inline

Using TensorFlow backend.


## Make this notebook run in IBM Watson

In [4]:
# The code was removed by Watson Studio for sharing.

In [5]:
# START CODE BLOCK
# cos2file - takes an object from Cloud Object Storage and writes it to file on container file system.
# Uses the IBM project_lib library.
# See https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/project-lib-python.html
# Arguments:
# p: project object defined in project token
# data_path: the directory to write the file
# filename: name of the file in COS

import os
def cos2file(p,data_path,filename):
    data_dir = p.project_context.home + data_path
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    open( data_dir + '/' + filename, 'wb').write(p.get_file(filename).read())

# file2cos - takes file on container file system and writes it to an object in Cloud Object Storage.
# Uses the IBM project_lib library.
# See https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/project-lib-python.html
# Arguments:
# p: prooject object defined in project token
# data_path: the directory to read the file from
# filename: name of the file on container file system

import os
def file2cos(p,data_path,filename):
    data_dir = p.project_context.home + data_path
    path_to_file = data_dir + '/' + filename
    if os.path.exists(path_to_file):
        file_object = open(path_to_file, 'rb')
        p.save_data(filename, file_object, set_project_asset=True, overwrite=True)
    else:
        print("file2cos error: File not found")
# END CODE BLOCK

In [6]:
cos2file(project, '/data', 'aavail-target.csv')

## Synopsis

  > We are now going to predict customer retention.  There are many models and many transforms to consider.  Use your
    knowledge of pipelines and functions to ensure that your code makes it easy to compare and iterate.  
    
  > Marketing has asked you to make a report on customer retention.  They would like you to come up with information     that can be used to improve current marketing strategy efforts.  The current plan is for marketing at AAVAIL to
    collect more features on subscribers the and they would like to use your report as a proof-of-concept in order to     get buyin for this effort.
  
## Outline

1. Create a churn prediction baseline model
2. Use clustering as part of your prediction pipeline
3. 
4. Run and experiment to see if re-sampling techniques improve your model

## Data

Here we load the data as we have already done.

`aavail-target.csv`

In [7]:
data_dir = os.path.join("..","data")
df = pd.read_csv(os.path.join(data_dir, r"aavail-target.csv"))
df.head()

Unnamed: 0,customer_id,is_subscriber,country,age,customer_name,subscriber_type,num_streams
0,1,1,united_states,21,Kasen Todd,aavail_premium,23
1,2,0,singapore,30,Ensley Garza,aavail_unlimited,12
2,3,0,united_states,21,Lillian Carey,aavail_premium,22
3,4,1,united_states,20,Beau Christensen,aavail_basic,19
4,5,1,singapore,21,Ernesto Gibson,aavail_premium,23


In [8]:
## pull out the target and remove uneeded columns
_y = df.pop('is_subscriber')
y = np.zeros(_y.size)
y[_y==0] = 1 
df.drop(columns=['customer_id','customer_name'], inplace=True)
df.head()

Unnamed: 0,country,age,subscriber_type,num_streams
0,united_states,21,aavail_premium,23
1,singapore,30,aavail_unlimited,12
2,united_states,21,aavail_premium,22
3,united_states,20,aavail_basic,19
4,singapore,21,aavail_premium,23


### QUESTION 1

Create a stratified train test split of the data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, stratify=y, random_state=1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(750, 4) (750,)
(250, 4) (250,)


### QUESTION 2

Create a baseline model.  We are going to test whether clustering followed by a model improves the results.  The we will test whether re-sampling techniques provide improvements.  Use a pipeline or another method, but create a baseline model given the data. Here is the ColumnTransformer we have used before.

In [10]:
## preprocessing pipeline
numeric_features = ['age', 'num_streams']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_features = ['country', 'subscriber_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [11]:
best_params = {}

# Logistic Regression
pipe_log = Pipeline([("prep", preprocessor), ("log", LogisticRegression())])

param_grid_log = [{
    'log__C': [0.01,0.1,0.5,1.0,1.5,5.0,10.0],
    'log__penalty': ["l1", "l2"]
}]

grid_search_log = GridSearchCV(pipe_log, param_grid=param_grid_log, cv=5, n_jobs=-1)
grid_search_log.fit(X_train, y_train)

y_pred = grid_search_log.predict(X_test)
print("-->".join(pipe_log.named_steps.keys()))
best_params = grid_search_log.best_params_
print("f1_score", round(f1_score(y_test, y_pred,average='binary'),3))


# SVM
pipe_svm = Pipeline([("prep", preprocessor), ("svm", SVC(kernel='rbf', class_weight='balanced'))])

param_grid_svm = [{
    'svm__C': [0.01,0.1,0.5,1.0,1.5,5.0,10.0],
    'svm__gamma': [0.001,0.01,0.1]
}]

grid_search_svm = GridSearchCV(pipe_svm, param_grid=param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

y_pred = grid_search_svm.predict(X_test)
print("-->".join(pipe_svm.named_steps.keys()))
best_params = dict(best_params, **grid_search_svm.best_params_)
print("f1_score", round(f1_score(y_test, y_pred, average='binary'),3))


# Random Forest
pipe_rf = Pipeline([("prep", preprocessor), ("rf", RandomForestClassifier())])

param_grid_rf = {
    'rf__n_estimators': [20,50,100,150],
    'rf__max_depth': [4, 5, 6, 7, 8],
    'rf__criterion': ['gini', 'entropy']
}

grid_search_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

y_pred = grid_search_rf.predict(X_test)
print("-->".join(pipe_rf.named_steps.keys()))
best_params = dict(best_params, **grid_search_rf.best_params_)
print("f1_score",round(f1_score(y_test, y_pred,average='binary'),3))

###
best_params

prep-->log
f1_score 0.562
prep-->svm
f1_score 0.609
prep-->rf
f1_score 0.533


{'log__C': 0.1,
 'log__penalty': 'l2',
 'svm__C': 5.0,
 'svm__gamma': 0.1,
 'rf__criterion': 'gini',
 'rf__max_depth': 4,
 'rf__n_estimators': 20}

### QUESTION 3

The next part is to create version of the classifier that uses identified clusters.  Here is a class to get you started.  It is a transformer like those that we have been working with.  There is an example of how to use it just below.  In this example 4 clusters were specified and their one-hot encoded versions were appended to the feature matrix.  Now using pipelines and/or functions compare the performance using cluster profiling as part of your matrix to the baseline.  You may compare multiple models and multiple clustering algorithms here.

In [12]:
class KmeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, k=4):
        self.km = KMeans(n_clusters=k, n_init=20)
        
    def transform(self, X, *_):
        labels = self.km.predict(X)
        enc = OneHotEncoder(categories='auto')
        oh_labels = enc.fit_transform(labels.reshape(-1,1))
        oh_labels = oh_labels.todense()
        return(np.hstack((X,oh_labels)))

    def fit(self,X,y=None,*_):
        self.km.fit(X)
        labels = self.km.predict(X)
        self.silhouette_score = round(silhouette_score(X,labels,metric='mahalanobis'),3)
        return(self)

class GmmTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, k=4):
        self.gmm = BayesianGaussianMixture(n_components=k,covariance_type='full', max_iter=500, n_init=10, warm_start=True)        
    def transform(self, X,*_):
        probs = self.gmm.predict_proba(X) + np.finfo(float).eps
        return(np.hstack((X,probs)))
        
    def fit(self,X,y=None,*_):
        self.gmm.fit(X)
        labels = self.gmm.predict(X)
        self.silhouette_score = round(silhouette_score(X,labels,metric='mahalanobis'),3)
        return(self)
    
## example for GMM  
preprocessor.fit(X_train)
X_train_pre = preprocessor.transform(X_train)    
gt = GmmTransformer(4)
gt.fit(X_train_pre)
X_train_gmm = gt.transform(X_train_pre)
print(X_train_pre.shape)  
print(X_train_gmm.shape)

## example for kmeans
preprocessor.fit(X_train)
X_train_pre = preprocessor.transform(X_train)    
kt = KmeansTransformer(4)
kt.fit(X_train_pre)
X_train_kmeans = kt.transform(X_train_pre)
print(X_train_pre.shape)
print(X_train_kmeans.shape)

(750, 7)
(750, 11)
(750, 7)
(750, 11)


In [13]:
def run_clustering_pipeline(X_train, y_train, X_test, y_test, smodel, umodel, best_params, preprocessor):
    fscores,sscores = [],[]
    for n_clusters in np.arange(3, 8):
        
        if smodel=="rf":
            clf = RandomForestClassifier(n_estimators=best_params['rf__n_estimators'], criterion=best_params['rf__criterion'], max_depth=best_params['rf__max_depth'])
        elif smodel=="log":
            clf = LogisticRegression(C=best_params['log__C'], penalty=best_params["log__penalty"])
        elif smodel=="svm":
            clf = SVC(C=best_params['svm__C'], gamma=best_params['svm__gamma'])
        else:
            raise Exception("invalid supervised learning model")
        
        if umodel=="kmeans":
            cluster = KmeansTransformer(k=n_clusters)
        elif umodel=="gmm":
            cluster = GmmTransformer(k=n_clusters)
        else:
            raise Exception("invalid unsupervised learning model")
            
        pipe = Pipeline(steps=[('pre', preprocessor), ('cluster', cluster), ('clf', clf)])  
        
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        
        fscore = round(f1_score(y_test, y_pred, average='binary'),3)
        sscore = pipe['cluster'].silhouette_score
        
        fscores.append(fscore)
        sscores.append(sscore)
        
    return fscores, sscores

In [14]:
cp_results = {}
smodels = ("svm","rf")
umodels = ("kmeans","gmm")

for pair in [(smodel, umodel) for smodel in smodels for umodel in umodels]:
    f, s = run_clustering_pipeline(X_train, y_train, X_test, y_test, smodel=pair[0], umodel=pair[1], best_params=best_params, preprocessor=preprocessor)
    cp_results[pair[0] + "-" + pair[1] + "-f"] = f
    cp_results[pair[0] + "-" + pair[1] + "-s"] = s
    
cp_results

{'svm-kmeans-f': [0.538, 0.538, 0.538, 0.538, 0.538],
 'svm-kmeans-s': [0.23, 0.186, 0.231, 0.297, 0.334],
 'svm-gmm-f': [0.538, 0.538, 0.538, 0.538, 0.538],
 'svm-gmm-s': [0.265, 0.279, 0.292, 0.388, 0.419],
 'rf-kmeans-f': [0.534, 0.538, 0.533, 0.548, 0.512],
 'rf-kmeans-s': [0.23, 0.186, 0.231, 0.297, 0.335],
 'rf-gmm-f': [0.542, 0.538, 0.534, 0.542, 0.538],
 'rf-gmm-s': [0.265, 0.279, 0.293, 0.369, 0.419]}

In [15]:
## display table of results
df_cp = pd.DataFrame(cp_results)
df_cp["n_clusters"] = [str(i) for i in np.arange(3, 8)]
df_cp.set_index("n_clusters", inplace=True)
df_cp.head(n=10)

Unnamed: 0_level_0,svm-kmeans-f,svm-kmeans-s,svm-gmm-f,svm-gmm-s,rf-kmeans-f,rf-kmeans-s,rf-gmm-f,rf-gmm-s
n_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,0.538,0.23,0.538,0.265,0.534,0.23,0.542,0.265
4,0.538,0.186,0.538,0.279,0.538,0.186,0.538,0.279
5,0.538,0.231,0.538,0.292,0.533,0.231,0.534,0.293
6,0.538,0.297,0.538,0.388,0.548,0.297,0.542,0.369
7,0.538,0.334,0.538,0.419,0.512,0.335,0.538,0.419


`svm-kmeans` performs at baseline while `svm-gmm` performs below. The `random forests` model potentially sees a small improvement with the addition of clusters. This is a fairly small dataset with a small number of features. The utility of adding clustering to the pipeline is generally more apparent in higher dimensional data sets.

## QUESTION 4

Run an experiment to see if you can you improve on your workflow with the addition of re-sampling techniques?

In [16]:
def run_clustering_and_resampling_pipeline(X_train, y_train, X_test, y_test, smodel, umodel, best_params, preprocessor):
    fscores,sscores = [],[]
    for n_clusters in np.arange(3, 8):
        
        if smodel=="rf":
            clf = RandomForestClassifier(n_estimators=best_params['rf__n_estimators'], criterion=best_params['rf__criterion'], max_depth=best_params['rf__max_depth'])
        elif smodel=="log":
            clf = LogisticRegression(C=best_params['log__C'], penalty=best_params["log__penalty"])
        elif smodel=="svm":
            clf = SVC(C=best_params['svm__C'], gamma=best_params['svm__gamma'])
        else:
            raise Exception("invalid supervised learning model")
        
        if umodel=="kmeans":
            cluster = KmeansTransformer(k=n_clusters)
        elif umodel=="gmm":
            cluster = GmmTransformer(k=n_clusters)
        else:
            raise Exception("invalid unsupervised learning model")
            
        pipe = pl.Pipeline(steps=[
            ('pre', preprocessor),
            ('cluster', cluster),
            ('smote', SMOTE(random_state=42)),
            ('clf', clf)])  
        
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        
        fscore = round(f1_score(y_test, y_pred, average='binary'),3)
        sscore = pipe['cluster'].silhouette_score
        
        fscores.append(fscore)
        sscores.append(sscore)
        
    return fscores, sscores

In [17]:
cp_results = {}
smodels = ("svm","rf")
umodels = ("kmeans","gmm")

for pair in [(smodel, umodel) for smodel in smodels for umodel in umodels]:
    f, s = run_clustering_and_resampling_pipeline(X_train, y_train, X_test, y_test, smodel=pair[0], umodel=pair[1], best_params=best_params, preprocessor=preprocessor)
    cp_results[pair[0] + "-" + pair[1] + "-f"] = f
    cp_results[pair[0] + "-" + pair[1] + "-s"] = s
    
cp_results

{'svm-kmeans-f': [0.609, 0.609, 0.609, 0.609, 0.609],
 'svm-kmeans-s': [0.23, 0.186, 0.231, 0.298, 0.335],
 'svm-gmm-f': [0.609, 0.609, 0.609, 0.609, 0.609],
 'svm-gmm-s': [0.265, 0.347, 0.343, 0.369, 0.422],
 'rf-kmeans-f': [0.609, 0.609, 0.6, 0.597, 0.609],
 'rf-kmeans-s': [0.229, 0.186, 0.231, 0.298, 0.334],
 'rf-gmm-f': [0.609, 0.613, 0.609, 0.609, 0.609],
 'rf-gmm-s': [0.265, 0.291, 0.352, 0.352, 0.393]}

In [18]:
## display table of results
df_cp = pd.DataFrame(cp_results)
df_cp["n_clusters"] = [str(i) for i in np.arange(3, 8)]
df_cp.set_index("n_clusters", inplace=True)
df_cp.head(n=10)

Unnamed: 0_level_0,svm-kmeans-f,svm-kmeans-s,svm-gmm-f,svm-gmm-s,rf-kmeans-f,rf-kmeans-s,rf-gmm-f,rf-gmm-s
n_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,0.609,0.23,0.609,0.265,0.609,0.229,0.609,0.265
4,0.609,0.186,0.609,0.347,0.609,0.186,0.613,0.291
5,0.609,0.231,0.609,0.343,0.6,0.231,0.609,0.352
6,0.609,0.298,0.609,0.369,0.597,0.298,0.609,0.352
7,0.609,0.335,0.609,0.422,0.609,0.334,0.609,0.393


## Solution Note

The inclusion of customer profiles does not significantly improve the overall model performance pipeline for either model. There may be some minor improvement depending on the random seed, but since it does not degrade model performance either it can be useful in the context of marketing. The clusters are customer profiles that are tied to predictive performance. The re-sampling does help the random forest classifiers obtain similar performance results to SVM in this case.