# CASE STUDY - unsupervised learning


In [1]:
import os
import joblib
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import BayesianGaussianMixture
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import imblearn.pipeline as pl
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE
    
plt.style.use('seaborn')
%matplotlib inline

## Synopsis

  > We are now going to predict customer retention.  There are many models and many transforms to consider.  Use your
    knowledge of pipelines and functions to ensure that your code makes it easy to compare and iterate.  
    
  > Marketing has asked you to make a report on customer retention.  They would like you to come up with information     that can be used to improve current marketing strategy efforts.  The current plan is for marketing at AAVAIL to
    collect more features on subscribers the and they would like to use your report as a proof-of-concept in order to     get buyin for this effort.
  
## Outline

1. Create a churn prediction baseline model
2. Use clustering as part of your prediction pipeline
3. 
4. Run and experiment to see if re-sampling techniques improve your model

## Data

Here we load the data as we have already done.

`aavail-target.csv`

In [2]:
def check_missing_data(df):
    md = dict()
    md['isnan'] = df.isna().sum().sum()
    md['isnull'] = df.isnull().sum().sum()
    md['repeated'] = df.duplicated().sum()
    return md

In [3]:
df = pd.read_csv("aavail-target.csv")

## pull out the target and remove uneeded columns
_y = df.pop('is_subscriber')
y = np.zeros(_y.size)
y[_y==0] = 1 
df.drop(columns=['customer_id','customer_name'],inplace=True)
check_missing_data(df)

{'isnan': 0, 'isnull': 0, 'repeated': 262}

### QUESTION 1

Create a stratified train test split of the data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size = 0.25, stratify = y)

In [5]:
X_train

Unnamed: 0,country,age,subscriber_type,num_streams
597,united_states,36,aavail_unlimited,25
279,united_states,35,aavail_unlimited,14
252,united_states,20,aavail_basic,15
599,singapore,19,aavail_unlimited,11
298,united_states,24,aavail_premium,20
...,...,...,...,...
689,singapore,40,aavail_premium,13
786,united_states,46,aavail_basic,16
531,united_states,23,aavail_premium,19
964,united_states,37,aavail_basic,20


### QUESTION 2

Create a baseline model.  We are going to test whether clustering followed by a model improves the results.  The we will test whether re-sampling techniques provide improvements.  Use a pipeline or another method, but create a baseline model given the data. Here is the ColumnTransformer we have used before.

In [71]:
one_hot_cols = ['country', 'subscriber_type']
scale_cols = ['age', 'num_streams']

one_hot = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore'))
])

scaler = Pipeline([
    ('impute', SimpleImputer(strategy = 'mean')),
    ('std_scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('one_hot_pipeline', one_hot, one_hot_cols),
    ('scaler_pipeline', scaler, scale_cols)
])

In [72]:
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.fit_transform(X_test)

In [73]:
X_train_pre

array([[ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.8799322 ,  1.50628234],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.80257325, -0.77136233],
       [ 0.        ,  1.        ,  1.        , ...,  0.        ,
        -0.3578109 , -0.56430372],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -0.12573407,  0.2639307 ],
       [ 0.        ,  1.        ,  1.        , ...,  0.        ,
         0.95729114,  0.47098931],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.10634276,  0.88510652]])

In [74]:
print(X_test_pre[0])

[ 0.          1.          1.          0.          0.         -0.80012161
 -0.98141587]


### QUESTION 3

The next part is to create version of the classifier that uses identified clusters.  Here is a class to get you started.  It is a transformer like those that we have been working with.  There is an example of how to use it just below.  In this example 4 clusters were specified and their one-hot encoded versions were appended to the feature matrix.  Now using pipelines and/or functions compare the performance using cluster profiling as part of your matrix to the baseline.  You may compare multiple models and multiple clustering algorithms here.

In [90]:
class LogisticClf():
    def __init__(self, penalty = 'l2', solver = 'liblinear', C = 1.0):
        self.log_model = LogisticRegression(penalty = penalty, solver = solver, C = C)
        self.score = None
        self.predictions = None
    def fit(self, X, y):
        self.log_model.fit(X, y)
    def predict(self, X):
        self.log_model = self.log_model.predict(X)

class KmeansClf():
    def __init__(self, n_clusters = 2, n_init = 10):
        self.kmeans = KMeans(n_clusters = n_clusters, n_init = n_init)
        self.predictions = None
    def fit_transform(self, X, y):
        self.kmeans.fit_transform(X, y)
    def pred(self, X):
        self.predictions = self.kmeans.predict(X)
        return self.predictions
        
class RandomForestClf():
    def __init__(self, max_depth = None, random_state = 0):
        self.random_forest = RandomForestClassifier(max_depth = max_depth, random_state = random_state)
        self.predictions = None
    def fit(self, X, y):
        self.random_forest.fit(X, y)
    def pred(self, X):
        pred = self.random_forest.predict(X)
        self.predictions = pred
        return pred
    
class SpectralClusteringClf():
    def __init__(self, n_clusters = 2, random_state = 0):
        self.spectral_clustering = SpectralClustering(n_clusters = n_clusters, random_state = random_state)
        self.predictions = None
    def fit(self, X, y):
        self.spectral_clustering.fit(X, y)
    def pred(self, X):
        pred = self.spectral_clustering.predict(X)
        self.predictions = pred
        return pred
    
class BayesGaussianMixClf():
    def __init__(self):
        self.bayes_gauss = BayesianGaussianMixture()
        self.predictions = None
    def fit(self, X, y):
        self.bayes_gauss.fit(X, y)
    def pred(self, X):
        pred = self.bayes_gauss.predict(X)
        self.predictions = pred
        return pred

def compare_all_methods(X, y, X_test, y_test):
    kmeans = KMeans(n_clusters = 2, n_init = 10)
    kmeans.fit_transform(X, y)
    rand_forest = RandomForestClassifier(random_state = 0)
    rand_forest.fit(X, y)
    bayes_gauss = BayesianGaussianMixture()
    bayes_gauss.fit(X, y)
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    predictions = {
        'kmeans': kmeans.predict(X_test),
#         'kmeans_model': kmeans,
        'rand_forest': rand_forest.predict(X_test),
#         'rand_forest_model': rand_forest,
#         'spect_clust': spect_clust.predict(X_test),
#         'spect_clust_model': spect_clust,
        'bayes_gauss': bayes_gauss.predict(X_test),
#         'bayes_gauss_model': bayes_gauss,
        'log_reg': log_reg.predict(X_test),
#         'log_reg_model': log_reg
    }
    return predictions

In [91]:
predictions = compare_all_methods(X_train_pre, y_train, X_test_pre, y_test)

In [92]:
print(classification_report(y_test, predictions['kmeans']))

              precision    recall  f1-score   support

         0.0       0.76      0.75      0.76       178
         1.0       0.41      0.43      0.42        72

    accuracy                           0.66       250
   macro avg       0.59      0.59      0.59       250
weighted avg       0.66      0.66      0.66       250



In [93]:
print(classification_report(y_test, predictions['rand_forest']))

              precision    recall  f1-score   support

         0.0       0.79      0.84      0.81       178
         1.0       0.53      0.46      0.49        72

    accuracy                           0.73       250
   macro avg       0.66      0.65      0.65       250
weighted avg       0.72      0.73      0.72       250



In [94]:
print(classification_report(y_test, predictions['bayes_gauss']))

              precision    recall  f1-score   support

         0.0       0.71      1.00      0.83       178
         1.0       0.00      0.00      0.00        72

    accuracy                           0.71       250
   macro avg       0.36      0.50      0.42       250
weighted avg       0.51      0.71      0.59       250



  _warn_prf(average, modifier, msg_start, len(result))


## QUESTION 4

Run an experiment to see if you can you improve on your workflow with the addition of re-sampling techniques?

In [101]:
X_re, y_re = SMOTE().fit_sample(X_train_pre, y_train)
predictions = compare_all_modelx(X_re, y_train, X_test, y_test)

AttributeError: 'SMOTE' object has no attribute '_validate_data'