# Make this notebook run in IBM Watson

In [1]:
# The code was removed by Watson Studio for sharing.

In [2]:
# START CODE BLOCK
# cos2file - takes an object from Cloud Object Storage and writes it to file on container file system.
# Uses the IBM project_lib library.
# See https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/project-lib-python.html
# Arguments:
# p: project object defined in project token
# data_path: the directory to write the file
# filename: name of the file in COS

import os
def cos2file(p,data_path,filename):
    data_dir = p.project_context.home + data_path
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    open( data_dir + '/' + filename, 'wb').write(p.get_file(filename).read())

# file2cos - takes file on container file system and writes it to an object in Cloud Object Storage.
# Uses the IBM project_lib library.
# See https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/project-lib-python.html
# Arguments:
# p: prooject object defined in project token
# data_path: the directory to read the file from
# filename: name of the file on container file system

import os
def file2cos(p,data_path,filename):
    data_dir = p.project_context.home + data_path
    path_to_file = data_dir + '/' + filename
    if os.path.exists(path_to_file):
        file_object = open(path_to_file, 'rb')
        p.save_data(filename, file_object, set_project_asset=True, overwrite=True)
    else:
        print("file2cos error: File not found")
# END CODE BLOCK

In [3]:
cos2file(project, '/data', 'aavail-target.csv')

# Setup

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Pipelines

There are an incredible amount of possible workflows for any given data set when we account for transforms, feature engineering, model selection and model tuning. This means that we need a systematic way to compare these workflow variants. This is where pipelines become so useful and it is the consistency of the three interfaces that allow make pipelines like this one a necessary part of the iterative workflow.

In [5]:
## load the boston dataset
boston = load_boston()
X, y = boston['data'], boston['target']
features = boston['feature_names']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline([("scaler", StandardScaler()),
                 ("featsel", SelectKBest(k=10)),
                 ("rf",RandomForestRegressor(n_estimators=20))])

## train the data
pipe.fit(X_train,y_train)

## evaluate the model
y_pred = pipe.predict(X_test)
print(r'R^2=%.2f, MAE=%.2f'%(r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))

R^2=0.74, MAE=1.46


Here we are standardizing the data then selecting the 10 best features according to an ANOVA test. These transformed data are then piped into a random forest regression model. See the SelectKBest class to see the other options that are available as a scoring function. It is worth mentioning the three scikit-learn interfaces in combination with pipelines have had such an impact on the data science workflow that Apache Spark now has similar ML pipelines.

# Class Imbalance

In [6]:
data_dir = os.path.join("..", "data")
df = pd.read_csv(os.path.join(data_dir, r"aavail-target.csv"))
df.head()

Unnamed: 0,customer_id,is_subscriber,country,age,customer_name,subscriber_type,num_streams
0,1,1,united_states,21,Kasen Todd,aavail_premium,23
1,2,0,singapore,30,Ensley Garza,aavail_unlimited,12
2,3,0,united_states,21,Lillian Carey,aavail_premium,22
3,4,1,united_states,20,Beau Christensen,aavail_basic,19
4,5,1,singapore,21,Ernesto Gibson,aavail_premium,23


In [7]:
## pull out the target and remove uneeded columns
_y = df.pop("is_subscriber")

## switch churn to be the minority class
y = np.zeros(_y.size)
y[_y==0] = 1
df.drop(columns=["customer_id", "customer_name"], inplace=True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, stratify=y)
print(sorted(Counter(y_train).items()))
print(sorted(Counter(y_test).items()))

[(0.0, 569), (1.0, 231)]
[(0.0, 142), (1.0, 58)]


In [9]:
## transformation pipeline
numeric_features = ["age", "num_streams"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_features = ["country", "subscriber_type"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

## model pipeline
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(solver="lbfgs"))
])

In [10]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["subscriber", "churn"]))

              precision    recall  f1-score   support

  subscriber       0.86      0.84      0.85       142
       churn       0.63      0.67      0.65        58

    accuracy                           0.79       200
   macro avg       0.75      0.76      0.75       200
weighted avg       0.79      0.79      0.79       200



### Imbalanced Learn

In [11]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /opt/conda/envs/Python36/lib/python3.6/site-packages (0.6.2)


In [12]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

Using TensorFlow backend.


In [13]:
X, y = make_classification(n_samples=5000, n_features=2, n_informative=2, 
                           n_redundant=0, n_repeated=0, n_classes=3, 
                           n_clusters_per_class=1, 
                           weights=[0.01, 0.05, 0.94], 
                           class_sep=0.8, random_state=0)

print("Original Target")
print(sorted(Counter(y).items()))

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("\nResampled Target")
print(sorted(Counter(y_resampled).items()))

Original Target
[(0, 64), (1, 262), (2, 4674)]

Resampled Target
[(0, 4674), (1, 4674), (2, 4674)]


Compare different methods for imbalancing

In [14]:
import imblearn.pipeline as pl
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

## Pipelines
clf1 = pl.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(solver="lbfgs"))
])

clf2 = pl.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("ros", RandomOverSampler(random_state=42)),
    ("classifier", LogisticRegression(solver="lbfgs"))
    
])

clf3 = pl.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", LogisticRegression(solver="lbfgs"))
    
])

for clf in [clf1, clf2, clf3]:
    clf.fit(X_train, y_train)

In [15]:
for name, clf in [("no sampling", clf1), ("random oversampling", clf2), ("smote", clf3)]:
    y_pred = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, y_pred, target_names=["subscriber", "churn"]))

no sampling
              precision    recall  f1-score   support

  subscriber       0.86      0.84      0.85       142
       churn       0.63      0.67      0.65        58

    accuracy                           0.79       200
   macro avg       0.75      0.76      0.75       200
weighted avg       0.79      0.79      0.79       200

random oversampling
              precision    recall  f1-score   support

  subscriber       0.87      0.82      0.84       142
       churn       0.62      0.69      0.65        58

    accuracy                           0.79       200
   macro avg       0.74      0.76      0.75       200
weighted avg       0.79      0.79      0.79       200

smote
              precision    recall  f1-score   support

  subscriber       0.87      0.82      0.84       142
       churn       0.62      0.69      0.65        58

    accuracy                           0.79       200
   macro avg       0.74      0.76      0.75       200
weighted avg       0.79      0.79   