# Hackathon Part3.2: Modeling with oversampling

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor, BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostRegressor
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.svm import SVC, SVR 
from sklearn.metrics import mean_squared_error

### Used two approaches to do oversampling- SMOTE and RandomOverSampler

## Oversamping only on training dataset - Using SMOTE

In [2]:
# read data
%store -r data

In [3]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [4]:
# assign X and y
X = data.drop(['wage'], axis=1)
y = data['wage']

In [5]:
# Scale it before train test split
ss = StandardScaler()
ss.fit_transform(X)

array([[ 1.2123091 ,  1.20253518, -0.2286403 , ..., -0.04311306,
        -0.12274328, -0.53707189],
       [-0.70187063, -0.83683307, -0.2286403 , ..., -0.04311306,
        -0.12274328, -0.53707189],
       [-0.248135  ,  0.69166598,  2.8899561 , ..., -0.04311306,
        -0.12274328, -0.53707189],
       ...,
       [-0.42078982,  1.35656997,  5.86576557, ..., -0.04311306,
        -0.12274328,  1.86194813],
       [-0.0856424 ,  1.37427656, -0.2286403 , ..., -0.04311306,
        -0.12274328, -0.53707189],
       [ 0.28311491,  0.50440585, -0.2286403 , ..., -0.04311306,
        -0.12274328, -0.53707189]])

In [6]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  random_state=42)

# oversampling training set using SMOTE
sm = SMOTE(random_state=42, 
           sampling_strategy= 1.0) # ratio between minority group and majority group
X_train_os, y_train_os = sm.fit_sample(X_train, y_train)

In [7]:
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):

    def __init__(
        self, 
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [8]:
pipe_params = [
    {
        'model__estimator': [LogisticRegression()],
        'model__estimator__penalty': ['l1', 'l2']
    },
    {
        'model__estimator': [RandomForestClassifier()],
        'model__estimator__min_samples_split': [2, 3, 4, 5],
        'model__estimator__min_samples_leaf' : [13, 14, 15, 16, 17],
    },
#      {
#         'model__estimator': [DecisionTreeClassifier()],
#         'model__estimator__min_samples_split': [2, 3, 4],
#         'model__estimator__min_samples_leaf' : [1, 2, 3, 4],
#         'model__estimator__max_features' : ['auto' , 'sqrt'],
#     },
    {
        'model__estimator': [AdaBoostClassifier()],
        'model__estimator__n_estimators': [100, 130, 150, 180],
        'model__estimator__learning_rate' : [1, 2],
        'model__estimator__algorithm' : ['SAMME', 'SAMME.R']
    },
#      {
#         'model__estimator': [KNeighborsClassifier()],
#         'model__estimator__p': [1, 2],
#         'model__estimator__leaf_size' : [30, 20, 10],
#     },
#      {
#         'model__estimator': [BaggingClassifier()],
#         'model__estimator__n_estimators': [50, 100, 200, 300],
#         'model__estimator__bootstrap' : [True, False]
#      },
#      {
#         'model__estimator': [SVC()],
#         'model__estimator__degree' : [2, 3, 4],
#         'model__estimator__C' : [1, 10, 20, 100],

#      },
#          {
#         'model__estimator': [GaussianNB()]
#      },

]

# Defining a function to do our model analysis. This function takes in X, y, and any pipe parameters
def model_analysis(X, y):
    pipe = Pipeline([
            ('model', ClfSwitcher())])

    gs = GridSearchCV(pipe, pipe_params, cv=3, verbose=3, n_jobs=4)
    gs.fit(X_train_os, y_train_os)

    print(f' Best Parameters: {gs.best_params_}')
    print('')
    print(f' Cross Validation Accuracy Score: {gs.best_score_}')
    print(f' Training Data Accuracy Score: {gs.score(X_train_os, y_train_os)}')
    print(f' Testing Data Accuracy Score: {gs.score(X_test, y_test)}')

In [9]:
model_analysis(X_train_os, y_train_os)

Fitting 3 folds for each of 38 candidates, totalling 114 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 107 out of 114 | elapsed:   12.9s remaining:    0.8s
[Parallel(n_jobs=4)]: Done 114 out of 114 | elapsed:   14.9s finished


 Best Parameters: {'model__estimator': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=180, random_state=None), 'model__estimator__algorithm': 'SAMME.R', 'model__estimator__learning_rate': 1, 'model__estimator__n_estimators': 180}

 Cross Validation Accuracy Score: 0.8082491122644087
 Training Data Accuracy Score: 0.8931985796230538
 Testing Data Accuracy Score: 0.852195423623995


In [10]:
# No better than .86 that we got from the original dataset.

# Oversampling using Random Sampling

In [11]:
#try Random Oversampling
#https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis
# We already have X and y

In [12]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  random_state=42)

# oversampling training set using SMOTE
sm = RandomOverSampler(random_state=42, 
           sampling_strategy= 0.8) # ratio between minority group and majority group
X_train_ros, y_train_ros = sm.fit_sample(X_train, y_train)

In [13]:
pipe_params = [
    {
        'model__estimator': [LogisticRegression()],
        'model__estimator__penalty': ['l1', 'l2']
    },
    {
        'model__estimator': [RandomForestClassifier()],
        'model__estimator__min_samples_split': [2, 3, 4, 5],
        'model__estimator__min_samples_leaf' : [13, 14, 15, 16, 17],
    },
#      {
#         'model__estimator': [DecisionTreeClassifier()],
#         'model__estimator__min_samples_split': [2, 3, 4],
#         'model__estimator__min_samples_leaf' : [1, 2, 3, 4],
#         'model__estimator__max_features' : ['auto' , 'sqrt'],
#     },
    {
        'model__estimator': [AdaBoostClassifier()],
        'model__estimator__n_estimators': [100, 130, 150, 180],
        'model__estimator__learning_rate' : [1, 2],
        'model__estimator__algorithm' : ['SAMME', 'SAMME.R']
    },
     {
        'model__estimator': [KNeighborsClassifier()],
        'model__estimator__p': [1, 2],
        'model__estimator__leaf_size' : [30, 20, 10],
    },
#      {
#         'model__estimator': [BaggingClassifier()],
#         'model__estimator__n_estimators': [50, 100, 200, 300],
#         'model__estimator__bootstrap' : [True, False]
#      },
     {
        'model__estimator': [SVC()],
        'model__estimator__degree' : [2, 3, 4],
        'model__estimator__C' : [1, 10, 20, 100],

     },
         {
        'model__estimator': [GaussianNB()]
     },

]

# Defining a function to do our model analysis. This function takes in X, y, and any pipe parameters
def model_analysis(X, y):
    pipe = Pipeline([
            ('model', ClfSwitcher())])

    gs = GridSearchCV(pipe, pipe_params, cv=3, verbose=3, n_jobs=4)
    gs.fit(X_train_os, y_train_os)

    print(f' Best Parameters: {gs.best_params_}')
    print('')
    print(f' Cross Validation Accuracy Score: {gs.best_score_}')
    print(f' Training Data Accuracy Score: {gs.score(X_train_ros, y_train_ros)}')
    print(f' Testing Data Accuracy Score: {gs.score(X_test, y_test)}')

In [14]:
model_analysis(X_train_ros, y_train_ros)

Fitting 3 folds for each of 57 candidates, totalling 171 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  72 tasks      | elapsed:    4.4s


 Best Parameters: {'model__estimator': GaussianNB(priors=None, var_smoothing=1e-09)}

 Cross Validation Accuracy Score: 0.8131658016935264
 Training Data Accuracy Score: 0.7942024586431932
 Testing Data Accuracy Score: 0.7748917748917749


[Parallel(n_jobs=4)]: Done 171 out of 171 | elapsed:   35.4s finished


In [15]:
# Seen Overfitness