In [1]:
import nb_utils
proj_dir = nb_utils.proj_path_setup()

In [36]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing   import StandardScaler
from sklearn.linear_model    import LogisticRegression
from sklearn.linear_model    import LinearRegression
from sklearn.tree            import DecisionTreeClassifier
from sklearn.ensemble        import RandomForestClassifier
from sklearn.svm             import SVC
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.naive_bayes     import GaussianNB
from sklearn.ensemble        import GradientBoostingClassifier
from xgboost                 import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline        import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble        import StackingClassifier

import os

plt.style.use('ggplot')

%matplotlib inline

In [3]:
train_df = pd.read_csv(os.path.join(proj_dir, 'data', 'raw', 'train.csv'))
test_df = pd.read_csv(os.path.join(proj_dir, 'data', 'raw', 'test.csv'))
feature_cols = [col for col in test_df.columns if col != 'id']

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       414 non-null    int64  
 1   gravity  414 non-null    float64
 2   ph       414 non-null    float64
 3   osmo     414 non-null    int64  
 4   cond     414 non-null    float64
 5   urea     414 non-null    int64  
 6   calc     414 non-null    float64
 7   target   414 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 26.0 KB


In [5]:
X = train_df[feature_cols]
y = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Create a pipeline with a list of steps
LR_model = Pipeline([('scaler', StandardScaler()),('classifier', LogisticRegression())])
# Fit the pipeline to the data
LR_model.fit(X_train, y_train)
# Predict on test data
y_pred = LR_model.predict(X_test)
# Evaluate the pipeline
accuracy = LR_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.78


In [7]:
# Create a pipeline with a list of steps
DT_model = Pipeline([('scaler', StandardScaler()),('classifier', DecisionTreeClassifier())])
# Fit the pipeline to the data
DT_model.fit(X_train, y_train)
# Predict on test data
y_pred = DT_model.predict(X_test)
# Evaluate the pipeline
accuracy = DT_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.67


In [8]:
# Create a pipeline with a list of steps
RF_model = Pipeline([('scaler', StandardScaler()),('classifier', RandomForestClassifier())])
# Fit the pipeline to the data
RF_model.fit(X_train, y_train)
# Predict on test data
y_pred = RF_model.predict(X_test)
# Evaluate the pipeline
accuracy = RF_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.70


In [9]:
# Create a pipeline with a list of steps
SV_model = Pipeline([('scaler', StandardScaler()),('classifier', SVC())])
# Fit the pipeline to the data
SV_model.fit(X_train, y_train)
# Predict on test data
y_pred = SV_model.predict(X_test)
# Evaluate the pipeline
accuracy = SV_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.77


In [10]:
# Create a pipeline with a list of steps
KN_model = Pipeline([('scaler', StandardScaler()),('classifier', KNeighborsClassifier())])
# Fit the pipeline to the data
KN_model.fit(X_train, y_train)
# Predict on test data
y_pred = KN_model.predict(X_test)
# Evaluate the pipeline
accuracy = KN_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.75


In [11]:
# Create a pipeline with a list of steps
GN_model = Pipeline([('scaler', StandardScaler()),('classifier', GaussianNB())])
# Fit the pipeline to the data
GN_model.fit(X_train, y_train)
# Predict on test data
y_pred = GN_model.predict(X_test)
# Evaluate the pipeline
accuracy = GN_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.81


In [12]:
# Create a pipeline with a list of steps
GB_model = Pipeline([('scaler', StandardScaler()),('classifier', GradientBoostingClassifier())])
# Fit the pipeline to the data
GB_model.fit(X_train, y_train)
# Predict on test data
y_pred = GB_model.predict(X_test)
# Evaluate the pipeline
accuracy = GB_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.80


In [13]:
# Create a pipeline with a list of steps
XGB_model = Pipeline([('scaler', StandardScaler()),('classifier',XGBClassifier())])
# Fit the pipeline to the data
XGB_model.fit(X_train, y_train)
# Predict on test data
y_pred = XGB_model.predict(X_test)
# Evaluate the pipeline
accuracy = XGB_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.66


#### Collect predicted probabilities for each model

In [14]:
model_prefixes = ['LR', 'DT', 'RF', 'KN', 'GN', 'GB', 'XGB']
models = [LR_model, DT_model, RF_model, KN_model, GN_model, GB_model, XGB_model]

predicted_probs_dict = {prefix : model.predict_proba(X_train)[:,1] for prefix, model in zip(model_prefixes, models)}
test_predicted_probs_dict = {prefix : model.predict_proba(X_test)[:,1] for prefix, model in zip(model_prefixes, models)}

In [15]:
prob_df = pd.DataFrame(predicted_probs_dict)
prob_df['SV'] = SV_model.predict(X_train)
prob_df

Unnamed: 0,LR,DT,RF,KN,GN,GB,XGB,SV
0,0.219899,0.0,0.10,0.0,0.090252,0.067690,0.013862,0
1,0.382587,0.0,0.15,0.4,0.515520,0.163754,0.011069,0
2,0.435131,1.0,0.83,0.4,0.661918,0.821618,0.953017,1
3,0.566789,1.0,0.85,0.8,0.607626,0.871015,0.982519,1
4,0.937031,1.0,0.90,1.0,0.997234,0.785901,0.961263,1
...,...,...,...,...,...,...,...,...
326,0.585463,0.0,0.22,0.6,0.740104,0.156978,0.059356,1
327,0.277627,0.0,0.08,0.0,0.453698,0.116387,0.006312,0
328,0.349885,0.0,0.07,0.0,0.124925,0.063801,0.000875,0
329,0.473740,1.0,0.91,1.0,0.775227,0.748336,0.977285,1


In [16]:
test_prob_df = pd.DataFrame(test_predicted_probs_dict)
test_prob_df['SV'] = SV_model.predict(X_test)
test_prob_df

Unnamed: 0,LR,DT,RF,KN,GN,GB,XGB,SV
0,0.833748,1.0,0.98,1.0,0.980084,0.945488,0.990660,1
1,0.640019,0.0,0.75,0.8,0.902917,0.877084,0.994683,1
2,0.361860,0.0,0.40,0.2,0.668281,0.456030,0.781455,0
3,0.560934,0.0,0.28,0.8,0.459810,0.519307,0.070061,1
4,0.343437,1.0,0.42,0.6,0.044674,0.627139,0.762264,0
...,...,...,...,...,...,...,...,...
78,0.205671,0.0,0.44,0.4,0.110098,0.262727,0.669616,0
79,0.779387,1.0,0.68,0.8,0.878693,0.718576,0.785374,1
80,0.309300,0.0,0.33,0.4,0.260223,0.280937,0.006067,0
81,0.422584,0.0,0.31,0.2,0.544809,0.187563,0.014169,0


#### Use Logistic Model to Predict on the Output from the Primary Models

In [17]:
final_model = LogisticRegression()
final_model.fit(prob_df, y_train)
accuracy = final_model.score(test_prob_df, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.73


#### Try a voting classifier

In [18]:
from sklearn.ensemble import VotingClassifier

In [19]:
estimators = [(prefix, model) for prefix, model in zip(model_prefixes, models)]
estimators.append(('SV', SV_model))

In [20]:
final_model = VotingClassifier(
    estimators=estimators, voting='hard'
)
final_model.fit(X_train, y_train)
accuracy = final_model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.75


#### Tuning Individual Model Parameters with GridSearch

In [38]:
scaled_X_train = StandardScaler().fit_transform(X_train)

In [22]:
param_grid = {
    'max_iter': [2000],
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr = LogisticRegression(random_state=0)
clf_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_lr = clf_lr.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_lr.best_params_)
print('Best score:', best_clf_lr.best_score_)

Best parameters: {'C': 0.1, 'max_iter': 2000, 'penalty': 'l1', 'solver': 'liblinear'}
Best score: 0.7752868281815649


In [23]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}
knn = KNeighborsClassifier()
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_knn= clf_knn.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_knn.best_params_)
print('Best score:', best_clf_knn.best_score_)

Best parameters: {'algorithm': 'auto', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Best score: 0.7571358952937901


In [24]:
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

gnb = GaussianNB()
clf_gnb = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_gnb = clf_gnb.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_gnb.best_params_)
print('Best score:', best_clf_gnb.best_score_)

Best parameters: {'var_smoothing': 1e-09}
Best score: 0.7356307435254804


In [25]:
param_grid = {
    'n_estimators': [10, 30, 50, 100, 200, 500, 700, 100, 1200],
    'max_depth': [None, 3, 5, 10, 20, 50, 75, 100],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=0)
clf_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_rf = clf_rf.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_rf.best_params_)
print('Best score:', best_clf_rf.best_score_)

Best parameters: {'n_estimators': 700, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 3, 'bootstrap': True}
Best score: 0.7794792536897801


In [26]:
param_grid = {
    'n_estimators': [450,500,550],
    'colsample_bytree': [0.75,0.8,0.85],
    'max_depth': [None],
    'reg_alpha': [1],
    'reg_lambda': [2, 5, 10],
    'subsample': [0.55, 0.6, .65],
    'learning_rate':[0.5],
    'gamma':[.5,1,2],
    'min_child_weight':[0.01],
    'sampling_method': ['uniform']
}

xgb = XGBClassifier(random_state=0)
clf_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_xgb = clf_xgb.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_xgb.best_params_)
print('Best score:', best_clf_xgb.best_score_)

Best parameters: {'subsample': 0.55, 'sampling_method': 'uniform', 'reg_lambda': 10, 'reg_alpha': 1, 'n_estimators': 550, 'min_child_weight': 0.01, 'max_depth': None, 'learning_rate': 0.5, 'gamma': 2, 'colsample_bytree': 0.8}
Best score: 0.7734948482316903


In [27]:
param_grid = [
   {'kernel': ['rbf'], 'gamma': [.1,.5,1,2,5,10], 'C': [.1, 1, 10, 100]},
    {'kernel': ['linear'], 'C': [.1, 1, 10, 100]},
   {'kernel': ['poly'], 'degree' : [2,3,4,5], 'C': [.1, 1, 10, 100]}
]

svc = SVC(probability=True)
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=10, n_jobs=-1, scoring='roc_auc')
best_clf_svc = clf_svc.fit(scaled_X_train, y_train)

# Print the best parameters and score
print('Best parameters:', best_clf_svc.best_params_)
print('Best score:', best_clf_svc.best_score_)

Best parameters: {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf'}
Best score: 0.7790531885268728


In [28]:
voting_clf = VotingClassifier(
    estimators=[('lr', best_clf_lr), ('gnb', best_clf_gnb), ('knn', best_clf_knn), ('rf', best_clf_rf), ('svc', best_clf_svc), ('xgb', best_clf_xgb)], 
    voting='soft'
)

cv = cross_val_score(voting_clf, scaled_X_train, y_train, cv=10, n_jobs=-1, scoring='roc_auc')
print(cv)
print(cv.mean())

[0.81754386 0.81578947 0.68045113 0.72180451 0.72180451 0.79259259
 0.87777778 0.84814815 0.71481481 0.75555556]
0.7746282372598162


In [31]:
X_test_kaggle = StandardScaler().fit_transform(test_df[feature_cols])
X_test_kaggle_df = pd.DataFrame(X_test_kaggle, columns=feature_cols)

In [35]:
voting_clf.fit(scaled_X_train, y_train)
preds = voting_clf.predict(X_test_kaggle)

base_params_model_submission = pd.DataFrame({'id': test_df['id'], 'target': preds})
base_params_model_submission.to_csv(os.path.join(proj_dir, 'data', 'submissions', 'voting_clf_best_params.csv'), index=False)

#### StackingClassifier with Above Tuned Models

In [45]:
models = [('lr', best_clf_lr), ('gnb', best_clf_gnb), ('knn', best_clf_knn), ('rf', best_clf_rf), ('svc', best_clf_svc), ('xgb', best_clf_xgb)]
stacking_clf = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

cv = cross_val_score(stacking_clf, scaled_X_train, y_train, cv=10, n_jobs=-1, scoring='roc_auc')
print(cv)
print(cv.mean())

[0.81052632 0.81203008 0.70300752 0.70676692 0.73308271 0.78518519
 0.87407407 0.84444444 0.6962963  0.75925926]
0.7724672793093845


In [44]:
stacking_clf.fit(scaled_X_train, y_train)
preds = stacking_clf.predict(X_test_kaggle)

base_params_model_submission = pd.DataFrame({'id': test_df['id'], 'target': preds})
base_params_model_submission.to_csv(os.path.join(proj_dir, 'data', 'submissions', 'stacking_clf_best_params.csv'), index=False)