In [1]:
# Import our libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC

import check_file as ch

%matplotlib inline

# Load the diabetes table from the CSV stored alongside this notebook
diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')

# Preview the first five rows (the default for head) to sanity-check the columns
diabetes.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Target: the binary Outcome column.
y = diabetes["Outcome"]

# Predictors: everything except the target.
# "Unnamed: 0" (0, 1, 2, ... in the preview of the frame) appears to be the row
# index that was written into the CSV rather than a measurement, so it is kept
# out of the feature matrix. errors="ignore" keeps this cell working if the CSV
# is ever loaded with index_col=0 and the column no longer exists.
X = (
    diabetes
    .drop(columns="Outcome")
    .drop(columns="Unnamed: 0", errors="ignore")
)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Random forest tuned with a randomized hyperparameter search.
# Both the forest and the search are seeded (same seed as the train/test split):
# without a seed the sampled candidates and the bootstrapped trees differ on
# every run, so the printed metrics could not be reproduced.
clf_rf = RandomForestClassifier(random_state=42)

# Candidate values the search samples from.
# max_features is bounded by the number of predictor columns; it is read from
# the training frame so the test split plays no part in configuring the search.
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              "max_features": list(range(1, X_train.shape[1] + 1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run a randomized search over the hyperparameters
random_search = RandomizedSearchCV(clf_rf, param_distributions=param_dist,
                                   random_state=42)

# Fit the search (cross-validated) on the training data only
random_search.fit(X_train, y_train)

# Predict the held-out rows with the best configuration found
rf_preds = random_search.best_estimator_.predict(X_test)

# Report accuracy / precision / recall / F1 on the test data
ch.print_metrics(y_test, rf_preds, 'random forest')



Accuracy score for random forest : 0.7402597402597403
Precision score random forest : 0.6271186440677966
Recall score random forest : 0.6727272727272727
F1 score random forest : 0.6491228070175439





In [13]:
# AdaBoost tuned with a randomized hyperparameter search.
# AdaBoostClassifier is already imported in the notebook's import cell.
# The estimator and the search are both seeded so the sampled candidates and
# the reported metrics are repeatable on re-run.
ada_model = AdaBoostClassifier(random_state=42)

# Set up the hyperparameter search over n_estimators and learning_rate
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# n_estimators: even values from 30 to 148.
# learning_rate: steps of 0.1 starting at 0.1 and stopping short of 10.
param_dist = {"n_estimators": list(range(30, 150, 2)),
              "learning_rate": np.arange(.1, 10, .1)}

# Run a randomized search over the hyperparameters
random_search = RandomizedSearchCV(ada_model, param_distributions=param_dist,
                                   random_state=42)

# Fit the search (cross-validated) on the training data only
random_search.fit(X_train, y_train)

# Predict the held-out rows with the best configuration found
ada_preds = random_search.best_estimator_.predict(X_test)

# Report accuracy / precision / recall / F1 on the test data
ch.print_metrics(y_test, ada_preds, 'adaboost')



Accuracy score for adaboost : 0.7532467532467533
Precision score adaboost : 0.6491228070175439
Recall score adaboost : 0.6727272727272727
F1 score adaboost : 0.6607142857142858







In [5]:
# Support vector classifier tuned with a randomized hyperparameter search.
# SVC is already imported in the notebook's import cell.
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
svc_model = SVC()

# Search space: C and kernel only.
# C must be strictly positive, so the grid covers the integers 1-10 rather
# than starting at 0. degree is not searched here because it is only used by
# the "poly" kernel, which is not among the candidates.
param_dist = {"C": list(range(1, 11)),
              "kernel": ["linear", "rbf"]}

# Run a randomized search over the hyperparameters; the seed fixes which of
# the 20 (C, kernel) combinations are sampled so the result is repeatable.
random_search = RandomizedSearchCV(svc_model, param_distributions=param_dist,
                                   random_state=42)

# Fit the search (cross-validated) on the training data only
random_search.fit(X_train, y_train)

# Predict the held-out rows with the best configuration found
svc_preds = random_search.best_estimator_.predict(X_test)

# Report accuracy / precision / recall / F1 on the test data
ch.print_metrics(y_test, svc_preds, 'svc')



Accuracy score for svc : 0.7532467532467533
Precision score svc : 0.6545454545454545
Recall score svc : 0.6545454545454545
F1 score svc : 0.6545454545454545





In [None]:
# Support vector classifier with a polynomial kernel, tuned over its degree.
# SVC is already imported in the notebook's import cell.
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# NOTE(review): this cell has no execution count or recorded output. Polynomial
# kernels on unscaled features can be very slow to converge; consider
# standardizing X before fitting if this does not finish. TODO confirm.
# NOTE(review): svc_preds is reassigned here, replacing the predictions from
# the linear/rbf SVC search.
svc_model = SVC()

# C is pinned at 1 and the kernel at "poly", so degree (2-5) is the only
# hyperparameter that actually varies.
param_dist = {"C": [1],
              "kernel": ["poly"],
              "degree": list(range(2, 6))}

# The search space is tiny, so size n_iter to it. With the default n_iter=10
# scikit-learn warns that the space is smaller than n_iter and truncates the
# run; matching n_iter to the grid evaluates every combination exactly once.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

# Run the search over the hyperparameters on 4 worker processes
random_search = RandomizedSearchCV(svc_model, param_distributions=param_dist,
                                   n_iter=n_candidates, random_state=42, n_jobs=4)

# Fit the search (cross-validated) on the training data only
random_search.fit(X_train, y_train)

# Predict the held-out rows with the best configuration found
svc_preds = random_search.best_estimator_.predict(X_test)

# Report accuracy / precision / recall / F1 on the test data
ch.print_metrics(y_test, svc_preds, 'svc')

