In [None]:
#%pip install palmerpenguins #https://github.com/mcnakhaee/palmerpenguins

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from palmerpenguins import load_penguins
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the Palmer penguins data. The cells below index it by column name and
# call .head()/.dropna() on it, i.e. they use it as a pandas DataFrame.
penguins = load_penguins()
type(penguins)  # quick check of the container type that was returned

## Data Cleaning

In [None]:
# Binary target for the logistic-regression section: 1 for Chinstrap, 0 for anything else.
penguins['chinstrap_bool'] = penguins['species'].eq('Chinstrap').astype(int)
penguins.head()

In [None]:
# Missing values must be removed before handing data to a machine learning
# algorithm: keep only the modelling columns, then drop incomplete rows.
penguins_subset = (
    penguins
    .loc[:, ['chinstrap_bool', 'flipper_length_mm', 'bill_length_mm']]
    .dropna()
)
penguins_subset.head()

# Logistic Regression

## One predictor variable

In [None]:
# The list selector keeps X two-dimensional (one column); y is the 0/1 target.
X = penguins_subset.loc[:, ['flipper_length_mm']]  # explanatory variable
y = penguins_subset.loc[:, 'chinstrap_bool']  # response variable (Boolean)

In [None]:
# Main code for logistic regression.
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=301,
)
# Fit a default-settings model on the training portion only.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred = logreg.predict(X=X_test)

## Confusion Matrix

In [None]:
# Cross-tabulate the true test labels against the model's predictions.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_mat)

In [None]:
# Text summary of the metrics derived from the true/predicted label pairs.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

## Preview: ROC Curves

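A ROC (receiver operating characteristic) curve plots the true positive rate against the false positive rate as the classification threshold varies. ROC curves, and the area under them (AUC), are very popular for judging the quality of a classifier.  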

In [None]:
# Score with the predicted probability of the positive class (column 1) for
# BOTH the AUC and the curve. Feeding hard 0/1 predictions to roc_auc_score
# evaluates a single threshold only, so the area shown in the legend would not
# be the area under the curve that is actually plotted.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom above TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

## Two predictor variables

In [None]:
# Two-column design matrix and the same 0/1 target as the one-predictor model.
X = penguins_subset.loc[:, ['flipper_length_mm', 'bill_length_mm']]  # explanatory variables
y = penguins_subset.loc[:, 'chinstrap_bool']  # response variable (Boolean)

In [None]:
# Main code for logistic regression.
# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=301,
)
# Re-create the estimator so this fit uses both predictors from scratch.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels for the held-out rows of the two-predictor model.
y_pred = logreg.predict(X=X_test)

In [None]:
# Cross-tabulate the true test labels against the two-predictor model's predictions.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_mat)

In [None]:
# Text summary of the metrics derived from the true/predicted label pairs.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
# Score with the predicted probability of the positive class (column 1) for
# BOTH the AUC and the curve. Feeding hard 0/1 predictions to roc_auc_score
# evaluates a single threshold only, so the area shown in the legend would not
# be the area under the curve that is actually plotted.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom above TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): 'Log_ROC' is also the output name used by the one-predictor ROC
# cell, so this call overwrites that file — confirm whether a distinct name is wanted.
plt.savefig('Log_ROC')
plt.show()

# Naive Bayes

## One categorical predictor

In [None]:
# Binary indicator: 1 when a penguin is heavier than the mean body mass, else 0.
mean_weight = penguins['body_mass_g'].mean()
penguins['above_average_weight'] = np.where(penguins['body_mass_g'] > mean_weight, 1, 0)

# np.where sends a missing body mass to the "else" branch (0), so dropna() on
# the indicator column can never see it. Filter on the raw measurement instead,
# so penguins of unknown weight are excluded rather than labelled below-average.
has_weight = penguins['body_mass_g'].notna()
penguins_subset = penguins.loc[has_weight, ['species', 'above_average_weight']].dropna()

X = penguins_subset[['above_average_weight']]  # explanatory variable (0/1 indicator)
y = penguins_subset['species']  # response variable (species label, not a Boolean)

# Preview last so the notebook actually renders it (a mid-cell expression is discarded).
penguins_subset.head()

In [None]:
# Main code for Naive Bayes: seeded 70/30 split, fit on train, predict on test.
# NOTE(review): GaussianNB models the 0/1 indicator as a continuous Gaussian
# feature; CategoricalNB/BernoulliNB may be closer to the "categorical
# predictor" intent — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=301
)
NB_model = GaussianNB().fit(X_train, y_train)
y_pred = NB_model.predict(X_test)

In [None]:
# Make one prediction.
# The model was fitted on a DataFrame, so pass a DataFrame carrying the same
# column names; a bare nested list makes scikit-learn warn that X lacks the
# feature names seen during fit.
single_penguin = pd.DataFrame([[0]], columns=NB_model.feature_names_in_)  # 0 = below-average weight
print(NB_model.predict(single_penguin))

In [None]:
# Pin the class order explicitly so the matrix rows/columns are guaranteed to
# line up with the tick labels drawn on the heatmap.
species_labels = NB_model.classes_
confusion_mat = confusion_matrix(y_test, y_pred, labels=species_labels)

# Label ticks and axes: without them the plot shows bare 0/1/2 indices and
# gives no hint which axis holds the true class.
ax = sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=species_labels,
    yticklabels=species_labels,
)
ax.set_xlabel('Predicted species')  # confusion_matrix columns are predictions
ax.set_ylabel('True species')  # confusion_matrix rows are true labels
plt.show()

In [None]:
# Text summary of the metrics derived from the true/predicted label pairs.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

## One numerical predictor

In [None]:
# Keep the target and the single numeric feature, dropping incomplete rows.
penguins_subset = penguins[['species', 'bill_length_mm']].dropna()

X = penguins_subset[['bill_length_mm']]  # explanatory variable
y = penguins_subset['species']  # response variable (species label, not a Boolean)

# Preview last so the notebook actually renders it (a mid-cell expression is discarded).
penguins_subset.head()

In [None]:
# Main code for Naive Bayes: seeded 70/30 split, fit on train, predict on test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=301
)
NB_model = GaussianNB().fit(X_train, y_train)
y_pred = NB_model.predict(X_test)

In [None]:
# Make one prediction.
# The model was fitted on a DataFrame, so pass a DataFrame carrying the same
# column names; a bare nested list makes scikit-learn warn that X lacks the
# feature names seen during fit.
single_penguin = pd.DataFrame([[50]], columns=NB_model.feature_names_in_)  # value for the single feature column
print(NB_model.predict(single_penguin))

In [None]:
# Pin the class order explicitly so the matrix rows/columns are guaranteed to
# line up with the tick labels drawn on the heatmap.
species_labels = NB_model.classes_
confusion_mat = confusion_matrix(y_test, y_pred, labels=species_labels)

# Label ticks and axes: without them the plot shows bare 0/1/2 indices and
# gives no hint which axis holds the true class.
ax = sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=species_labels,
    yticklabels=species_labels,
)
ax.set_xlabel('Predicted species')  # confusion_matrix columns are predictions
ax.set_ylabel('True species')  # confusion_matrix rows are true labels
plt.show()

In [None]:
# Text summary of the metrics derived from the true/predicted label pairs.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

## Two predictor variables

In [None]:
# Keep the target and both numeric features, dropping incomplete rows.
penguins_subset = penguins[['species', 'bill_length_mm', 'flipper_length_mm']].dropna()

X = penguins_subset[['bill_length_mm', 'flipper_length_mm']]  # explanatory variables
y = penguins_subset['species']  # response variable (species label, not a Boolean)

# Preview last so the notebook actually renders it (a mid-cell expression is discarded).
penguins_subset.head()

In [None]:
# Main code for Naive Bayes: seeded 70/30 split, fit on train, predict on test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=301
)
NB_model = GaussianNB().fit(X_train, y_train)
y_pred = NB_model.predict(X_test)

In [None]:
# Make one prediction.
# The model was fitted on a DataFrame, so pass a DataFrame carrying the same
# column names (values are given in the column order used for fitting); a bare
# nested list makes scikit-learn warn that X lacks the feature names seen in fit.
single_penguin = pd.DataFrame([[50, 195]], columns=NB_model.feature_names_in_)
print(NB_model.predict(single_penguin))

In [None]:
# Pin the class order explicitly so the matrix rows/columns are guaranteed to
# line up with the tick labels drawn on the heatmap.
species_labels = NB_model.classes_
confusion_mat = confusion_matrix(y_test, y_pred, labels=species_labels)

# Label ticks and axes: without them the plot shows bare 0/1/2 indices and
# gives no hint which axis holds the true class.
ax = sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=species_labels,
    yticklabels=species_labels,
)
ax.set_xlabel('Predicted species')  # confusion_matrix columns are predictions
ax.set_ylabel('True species')  # confusion_matrix rows are true labels
plt.show()

In [None]:
# Text summary of the metrics derived from the true/predicted label pairs.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)