In [1]:
# Note: This code assumes you've already installed anaconda, from anaconda.org
# 
# These three are standard for plotting and loading data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn is a super useful library! It is large, so people generally
# import only the specific functions they need
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize, StandardScaler
# This is usually installed separately from Anaconda, but it has gotten much easier
# to install; you can now use pip.  In other words, in a terminal, type:
# 
#    pip install xgboost
# 
# https://xgboost.readthedocs.io/en/latest/build.html
import xgboost



Normally pandas will auto-detect column names if they're included in the file as a header, but because the Pima Indians Diabetes dataset is a little bit older, its data file has no header row, so we have to supply the column names ourselves.


columns = ['Number of times pregnant',
           'Plasma glucose concentration a 2 hours in an oral glucose tolerance test',
           'Diastolic blood pressure (mm Hg)', 
           'Triceps skinfold thickness (mm)',
           '2-Hour serum insulin (mu U/ml)',
           'Body mass index (weight in kg/(height in m)^2)',
           'Diabetes pedigree function',
           'Age',
           'Y']


In [2]:
# The data file has no header row, so the column labels are supplied by hand
# (shortened versions of the full descriptions). 'Y' is the last column.
columns = [
    '# times pregnant',
    'Plasma glucose concentration',
    'Diastolic blood pressure',
    'Triceps skinfold thickness',
    '2-Hour serum insulin',
    'Body mass index',
    'Diabetes pedigree function',
    'Age',
    'Y',
]

# "df" is the conventional short name for a pandas DataFrame.
# header=None spells out what passing `names` already implies: row 0 is data.
df = pd.read_csv(
    'pima-indians-diabetes-data.txt',
    header=None,
    names=columns,
)



In [3]:
# Quick sanity check of the load: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   # times pregnant              768 non-null    int64  
 1   Plasma glucose concentration  768 non-null    int64  
 2   Diastolic blood pressure      768 non-null    int64  
 3   Triceps skinfold thickness    768 non-null    int64  
 4   2-Hour serum insulin          768 non-null    int64  
 5   Body mass index               768 non-null    float64
 6   Diabetes pedigree function    768 non-null    float64
 7   Age                           768 non-null    int64  
 8   Y                             768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Peek at the first few rows to eyeball the values under each column label.
df.head()

Unnamed: 0,# times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skinfold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,Y
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Target vector: the 'Y' column. Feature matrix: every remaining column.
y = df['Y']
X = df.drop(columns='Y')

In [6]:
# Single seed for the notebook; it is passed to train_test_split so the
# train/test partition is the same on every run. Change it to get a different split.
random_state = 1


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

# Logistic Regression

Don't overlook logistic regression in the rush to try Random Forests or Gradient Boosted Trees.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

For regression tasks, look at Linear Regression instead.

In [9]:
# Fit the baseline classifier; fit() returns the estimator, so it can be chained.
# max_iter is raised to 1000 -- presumably to avoid convergence warnings on the
# unscaled features (TODO confirm).
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class labels feed accuracy and F1; ROC AUC needs scores, so it uses the
# second predict_proba column (the probability assigned to class 1).
y_pred_log = log_model.predict(X_test)
y_prob_log = log_model.predict_proba(X_test)

accuracy, f1, roc_auc = (
    accuracy_score(y_test, y_pred_log),
    f1_score(y_test, y_pred_log),
    roc_auc_score(y_test, y_prob_log[:, 1]),
)

# Last expression of the cell, so the notebook displays the three scores.
accuracy, f1, roc_auc

(0.7792207792207793, 0.6458333333333333, 0.8416896235078053)

# XGBoost

a.k.a. "X-treme Gradient Boosting"

In [11]:
# XGBClassifier exposes many hyperparameters that affect model quality; only the
# number of boosting rounds (n_estimators) is set here, everything else is default.
# The notebook-wide seed is passed as well so the model stays reproducible if
# row/column subsampling is enabled later.
xgb_model = xgboost.XGBClassifier(n_estimators=100, random_state=random_state)

In [12]:
# Train the boosted-tree model on the same split used for logistic regression.
xgb_model.fit(X_train, y_train)

# Class labels for accuracy/F1, per-class probabilities for ROC AUC.
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)

# Same three metrics as the logistic-regression cell; ROC AUC is scored on the
# second probability column (class 1).
accuracy, f1, roc_auc = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob[:, 1]),
)

# Display the scores as the cell's output.
accuracy, f1, roc_auc

(0.7987012987012987, 0.7102803738317757, 0.8642791551882462)