In [None]:
# Classification: the predicted (target) variable is categorical,
# e.g. Fraud / Not Fraud, Default / Not Default.
import pandas

# Location of the Pima CSV file; kept in one named constant so it is easy to change.
PIMA_CSV_URL = "https://modcom.co.ke/data/datasets/pima.csv"
data = pandas.read_csv(PIMA_CSV_URL)

# Preview the first 20 records.
data.head(n=20)

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,Diabetic
1,1,85,66,29,0,26.6,0.351,31,Not Diabetic
2,8,183,64,0,0,23.3,0.672,32,Diabetic
3,1,89,66,23,94,28.1,0.167,21,Not Diabetic
4,0,137,40,35,168,43.1,2.288,33,Diabetic
5,5,116,74,0,0,25.6,0.201,30,Not Diabetic
6,3,78,50,32,88,31.0,0.248,26,Diabetic
7,10,115,0,0,0,35.3,0.134,29,Not Diabetic
8,2,197,70,45,543,30.5,0.158,53,Diabetic
9,8,125,96,0,0,0.0,0.232,54,Diabetic


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the reported minimum is 0 for Glucose, BloodPressure, SkinThickness,
# Insulin and BMI, which looks like missing values stored as zeros - confirm before modelling.
data.describe()

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [None]:
#data.isnull() 

In [None]:
# Count the records in each Outcome class to check how balanced the classes are.
data.groupby('Outcome').size()

Outcome
Diabetic        268
Not Diabetic    500
dtype: int64

In [None]:
# Keep only the five predictor columns used for modelling, followed by the target column.
feature_columns = ['Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
subset = data[feature_columns + [target_column]]

In [None]:
# Step 1: Split into X (independent variables) and Y (dependent variable)
array = subset.values
# Every column except the last one is a feature. Slicing relative to the end
# (instead of hard-coding 0:5) keeps this correct if the feature list changes.
# `array` mixes numeric columns with the string Outcome labels, so it is an
# object array; cast the features to float so X is a proper numeric matrix.
X = array[:, :-1].astype(float)
# The last column (Outcome) is the label to predict.
Y = array[:, -1]

In [None]:
# Recursive Feature Elimination (RFE) with an AdaBoost classifier as the estimator,
# used to identify which features affect the outcome.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFE
estimator = AdaBoostClassifier(random_state=0, n_estimators=100)
# NOTE(review): n_features_to_select=5 equals the number of columns in X, so no
# feature is eliminated and every feature is ranked 1. Lower this value to get
# an informative ranking.
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)

# Named `support_mask` (not `filter`) so the built-in filter() is not shadowed
# for the rest of the notebook.
support_mask = selector.support_
ranking = selector.ranking_

print("Mask data: ", support_mask)
print("Ranking: ", ranking)

Mask data:  [ True  True  True  True  True]
Ranking:  [1 1 1 1 1]


In [None]:
# Oversampling: synthesise extra minority-class records so that both classes
# are equally represented. This addresses class imbalance; it does not by
# itself guarantee a more accurate model.
# NOTE(review): if resampled records are split into train/test afterwards,
# synthetic samples end up in the test set. For an honest evaluation, resample
# the training portion only.
from imblearn.over_sampling import SMOTE
# transform the dataset (fixed seed so the synthetic samples are reproducible)
oversample = SMOTE(random_state=42)
oversampledX, oversampledY = oversample.fit_resample(X, Y)
# len(oversampledY)  # check that the number of records has increased


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# StandardScaler standardises each feature around a mean of 0.
# MinMaxScaler (imported, not used in this cell) would instead rescale each
# feature into a fixed range such as 0 to 1 or -1 to 1.
scaler = StandardScaler()
# Learn the per-feature statistics, then transform the data with them.
scaler.fit(oversampledX)
scaledX = scaler.transform(oversampledX)

In [None]:
# Step 2: Split into a training set and a held-out testing set
# The 768 original records are split 70% (training) / 30% (testing).
#
# The split is done on the raw X / Y, BEFORE oversampling and scaling.
# Splitting data that was already oversampled and scaled as a whole leaks
# information into the evaluation: synthetic SMOTE samples (interpolated from
# training neighbours) land in the test set, and the scaler's mean/std are
# computed using test records. Both make the test score unreliable.
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# stratify=Y keeps the Diabetic / Not Diabetic ratio the same in both splits.
X_train_raw, X_test_raw, Y_train_raw, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y)

# Balance the classes in the training split only; the test split keeps the
# real-world class distribution and contains no synthetic records.
X_train_balanced, Y_train = SMOTE(random_state=42).fit_resample(X_train_raw, Y_train_raw)

# Fit the scaler on the training data only, then apply the same transformation
# to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_balanced)
X_test = scaler.transform(X_test_raw)

# X_train consists of Glucose, Insulin, BMI, DiabetesPedigreeFunction, Age - training portion (balanced, scaled)
# Y_train consists of Outcome - training portion (balanced)

# X_test consists of Glucose, Insulin, BMI, DiabetesPedigreeFunction, Age - 30% (scaled)
# Y_test consists of Outcome - 30%

In [None]:
# Step 2a: Cross-validation of candidate algorithms/models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# (label, estimator) pairs to compare. The estimators that use randomness
# internally get a fixed random_state so the printed scores are reproducible.
models = [
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('GB', GradientBoostingClassifier(random_state=42)),
    ('RFR', RandomForestClassifier(random_state=42)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
]
# KFOLD - Cross Validation
# The splitter does not depend on the model, so it is built once outside the
# loop; with a fixed random_state every model is scored on the same 10 folds.
kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    # Mean accuracy across the 10 folds.
    print(name, cv_results.mean())

KNN 0.7757142857142857
DT 0.7428571428571429
GB 0.8157142857142856
RFR 0.8128571428571428
SVM 0.7742857142857142
NB 0.7371428571428572


In [None]:
# Step 3: Create and train the chosen Machine Learning model/algorithm
# random_state is fixed so the fitted forest, and therefore the test score and
# predictions below, are the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)
print('Training happens at this point..')

Training happens at this point..


In [None]:
# Step 4: Test your model
# Predict an Outcome label for every record in the held-out test features.
predictions = model.predict(X_test)
# Uncomment to compare the predicted labels with the actual labels side by side:
#print('Model Predictions ', predictions)
#print('Actual Values ', Y_test)

In [None]:
# Step 5: Get metrics using accuracy
from sklearn.metrics import accuracy_score
# Fraction of test records whose predicted label matches the actual label.
test_accuracy = accuracy_score(Y_test, predictions)
print('Score  ', test_accuracy)

# Known problems: 1. Unbalanced data, 2. Data is not well standardized, 3. Few records, 4. Some features might be noise.

Score   0.7566666666666667


In [None]:
# Predict the outcome for one new person.
# The accuracy printed in Step 5 (about 0.76 in the recorded run) describes the
# model over the whole test set; it is not a confidence for this one prediction.
# Values must be in the training column order:
# Glucose, Insulin, BMI, DiabetesPedigreeFunction, Age
person = [[144, 100, 23, 0.677, 42]]
# The model was trained on standardised features, so the raw values must go
# through the same fitted scaler first; raw values would be read on the wrong
# scale and give a meaningless prediction.
person_scaled = scaler.transform(person)
outcome = model.predict(person_scaled)
print('The outcome is likely to be ', outcome)

The outcome is likely to be  ['Diabetic']
