In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump, load
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# New learning model from: https://www.kaggle.com/alexisbcook/xgboost
from xgboost import XGBRegressor

df = pd.read_csv('diabetes.csv')

my_imputer = SimpleImputer()

df = df[df["BMI"] != 0]
df = df[df["BloodPressure"] != 0]
df = df[df["SkinThickness"] != 0]
df = df[df["Glucose"] != 0]

In [2]:
# X = df.drop(columns=["Outcome"])
# y = df["Outcome"]

X = df.copy()
y = X.pop("Outcome")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
xgbr_model = XGBRegressor()
xgbr_model.fit(X_train, y_train,
              early_stopping_rounds=5,
              eval_set=[(X_test, y_test)],
              verbose=False)
predictions = xgbr_model.predict(X_test)

print("XGBRegressor (no parameters): " + str(mean_absolute_error(predictions, y_test)))

XGBRegressor (no parameters): 0.3003842558572504


In [4]:
xgbr_model = XGBRegressor(n_estimators=500)
xgbr_model.fit(X_train, y_train)
predictions = xgbr_model.predict(X_test)
print("XGBRegressor: " + str(mean_absolute_error(predictions, y_test)))

XGBRegressor: 0.32239702122534025


In [5]:
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(my_imputer.transform(X_test))

imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

print(f'RandomForestRegressor: {score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)}')

RandomForestRegressor: 0.27289719626168224


In [6]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
X_test_predictions = model.predict(X_test)

print(f'DecisionTreeClassifier (no params): {mean_absolute_error(y_test, X_test_predictions)}')
# The lower the better in this case

DecisionTreeClassifier (no params): 0.27102803738317754


In [7]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

for max_leaf_nodes in [10, 30, 70, 100, 200, 400, 800, 1000, 1250, 1500]:
    my_mae = get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test)
    print(f"Max leaf nodes: {max_leaf_nodes}  \t\t Mean Absolute Error:  {my_mae}")

Max leaf nodes: 10  		 Mean Absolute Error:  0.2903849700383366
Max leaf nodes: 30  		 Mean Absolute Error:  0.2892094245959216
Max leaf nodes: 70  		 Mean Absolute Error:  0.32968095391556557
Max leaf nodes: 100  		 Mean Absolute Error:  0.2523364485981308
Max leaf nodes: 200  		 Mean Absolute Error:  0.2803738317757009
Max leaf nodes: 400  		 Mean Absolute Error:  0.24299065420560748
Max leaf nodes: 800  		 Mean Absolute Error:  0.2803738317757009
Max leaf nodes: 1000  		 Mean Absolute Error:  0.2803738317757009
Max leaf nodes: 1250  		 Mean Absolute Error:  0.2616822429906542
Max leaf nodes: 1500  		 Mean Absolute Error:  0.2897196261682243


In [8]:
forest_model = RandomForestRegressor()
forest_model.fit(X_train, y_train)
mousse_predicts = forest_model.predict(X_test)
print(f'RandomForestRegressor (no params): {mean_absolute_error(y_test, mousse_predicts)}')

RandomForestRegressor (no params): 0.2896261682242991
