In [None]:
!pip install autokeras

## AutoML (automating the search for the best model)
### For the classification problem of predicting credit risk, use AutoKeras to arrive at the best neural network architecture.


In [None]:
from sklearn.datasets import fetch_openml
import autokeras as ak

# Pin the dataset version: when no version is given, fetch_openml resolves to
# the "active" one, and scikit-learn's documentation recommends setting an
# exact version so that re-runs load the same data.
df = fetch_openml("credit-g", version=1)
X = df["data"]        # feature matrix
Y_raw = df["target"]  # class labels, not yet encoded

# overwrite=True discards any earlier search results stored under the same
# project name; max_trials caps the number of candidate models tried.
# A fixed seed makes the architecture search repeatable between runs.
classifier = ak.StructuredDataClassifier(overwrite=True, max_trials=5, seed=42)

In [None]:
# List the attribute names available on the fetched dataset object.
dir(df)

In [None]:
# Show the dataset's description text.
print(df.DESCR)

In [None]:
# Encode the data, split it into train and test sets, then scale the features
# using statistics learned from the training split only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# fetch_openml can return a pandas DataFrame whose nominal features are
# `category` columns holding strings, which MinMaxScaler cannot convert to
# float. Swap those columns for their integer category codes first. The codes
# are defined by the column dtype rather than by the rows present, so doing
# this before the split does not leak information.
# NOTE(review): a missing value would be encoded as -1 - confirm none exist.
X_numeric = X
if hasattr(X, "select_dtypes"):
    X_numeric = X.copy()
    for column in X_numeric.select_dtypes(include="category").columns:
        X_numeric[column] = X_numeric[column].cat.codes

# Map the string class labels to integers.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y_raw)

X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_numeric, Y, test_size=0.33, random_state=42
)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# that the test set's min/max never influence the preprocessing. Scaled test
# values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Run the AutoKeras search on the training split, then score the result on the
# held-out test split. The evaluation result is the cell's displayed output.

classifier.fit(X_train, Y_train)
test_metrics = classifier.evaluate(X_test, Y_test)
test_metrics

## TBD: Show the best architecture found by autokeras

In [None]:
# Export the model found by the AutoKeras search and print its layer summary.
model = classifier.export_model()
model.summary()

## TBD: Can you beat the evaluation score of the above architecture with any other manually selected model (including non-neural-network classifiers)?

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

def model_param_optimiser(models, scoring='neg_mean_squared_error'):
    """Grid-search every candidate model and print the best one.

    Parameters
    ----------
    models : sequence of (estimator, param_grid) pairs
        Each estimator is tuned with GridSearchCV over its own parameter grid.
    scoring : str or callable, default 'neg_mean_squared_error'
        Forwarded unchanged to GridSearchCV.

    Returns
    -------
    None
        The winning score, estimator and parameters are printed.

    Notes
    -----
    Fits on the notebook-level ``X_train`` and ``Y_train`` variables, so the
    cell that creates them must have been run first.
    """
    if not models:
        # Nothing to compare; avoid indexing into an empty result list.
        print("No candidate models supplied; nothing to search.")
        return

    searches = []
    for candidate in models:
        estimator, param_grid = candidate[0], candidate[1]
        gs = GridSearchCV(estimator, param_grid, scoring=scoring)
        gs.fit(X_train, Y_train)
        searches.append(gs)

    # scikit-learn scorers follow a greater-is-better convention (losses are
    # negated), so the largest best_score_ wins for every scorer. Comparing
    # absolute values would only be right for negated losses.
    top = max(range(len(searches)), key=lambda i: searches[i].best_score_)

    print("Top Score:" + str(searches[top].best_score_))
    print("Model:" + str(models[top][0]))
    print("Params:" + str(searches[top].best_params_))

# Candidate estimators, each paired with the hyper-parameter grid to search.
# These are the classifier classes imported at the top of this cell, since the
# target is a class label.
no_param = {}
parameters_knn = {'n_neighbors': [4, 5, 6], 'p': [1, 2]}
# subsample is a fraction of the training rows and must lie in (0, 1].
parameters_xgboost = {'n_estimators': [50, 100, 200], 'max_depth': [2, 3, 4, 5, 6, 7], 'subsample': [0.8, 0.9, 1.0]}
models = [
    # A higher iteration cap gives the solver room to converge.
    (LogisticRegression(max_iter=1000), no_param),
    (KNeighborsClassifier(), parameters_knn),
    # Row subsampling is random, so fix the seed to make the search repeatable.
    (GradientBoostingClassifier(random_state=42), parameters_xgboost),
]

model_param_optimiser(models)

In [None]:
# Train a gradient-boosting classifier on the training split and report its
# mean accuracy on the held-out test split. The seed matches the one used for
# the train/test split so the reported score is repeatable.
model = GradientBoostingClassifier(n_estimators=50, max_depth=3, subsample=1.0, random_state=42)
model.fit(X_train, Y_train)
model.score(X_test, Y_test)

# Therefore, the gradient-boosting score of 0.76666 ≈ 0.767 beats AutoKeras' 0.736

## Auto Data Understanding
### [Tensorflow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv)

## Auto Data Exploration
### [pandas-profiling](https://github.com/pandas-profiling/pandas-profiling)