In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf
from tensorflow import keras

In [6]:
def fetch_data(drop_some=True, path="../data/abnormal_writeout.data.csv"):
    """Load the data table from a CSV file, optionally removing unused columns.

    Parameters
    ----------
    drop_some : bool, default True
        When true, remove every column located between "ACC" and "UVM"
        (both included, by position in the file), plus the
        "oldest_phylostratum_factor" and "Unnamed: 0" columns.
    path : str or path-like, default "../data/abnormal_writeout.data.csv"
        Location of the CSV file. The default is the path this notebook
        has always read from, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, if requested, trimmed) table.

    Raises
    ------
    KeyError
        If ``drop_some`` is true and one of the expected columns is missing.
    """
    df = pd.read_csv(path)

    if drop_some:
        # Ignore everything from "ACC" through "UVM" (inclusive).
        start_drop = df.columns.get_loc("ACC")
        end_drop = df.columns.get_loc("UVM")
        ranged_cols = df.columns[start_drop : end_drop + 1]

        # Also ignore the factor version of the phylostratum column and the
        # first column ("Unnamed: 0"). Dropping returns new frames instead of
        # mutating in place.
        df = df.drop(columns=ranged_cols).drop(
            columns=["oldest_phylostratum_factor", "Unnamed: 0"]
        )

    return df


def separate_data(df):
    """Split a data frame into three feature groups and the response.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "response", "occ_total_sum" and
        "oldest_phylostratum"; every other column is treated as a
        remaining feature.

    Returns
    -------
    tuple of numpy.ndarray
        ``(occ, age, conf, resp)`` where ``occ`` holds "occ_total_sum",
        ``age`` holds "oldest_phylostratum", ``conf`` is the 2-D array of
        all remaining columns and ``resp`` holds "response".
    """
    named_cols = ["response", "occ_total_sum", "oldest_phylostratum"]
    resp_arr, occ_arr, age_arr = (df[col].to_numpy() for col in named_cols)

    # Whatever is left after removing the three named columns.
    remaining_arr = df.drop(columns=named_cols).to_numpy()

    return occ_arr, age_arr, remaining_arr, resp_arr


def get_PCA(X, expl_var=0.95, plot=False):
    """Project ``X`` onto the fewest principal components reaching ``expl_var``.

    A full PCA is fitted first to obtain the cumulative explained-variance
    curve; the smallest number of components ``d`` whose cumulative ratio is
    at least ``expl_var`` is then used for a second, truncated PCA.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.
    expl_var : float, default 0.95
        Target fraction of variance to retain. If the cumulative ratio never
        reaches this value (e.g. a target of 1.0 missed by floating-point
        rounding), all components are kept.
    plot : bool, default False
        When truthy, show a heatmap of the correlations between the
        projected components.

    Returns
    -------
    numpy.ndarray of shape (n_samples, d)
        The data expressed in the first ``d`` principal components.
    """
    # Imported locally because no visible notebook cell imports PCA; this keeps
    # the function usable regardless of which cells have been run.
    from sklearn.decomposition import PCA

    pca_test = PCA()
    pca_test.fit(X)
    cumsum = np.cumsum(pca_test.explained_variance_ratio_)

    # np.argmax on an all-False mask returns 0, which would silently select a
    # single component; fall back to every component in that case.
    reached = cumsum >= expl_var
    d = int(np.argmax(reached)) + 1 if reached.any() else len(cumsum)

    # Apply PCA with d components
    pca_apply = PCA(n_components=d)
    X_PCA = pca_apply.fit_transform(X)

    # Report the reduction relative to the actual number of input features
    # (previously a hard-coded 784).
    n_features = np.shape(X)[1]
    print("Using {} principal components.".format(d))
    print(f"Reduced features by {(n_features-d)/n_features*100:.2f} percent.")
    if plot:
        sns.heatmap(pd.DataFrame(X_PCA).corr())
        plt.show()

    return X_PCA

In [10]:
# Load the trimmed table and keep only the rows without missing values.
df = fetch_data().dropna()

# Pull out the individual feature groups and the response vector.
X_occ, X_age, X_conf, Y = separate_data(df)

# Assemble the feature matrix: the two 1-D arrays become the first two
# columns, followed by all remaining feature columns.
X = np.column_stack((X_occ, X_age, X_conf))

In [13]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [42]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.metrics import CategoricalCrossentropy, AUC, Recall

# Binary cross-entropy pairs with the single sigmoid output unit built below.
# The previous value was an instance of the `CategoricalCrossentropy` *metric*
# class (imported from keras.metrics), which is not a loss function and does
# not fit a one-unit output.
# NOTE(review): assumes `response` is a binary 0/1 label -- confirm.
loss = "binary_crossentropy"
metric = AUC  # not used by create_baseline; kept in case other cells reference it
optimizer = SGD()  # configuration template; every model gets its own copy


def create_baseline(input_dim=None):
    """Build and compile the baseline network: one hidden layer, one output.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to ``X_train.shape[1]`` so that
        existing zero-argument calls keep working.

    Returns
    -------
    A compiled Keras ``Sequential`` model with a 60-unit ReLU hidden layer and
    a single sigmoid output unit, tracking AUC and recall.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=input_dim, activation='relu'))
    # Sigmoid keeps the prediction in [0, 1], which the loss and the AUC /
    # Recall metrics expect; the former ReLU output was unbounded.
    model.add(Dense(1, activation='sigmoid'))

    # Give each model a fresh optimizer with the same settings as the
    # module-level one, so repeated builds (e.g. one per cross-validation
    # fold) do not share optimizer state.
    model_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile model
    model.compile(loss=loss, optimizer=model_optimizer, metrics=[AUC(), Recall()])
    return model

In [43]:
# Build one instance of the baseline network and print its layer-by-layer
# summary (output shapes and parameter counts).
nn_clf = create_baseline()
nn_clf.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 60)                5100      
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 61        
Total params: 5,161
Trainable params: 5,161
Non-trainable params: 0
_________________________________________________________________
