In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import time
import warnings
import json
warnings.filterwarnings("ignore")

# Load Raw and Projected Data

In [2]:
# Dataset 1 - digit recognition.
dfname = "digit_recognition"


def _read_indexed_csv(filename_template, **read_kwargs):
    """Read the CSV named by `filename_template % dfname`, using column 0 as the index."""
    return pd.read_csv(filename_template % dfname, index_col=0, **read_kwargs)


# Raw features plus one table per projection method.
# The ICA table and the targets are read with header=None (first row treated as data).
X_raw = _read_indexed_csv("%s_raw_X.csv")
X_pca = _read_indexed_csv("%s_PCA_transformed_X_full.csv")
X_ica = _read_indexed_csv("%s_ICA_transformed_X_300_components_sorted_by_kurtosis.csv", header=None)
X_rp = _read_indexed_csv("%s_Randomized Projection_transformed_X_full.csv")
X_fa = _read_indexed_csv("%s_Factor Analysis_transformed_X_full.csv")
Y = _read_indexed_csv("%s_corresponding_Y.csv", header=None)

# Numbers of leading components on which the neural network is evaluated.
components = [5, 10, 50, 100, 150, 200, 250]

In [3]:
# One hot encode target values
one_hot = OneHotEncoder(categories='auto')

# fit_transform returns a scipy sparse matrix. Convert it with .toarray() so Y
# is a plain 2-D ndarray: .todense() would produce a numpy.matrix, which
# scikit-learn deprecated as estimator input in 1.0 and rejects with a
# TypeError from 1.2 on (the deprecation warning was hidden by the
# warnings filter set in the import cell).
Y = one_hot.fit_transform(Y.values.reshape(-1, 1)).toarray()

# Neural Network Estimator

In [4]:
# Multi-layer perceptron shared by every experiment below.
# hidden_layer_sizes is written as a one-element tuple: the previous `(50)` was
# just the integer 50 in parentheses, not a tuple. The network is unchanged
# (a single hidden layer of 50 units); the intent is now explicit.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(50,),
    random_state=1,   # fixed seed so repeated fits start from the same weights
    max_iter=300,
)

# Metric names handed to cross_validate; results appear under 'test_f1_macro'.
scoring = ['f1_macro']

# Neural Network on Raw Data 

In [5]:
# Baseline: 3-fold cross-validation of the network on the unprojected features.
np.random.seed(5)
score = cross_validate(
    clf,
    X_raw,
    Y,
    scoring=scoring,
    cv=3,
    return_train_score=True,
)

# Average over the three folds; these are the reference values for the plots.
raw_fit_time = np.mean(score['fit_time'])
raw_cv_score = np.mean(score['test_f1_macro'])

# Neural Network on Projected Data 

In [None]:
# (label, feature table) pairs to evaluate; the Factor Analysis entry is disabled.
projections = [
    ('PCA', X_pca),
    ('ICA', X_ica),
    ('RP', X_rp),
    # ('FA', X_fa),
]

# Per-projection lists, one entry per value in `components` (same order).
scores = {}
fit_time = {}

for n in components:
    for name, X in projections:
        np.random.seed(0)
        # Evaluate on the first n columns of this projection only.
        leading_columns = X.iloc[:, :n]
        score = cross_validate(
            clf,
            leading_columns,
            Y,
            scoring=scoring,
            cv=3,
            return_train_score=True,
        )
        # setdefault creates the list the first time a projection is seen.
        scores.setdefault(name, []).append(score['test_f1_macro'].mean())
        fit_time.setdefault(name, []).append(score['fit_time'].mean())
        print("--- finish neural network on %s projected data of %d components ---" % (name, n))

# Persist the mean test F1 scores (fit times are not written out).
with open('%s_NN_scores_on_projected_data.json'%(dfname), 'w') as out_file:
    json.dump(scores, out_file)

In [None]:
def plotting(title, ylabel, df, base, projections, x_values=None, figsize=None):
    """Draw one line per projection plus a dashed baseline on a new figure.

    Parameters
    ----------
    title : str
        Title of the axes.
    ylabel : str
        Label of the y axis.
    df : mapping
        Indexed by projection name; each value is the sequence of y values
        plotted against `x_values`.
    base : float
        Height of the horizontal 'Raw data' reference line.
    projections : iterable of (name, data) pairs
        Only the name of each pair is used, to look up `df[name]` and to
        label the line.
    x_values : sequence, optional
        X coordinates of the points. Defaults to the notebook-level
        `components` list.
    figsize : tuple, optional
        Figure size in inches passed to `plt.subplots`. None keeps
        matplotlib's default size.

    Notes
    -----
    Exactly one figure is created per call. Previously a separate
    `plt.figure(figsize=(20, 20))` call opened an empty figure that was
    immediately superseded by `plt.subplots()`, so the requested size never
    reached the plot and a blank figure was left behind for `plt.show()`.
    """
    if x_values is None:
        x_values = components  # notebook-level list of component counts

    fig, ax = plt.subplots(figsize=figsize)
    for pname, _ in projections:
        ax.plot(x_values, df[pname], label=pname)
    ax.axhline(base, color='navy', alpha=0.5, linestyle='--', label='Raw data')
    ax.set_xlabel("Number of components")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    
# Cross-validated F1 score per projection, compared with the raw-data baseline.
plotting(
    title="Neural Network Performance on Projected Data",
    ylabel="Three fold cross validation F1 score",
    df=scores,
    base=raw_cv_score,
    projections=projections,
)
# Mean fit time per projection, compared with the raw-data baseline.
plotting(
    title="Neural Network Fitting Time on Projected Data",
    ylabel="Average fit time in seconds",
    df=fit_time,
    base=raw_fit_time,
    projections=projections,
)
plt.show()