# Challenge
Use RBM to perform feature extraction on an image-based dataset that you find or create. If you go this route, present the features you extract and explain why this is a useful feature extraction method in the context you’re operating in. DO NOT USE either the MNIST digit recognition database or the iris data set. They’ve been worked on in very public ways very very many times and the code is easily available. (However, that code could be a useful resource to refer to)

In [29]:
import numpy as np
import pandas as pd

from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [2]:
!cd ../../data_sets/f & ls

30
Neural_Net_Challenge.ipynb
supervised_neural_network.ipynb
test_zip.zip
train_zip.zip


The system cannot find the path specified.


data source: https://www.kaggle.com/sirsolim/images-of-primitive-3d-objects-for-classification/

In [3]:
!cd ../../data_sets/3d_images/ & ls

features_csv.csv
features_npy.npy
labels_csv.csv
labels_npy.npy


In [4]:
data_df = pd.read_csv(r'../../data_sets/3d_images/features_csv.csv')
data_df.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.1443,0.1444,0.1445,0.1446,0.1447,0.1448,0.1449,0.1450,0.1451,0.1452
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
data_df.shape

(479999, 1600)

In [6]:
data2_df = data_df.astype('bool').iloc[0:50_000,:]

In [7]:
y = pd.read_csv(r'../../data_sets/3d_images/labels_csv.csv')
y = y[0:50_000]

In [8]:
y.head()

Unnamed: 0,0,0.1,1,0.2
0,0,0,1,0
1,0,0,1,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [9]:
# Un-OHE y
y = y.idxmax(1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data2_df, y, test_size=0.33)

In [11]:
bern_rbm = BernoulliRBM(learning_rate=.05, n_iter=20, verbose=1)
bern_rbm.fit_transform(X_train, y_train)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -89.07, time = 79.90s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -96.52, time = 78.69s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -76.91, time = 80.34s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -62.70, time = 77.44s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -55.45, time = 79.38s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -59.09, time = 76.85s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -48.45, time = 78.05s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -49.04, time = 77.48s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -53.79, time = 77.28s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -56.59, time = 74.99s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -55.86, time = 75.19s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -54.04, time = 75.37s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -53.82, time = 75.24s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -45.61, time = 75.34s
[BernoulliRBM] 

array([[1.03082748e-46, 9.30662375e-27, 4.11849241e-29, ...,
        7.18410357e-27, 1.13055316e-36, 1.37203019e-03],
       [3.17321650e-36, 1.38326035e-07, 1.59648740e-09, ...,
        9.00356249e-20, 4.93128901e-47, 3.74385687e-54],
       [9.80285791e-23, 6.28860634e-11, 1.77132014e-09, ...,
        6.68386578e-35, 3.01741813e-59, 2.41167442e-55],
       ...,
       [4.38831131e-29, 3.28476690e-27, 3.98546694e-23, ...,
        3.19566011e-18, 9.99965006e-01, 5.72068534e-08],
       [3.20629062e-55, 1.29743238e-43, 4.70171620e-69, ...,
        2.28043329e-53, 1.37314004e-68, 1.00000000e+00],
       [6.95061108e-41, 1.94289416e-29, 1.68003512e-36, ...,
        2.60946920e-17, 9.99999995e-01, 1.00000000e+00]])

In [None]:
raw_logit = LogisticRegression(multi_class='multinomial', tol=1e-2, solver='saga', C=400)
raw_logit.fit(X_train, y_train)
raw_logit_pred = raw_logit.predict(X_test)

In [35]:
raw_class_report = pd.DataFrame(classification_report(y_true=y_test, y_pred = raw_logit_pred, output_dict=True))

In [None]:
FE_logit = LogisticRegression(multi_class='multinomial', tol=1e-3, solver='saga', C=400)

In [None]:
nn_pipe = Pipeline(steps=[('nn_feat_ext', bern_rbm), ('logit', FE_logit)]) 
nn_pipe.fit(X_train, y_train)

In [32]:
FE_pred = nn_pipe.predict(X_test)

In [34]:
nn_class_report = pd.DataFrame(classification_report(y_true=y_test, y_pred=FE_pred, output_dict=True))

In [36]:
raw_class_report

Unnamed: 0,0,0.1,0.2,1,accuracy,macro avg,weighted avg
f1-score,0.996963,0.983567,0.984381,0.986128,0.987758,0.98776,0.987757
precision,0.996842,0.983092,0.984986,0.986128,0.987758,0.987762,0.987757
recall,0.997084,0.984043,0.983776,0.986128,0.987758,0.987757,0.987758
support,4115.0,4136.0,4068.0,4181.0,0.987758,16500.0,16500.0


In [37]:
nn_class_report

Unnamed: 0,0,0.1,0.2,1,accuracy,macro avg,weighted avg
f1-score,0.894873,0.660503,0.692022,0.725938,0.745091,0.743334,0.743305
precision,0.86694,0.673793,0.697552,0.730332,0.745091,0.742154,0.742147
recall,0.924666,0.647727,0.686578,0.721598,0.745091,0.745142,0.745091
support,4115.0,4136.0,4068.0,4181.0,0.745091,16500.0,16500.0
