In [1]:
import pandas as pd

# read synthetic data
data = pd.read_csv('datasets/real_life_regressor.csv')
data.head()

Unnamed: 0,h_1,h_2,x_1,x_2,x_3,y
0,-0.507907,-0.426783,-0.348796,-0.15677,-0.197266,0
1,0.208406,0.57924,-0.049915,-0.135873,-0.057563,1
2,-0.408581,0.486722,-0.734377,-0.359439,-0.515557,1
3,-0.686362,0.810403,-0.127156,0.049691,-0.722663,1
4,-0.270678,0.418167,-0.747838,-0.360587,-0.447434,1


In [2]:
from sklearn.decomposition import FactorAnalysis, PCA, FastICA
from sklearn.manifold import LocallyLinearEmbedding

# use FA to extract features from the variables and create a dataframe for it
fa_latent = FactorAnalysis(n_components=2, random_state=0).fit_transform(data[['x_1', 'x_2', 'x_3']])
fa_latent = pd.DataFrame(fa_latent, columns=['z_1', 'z_2'])
fa_latent = pd.concat([data, fa_latent], axis=1)

# use PCA to extract features from the variables and create a dataframe for it
pca_latent = PCA(n_components=2, random_state=0).fit_transform(data[['x_1', 'x_2', 'x_3']])
pca_latent = pd.DataFrame(pca_latent, columns=['z_1', 'z_2'])
pca_latent = pd.concat([data, pca_latent], axis=1)

# use ICA to extract features from the variables and create a dataframe for it
ica_latent = FastICA(n_components=2, random_state=0).fit_transform(data[['x_1', 'x_2', 'x_3']])
ica_latent = pd.DataFrame(ica_latent, columns=['z_1', 'z_2'])
ica_latent = pd.concat([data, ica_latent], axis=1)

# use LLE to extract features from the variables and create a dataframe for it
lle_latent = LocallyLinearEmbedding(n_components=2, random_state=0).fit_transform(data[['x_1', 'x_2', 'x_3']])
lle_latent = pd.DataFrame(lle_latent, columns=['z_1', 'z_2'])
lle_latent = pd.concat([data, lle_latent], axis=1)

# import the latent space created with autoencoder
ae_latent = pd.read_csv('datasets/encoded_latent.csv')

ae_latent.head()

Unnamed: 0,h_1,h_2,x_1,x_2,x_3,y,z_1,z_2
0,-0.507907,-0.426783,-0.348796,-0.15677,-0.197266,0,12.530251,0.0
1,0.208406,0.57924,-0.049915,-0.135873,-0.057563,1,9.211573,0.470352
2,-0.408581,0.486722,-0.734377,-0.359439,-0.515557,1,19.174109,0.0
3,-0.686362,0.810403,-0.127156,0.049691,-0.722663,1,14.096041,0.0
4,-0.270678,0.418167,-0.747838,-0.360587,-0.447434,1,18.695078,0.0


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def apply_classifiers(latent):

    # create training and testing sets
    X, y = np.array(latent[['z_1', 'z_2']]), np.array(latent['y'].tolist())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

    # using KNNs for classification
    knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)
    print("KNN test score:", knn.score(X_test, y_test))
    print("KNN train score:", knn.score(X_train, y_train))

    # using SVMs for classification
    svm = SVC().fit(X_train, y_train)
    print("SVM test score:", svm.score(X_test, y_test))
    print("SVM train score:", svm.score(X_train, y_train))

    # using random forests for classification
    rfc = RandomForestClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
    print("RF test score:", rfc.score(X_test, y_test))
    print("RF train score:", rfc.score(X_train, y_train))

    # using logistic regression for classification
    lrc = LogisticRegression(random_state=0).fit(X_train, y_train)
    print("LR test score:", lrc.score(X_test, y_test))
    print("LR train score:", lrc.score(X_train, y_train))

In [4]:
for latent_space, name in zip([pca_latent, fa_latent, ica_latent, lle_latent, ae_latent],
                              ['PCA', 'FA', 'ICA', 'LLE', 'AE']):
    print(f'\n{name}')
    apply_classifiers(latent_space)


PCA
KNN test score: 0.9
KNN train score: 0.9053333333333333
SVM test score: 0.868
SVM train score: 0.896
RF test score: 0.892
RF train score: 0.94
LR test score: 0.668
LR train score: 0.688

FA
KNN test score: 0.828
KNN train score: 0.8893333333333333
SVM test score: 0.828
SVM train score: 0.8786666666666667
RF test score: 0.848
RF train score: 0.9066666666666666
LR test score: 0.628
LR train score: 0.68

ICA
KNN test score: 0.916
KNN train score: 0.936
SVM test score: 0.876
SVM train score: 0.9
RF test score: 0.872
RF train score: 0.924
LR test score: 0.748
LR train score: 0.7813333333333333

LLE
KNN test score: 0.88
KNN train score: 0.9266666666666666
SVM test score: 0.852
SVM train score: 0.8906666666666667
RF test score: 0.912
RF train score: 0.9533333333333334
LR test score: 0.464
LR train score: 0.5106666666666667

AE
KNN test score: 0.832
KNN train score: 0.8653333333333333
SVM test score: 0.8
SVM train score: 0.8306666666666667
RF test score: 0.872
RF train score: 0.9053333333