# Evaluate Logistic Regression Model on raw data

In [27]:
import sklearn
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# synthetic classification dataset
from sklearn.datasets import make_classification

In [28]:
# build the seeded synthetic classification problem:
# 1000 rows x 20 features, of which 10 are informative and 10 are redundant
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    random_state=7,
)

# sanity-check the dimensions of the feature matrix and the label vector
print(X.shape, y.shape)

# results table, initially empty; the evaluation cells below append one row per
# technique as [name, mean accuracy, accuracy standard deviation].
# Column names carry no leading/trailing whitespace so they can be selected by name.
df = pd.DataFrame({'Technique': [], 'Accuracy Mean': [], 'Accuracy SD': []})

(1000, 20) (1000,)


In [29]:
# baseline: logistic regression fitted directly on the raw feature matrix
model = LogisticRegression()

# score it with stratified 10-fold cross-validation repeated 3 times
# (10 x 3 = 30 accuracy values); the splitter is seeded so the folds are repeatable
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# summarise the fold scores as "mean (standard deviation)"
acc_mean, acc_sd = mean(n_scores), std(n_scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_sd))

Accuracy: 0.824 (0.034)


In [30]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [31]:
# define the pipeline: project onto 10 principal components, then fit logistic regression.
# random_state pins PCA's randomized/ARPACK solvers (should the default 'auto' solver
# select one of them), so reruns reproduce the same scores -- consistent with the seeded
# dataset and CV splitter used throughout this notebook.
steps = [('pca', PCA(n_components=10, random_state=1)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)

# evaluate model: the whole pipeline is passed to cross_val_score, so PCA is
# re-fitted inside every training fold rather than on the full dataset
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance; compute the summary statistics once and reuse them
# for both the printed line and the results-table row
acc_mean, acc_sd = mean(n_scores), std(n_scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_sd))
df.loc[len(df.index)] = ['PCA', acc_mean, acc_sd]

Accuracy: 0.824 (0.034)


In [32]:
from sklearn.decomposition import TruncatedSVD

In [33]:
# define the pipeline: truncated SVD down to 10 components, then logistic regression.
# TruncatedSVD uses a randomized algorithm by default, so it is seeded here to make
# reruns reproducible -- consistent with the seeded dataset and CV splitter.
steps = [('svd', TruncatedSVD(n_components=10, random_state=1)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)

# evaluate model: the whole pipeline is passed to cross_val_score, so the SVD is
# re-fitted inside every training fold rather than on the full dataset
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance; compute the summary statistics once and reuse them
# for both the printed line and the results-table row
acc_mean, acc_sd = mean(n_scores), std(n_scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_sd))
df.loc[len(df.index)] = ['SVD', acc_mean, acc_sd]

Accuracy: 0.824 (0.034)


In [34]:
from sklearn.random_projection import GaussianRandomProjection

In [35]:
# pipeline: seeded Gaussian random projection to 10 dimensions, then logistic regression
projector = GaussianRandomProjection(n_components=10, random_state=1)
classifier = LogisticRegression()
steps = [('rp', projector), ('m', classifier)]
model = Pipeline(steps=steps)

# cross-validate the whole pipeline (stratified 10-fold, repeated 3 times, seeded)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# print "mean (standard deviation)" and append the same figures to the results table
acc_mean, acc_sd = mean(n_scores), std(n_scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_sd))
df.loc[len(df.index)] = ['Random Projection', acc_mean, acc_sd]

Accuracy: 0.823 (0.035)


In [36]:
from sklearn.manifold import LocallyLinearEmbedding

In [37]:
# define the pipeline: locally linear embedding to 10 dimensions, then logistic regression.
# random_state is only consulted by LLE's ARPACK eigen-solver; it is set so the step stays
# reproducible if that solver is selected, matching the other seeded steps in this notebook.
# NOTE(review): with the default eigen_solver this may be a no-op for this configuration.
steps = [('lle', LocallyLinearEmbedding(n_components=10, random_state=1)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)

# evaluate model: the whole pipeline is passed to cross_val_score, so the embedding
# is re-fitted inside every training fold rather than on the full dataset
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance; compute the summary statistics once and reuse them
# for both the printed line and the results-table row
acc_mean, acc_sd = mean(n_scores), std(n_scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_sd))
df.loc[len(df.index)] = ['LLE', acc_mean, acc_sd]

Accuracy: 0.886 (0.028)


In [38]:
# display the results table: one row per technique with its mean accuracy and
# accuracy standard deviation, as appended by the evaluation cells above
df

Unnamed: 0,Technique,Accuracy Mean,Accuracy SD
0,PCA,0.824333,0.033831
1,SVD,0.824333,0.033831
2,Random Projection,0.822667,0.034635
3,LLE,0.886,0.028355
