In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Iris measurements from the project's CSV copy.
iris = pd.read_csv('datasets/iris.csv')

# Target: the species label. Features: the four sepal/petal measurements.
y = iris['Species']
X = iris[[
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]]

# Hold out 25% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=99,
)

# Display the training features as the cell output.
X_train

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
112,6.8,3.0,5.5,2.1
97,6.2,2.9,4.3,1.3
86,6.7,3.1,4.7,1.5
67,5.8,2.7,4.1,1.0
126,6.2,2.8,4.8,1.8
...,...,...,...,...
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8
68,6.2,2.2,4.5,1.5
35,5.0,3.2,1.2,0.2


In [2]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression trained on the feature columns
# in X_train (no dimensionality reduction yet).
logistic_regression = LogisticRegression(random_state=0)
logistic_regression.fit(X_train, y_train)

# Score the fitted model on both splits so the two numbers can be compared.
print('Train accuracy:', logistic_regression.score(X_train, y_train))
print('Test accuracy', logistic_regression.score(X_test, y_test))

Train accuracy: 0.9821428571428571
Test accuracy 0.9473684210526315


In [3]:
from sklearn.decomposition import PCA

# Learn a 2-component PCA projection from the training split only, so the
# test split does not influence the fitted components.
pca = PCA(n_components=2)
pca.fit(X_train)

# Apply the same fitted projection to both splits.
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)

# Display the projected training data as the cell output.
X_train_reduced

array([[ 2.19622396,  0.23146295],
       [ 0.67205534,  0.03098777],
       [ 1.24772949,  0.42395007],
       [ 0.2669811 , -0.31949498],
       [ 1.2913774 , -0.16930847],
       [ 2.64097743,  0.58639599],
       [ 0.16785524, -0.30438491],
       [-2.47906876,  0.63909537],
       [-2.47672418, -0.14896524],
       [-3.18852067, -0.52335592],
       [ 3.25435793,  1.40116004],
       [ 0.95369461,  0.48276905],
       [-2.59944578,  0.59463769],
       [-0.71202576, -1.00280897],
       [ 2.18287663,  0.14315914],
       [ 3.42221818,  0.58195916],
       [ 1.45300434, -0.56677622],
       [ 3.82029583,  0.29218426],
       [-2.73879691,  0.25245875],
       [ 2.00556838, -0.16329202],
       [ 1.48980346,  0.52401926],
       [ 0.5000821 , -0.6593731 ],
       [-2.64223986, -0.11782498],
       [-2.62064354,  1.17179743],
       [-2.51420629,  0.57246727],
       [-0.14006601, -0.25262627],
       [ 1.12723154,  0.29267201],
       [ 2.4538049 ,  0.31190316],
       [ 1.9385221 ,

In [4]:
# Train a fresh classifier on the 2-component PCA projection.
# NOTE: this rebinds `logistic_regression`, so from here on the name refers
# to the PCA-based model rather than the one fitted on the original features.
logistic_regression = LogisticRegression(random_state=0)
logistic_regression.fit(X_train_reduced, y_train)

# Score the reduced-feature model on both splits.
print('Train accuracy:', logistic_regression.score(X_train_reduced, y_train))
print('Test accuracy', logistic_regression.score(X_test_reduced, y_test))

Train accuracy: 0.9732142857142857
Test accuracy 0.9473684210526315


In [5]:
# Share of the total variance captured by each principal component,
# scaled from a ratio to a percentage.
100 * pca.explained_variance_ratio_

array([92.53592093,  5.18796309])