### Principal Component Analysis (PCA) using Scikit Learn

### Importing the libraries 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
dataset = pd.read_csv('Wine.csv')
dataset.head(5)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


### Separting the features and labels

In [3]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
X.shape

(178, 13)

### Splitting the datset into the Training set and Test Set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
X_train

array([[ 0.87668336,  0.79842885,  0.64412971, ...,  0.0290166 ,
        -1.06412236, -0.2059076 ],
       [-0.36659076, -0.7581304 , -0.39779858, ...,  0.0290166 ,
        -0.73083231, -0.81704676],
       [-1.69689407, -0.34424759, -0.32337513, ...,  0.90197362,
         0.51900537, -1.31256499],
       ...,
       [-0.70227477, -0.68615078, -0.65828065, ...,  0.46549511,
         0.51900537, -1.31256499],
       [ 1.13777093, -0.62316862, -0.91876272, ..., -0.18922266,
         1.03282752,  0.80164614],
       [ 1.4610222 ,  0.12361993,  0.42085937, ..., -1.45501034,
        -1.2168803 , -0.2719767 ]])

### Applying PCA

In [8]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)

PCA(n_components=0.95)

In [9]:
print("pca components :\n",pca.components_)
print("pca explained variance :\n\n",pca.explained_variance_)

pca components :
 [[ 0.12959991 -0.24464064 -0.01018912 -0.24051579  0.12649451  0.38944115
   0.42757808 -0.30505669  0.30775255 -0.11027186  0.30710508  0.37636185
   0.2811085 ]
 [-0.49807323 -0.23168482 -0.31496874  0.02321825 -0.25841951 -0.1006849
  -0.02097952 -0.0399057  -0.06746036 -0.53087111  0.27161729  0.16071181
  -0.36547344]
 [ 0.1383815  -0.08422378 -0.63988217 -0.62195017 -0.02997765 -0.17115651
  -0.12464239 -0.17100464 -0.15155321  0.14612801 -0.10692701 -0.17144241
   0.13178214]
 [-0.24199813 -0.03501265 -0.00978418  0.0922443   0.82788074 -0.16969861
  -0.12346501 -0.39521252  0.06413801 -0.10827243 -0.05295673 -0.14487111
  -0.07973043]
 [-0.17279973  0.58431883 -0.26362755 -0.02622374 -0.08753303  0.16632648
   0.10686668 -0.10735201  0.52323021  0.02931966 -0.38019616  0.14033771
  -0.24774953]
 [-0.17873289 -0.50041745 -0.16064973  0.0559492   0.07810127  0.0217019
  -0.04491501  0.45316488  0.58035398  0.27220968  0.00633457 -0.25220324
  -0.01400986]
 [-0.0

In [10]:
X_train_trns = pca.transform(X_train)

print("original shape:   ", X.shape)
print("transformed shape:", X_train_trns.shape)

original shape:    (178, 13)
transformed shape: (142, 10)


In [11]:
X_train_transformed = pca.fit_transform(X_train)
X_test_transformed = pca.transform(X_test)

### Training the Logistic Regression model on the Training set

In [12]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_transformed, y_train)

LogisticRegression(random_state=0)

### Making confusion Matrix

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test_transformed)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix :\n', cm)
print('Accuracy Score :',accuracy_score(y_test, y_pred))

Confusion Matrix :
 [[14  0  0]
 [ 0 16  0]
 [ 0  0  6]]
Accuracy Score : 1.0


### Visualising the Training set results

In [14]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_train_transformed, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

ValueError: X has 2 features per sample; expecting 10

### Visualising the Test set Results

In [None]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_test_transformed, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()