# PCA Homework

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Part 1

In [2]:
#loading iris dataset and standard scaler
iris = load_iris()
scaler = StandardScaler()

In [3]:
#created dataframe to split data into features & label
X, y = pd.DataFrame(data=iris.data, columns=iris.feature_names), pd.DataFrame(data=iris.target, columns=["iris_type"])
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
#using scaler to transform iris data
scaled_X = scaler.fit_transform(X)

#storing targets 
labels = y

In [5]:
#creating pca object
pca = PCA(n_components=2)
pca.fit(scaled_X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [6]:
#storing results of pca.transform()
X_with_pca = pca.transform(scaled_X)

In [7]:
# Enumerate through pca.explained_variance_ratio_ to see the amount of variance captured by each Principal Component
for ind, var in enumerate(pca.explained_variance_ratio_):
    print("Explained Variance for Principal Component {}: {}".format(ind, var))

Explained Variance for Principal Component 0: 0.729624454133
Explained Variance for Principal Component 1: 0.228507617867


In [8]:
data = {'Principle Component': ['PCA 1','PCA 2'],
        'Variance Explained': [var for ind, var in enumerate(pca.explained_variance_ratio_)]
        }
df = pd.DataFrame(data, columns = ['Principle Component', 'Variance Explained'])

df

Unnamed: 0,Principle Component,Variance Explained
0,PCA 1,0.729624
1,PCA 2,0.228508


In [9]:
reg_X_train, reg_X_test, reg_y_train, reg_y_test = train_test_split(scaled_X, labels, test_size=0.25, random_state=0)

In [10]:
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(X_with_pca, labels, test_size=0.25, random_state=0)

In [11]:
reg_clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
pca_clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
reg_clf.fit(reg_X_train, reg_y_train)
pca_clf.fit(pca_X_train, pca_y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
reg_pred = reg_clf.predict(reg_X_test)

pca_pred = pca_clf.predict(pca_X_test)

In [13]:
print("Accuracy for regular model: {}".format(accuracy_score(reg_y_test, reg_pred)))
print("Accuracy for model with PCA: {}".format(accuracy_score(pca_y_test, pca_pred)))

Accuracy for regular model: 0.973684210526
Accuracy for model with PCA: 0.868421052632


It looks the regular model was more accurate than the PCA model based upon these scores.

## Part Two

In [14]:
#reading in dataset
df = pd.read_csv('Datasets/Wholesale customers data.csv')
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [15]:
channel = df[['Channel']]

In [16]:
df.drop(['Channel', 'Region'], axis=1)

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,12669,9656,7561,214,2674,1338
1,7057,9810,9568,1762,3293,1776
2,6353,8808,7684,2405,3516,7844
3,13265,1196,4221,6404,507,1788
4,22615,5410,7198,3915,1777,5185
5,9413,8259,5126,666,1795,1451
6,12126,3199,6975,480,3140,545
7,7579,4956,9426,1669,3321,2566
8,5963,3648,6192,425,1716,750
9,6006,11093,18881,1159,7425,2098


In [18]:
scaled_X_2 = scaler.fit_transform(df)

#storing targets 
labels = channel

In [19]:
#creating pca object
pca = PCA(n_components=2)
pca.fit(scaled_X_2)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [20]:
#storing results of pca.transform()
X_with_pca_2 = pca.transform(scaled_X_2)

In [21]:
# Enumerate through pca.explained_variance_ratio_ to see the amount of variance captured by each Principal Component
for ind, var in enumerate(pca.explained_variance_ratio_):
    print("Explained Variance for Principal Component {}: {}".format(ind, var))

Explained Variance for Principal Component 0: 0.387501229116
Explained Variance for Principal Component 1: 0.22374587951
