In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import random_projection
from time import time
%matplotlib inline

# Problem 1
Apply PCA to the cancer dataset to reduce the dimension of the feature space to each of 15, 10, and 5.    Are there any features or combinations of features for which PCA is not a suitable method to use?  Explain.  WARNING: remember to center your data (subtract the mean) and also normalize it. 

In [2]:
cancer = load_breast_cancer()
x = cancer.data
y = cancer.target
x_scaled = StandardScaler().fit_transform(x)
print(pd.DataFrame(data=x_scaled).describe())

                 0             1             2             3             4   \
count  5.690000e+02  5.690000e+02  5.690000e+02  5.690000e+02  5.690000e+02   
mean  -3.162867e-15 -6.530609e-15 -7.078891e-16 -8.799835e-16  6.132177e-15   
std    1.000880e+00  1.000880e+00  1.000880e+00  1.000880e+00  1.000880e+00   
min   -2.029648e+00 -2.229249e+00 -1.984504e+00 -1.454443e+00 -3.112085e+00   
25%   -6.893853e-01 -7.259631e-01 -6.919555e-01 -6.671955e-01 -7.109628e-01   
50%   -2.150816e-01 -1.046362e-01 -2.359800e-01 -2.951869e-01 -3.489108e-02   
75%    4.693926e-01  5.841756e-01  4.996769e-01  3.635073e-01  6.361990e-01   
max    3.971288e+00  4.651889e+00  3.976130e+00  5.250529e+00  4.770911e+00   

                 5             6             7             8             9   \
count  5.690000e+02  5.690000e+02  5.690000e+02  5.690000e+02  5.690000e+02   
mean  -1.120369e-15 -4.421380e-16  9.732500e-16 -1.971670e-15 -1.453631e-15   
std    1.000880e+00  1.000880e+00  1.000880e+00  1.

In [3]:
n_dimensions = [15,10,5]
for n in n_dimensions :
    pca = PCA(n_components=n)
    x_new = pca.fit_transform(x_scaled)
    lgr = LogisticRegression()
    score = cross_val_score(lgr,x_new,y)
    print('n = ' + str(n) + ' : ' + str(score))

n = 15 : [ 0.97368421  0.97368421  0.96825397]
n = 10 : [ 0.98947368  0.96842105  0.97354497]
n = 5 : [ 0.96842105  0.97368421  0.96825397]


# Problem 2
Apply three of your favorite classification methods to the full cancer data set and also to the PCA-reduced data.  Analyze and evaluate the performance (time and accuracy) for each combination.  

In [4]:
n = 10
pca = PCA(n_components=n)
x_new = pca.fit_transform(x_scaled)
pca_times = []
pca_scores = []
reg_times = []
reg_scores = []

gnb = GaussianNB()
start = time()
score = cross_val_score(gnb,x_new,y)
end = time()
pca_times.append(end-start)
pca_scores.append(np.average(score))
start = time()
score = cross_val_score(gnb,x_scaled,y)
end = time()
reg_times.append(end-start)
reg_scores.append(np.average(score))

lgr = LogisticRegression()
start = time()
score = cross_val_score(lgr,x_new,y)
end = time()
pca_times.append(end-start)
pca_scores.append(np.average(score))
start = time()
score = cross_val_score(lgr,x_scaled,y)
end = time()
reg_times.append(end-start)
reg_scores.append(np.average(score))

svc = SVC(probability=True)
start = time()
score = cross_val_score(svc,x_new,y)
end = time()
pca_times.append(end-start)
pca_scores.append(np.average(score))
start = time()
score = cross_val_score(svc,x_scaled,y)
end = time()
reg_times.append(end-start)
reg_scores.append(np.average(score))

print("           [ Naive Bayes       ,      Log Reg.        ,   SVM ]")
print("Time -")
print("     PCA : {}".format(pca_times))
print("     Reg : {}".format(reg_times))
print("Acc. -")
print("     PCA : {}".format(pca_scores))
print("     Reg : {}".format(reg_scores))

           [ Naive Bayes       ,      Log Reg.        ,   SVM ]
Time -
     PCA : [0.00645899772644043, 0.01373910903930664, 0.056427001953125]
     Reg : [0.0074460506439208984, 0.011440277099609375, 0.06028580665588379]
Acc. -
     PCA : [0.91916829109811571, 0.97714657012902617, 0.96308363501345962]
     Reg : [0.92970388935301218, 0.975392184164114, 0.9736377981992016]


# Problem 3
Find some aspect of your final project for which PCA is an appropriate dimension-reduction method.  Apply PCA and analyze the results and performance.  Compare to your results without PCA.  

In [5]:
import pandas as pd
import os
path = '../../../../Senior Project/DATA/'

train = []
test = []

# Walk through player files
for dir_path , dir_name , file_names in os.walk(path) :
    for name in file_names :
        # Grab avgs file
        if name[-4:] == 'avgs' :
            data = pd.read_csv(os.path.join(dir_path,name))
            if isinstance(train,list) :
                train = data.drop(['Unnamed: 0'],axis=1).as_matrix()
            else :
                train = np.vstack((train,data.drop(['Unnamed: 0'],axis=1).as_matrix()))

# From the way the data is saved, the last column is whether or not the player
#     is a score on how much of a contributor he was during the season.
x = train[:,:-1]
y = train[:,-1]

x_scaled = StandardScaler().fit_transform(x)

n = 10
pca = PCA(n_components=n)
x_new = pca.fit_transform(x_scaled)

lgr = LogisticRegression()
start = time()
score = cross_val_score(lgr,x_new,y)
end = time()
pca_time = end-start
pca_score = np.average(score)
start = time()
score = cross_val_score(lgr,x_scaled,y)
end = time()
reg_time = end-start
reg_score = np.average(score)

print('PCA -')
print('    Time  : {}'.format(pca_time))
print('    Score : {}'.format(pca_score))
print('Reg -')
print('    Time  : {}'.format(reg_time))
print('    Score : {}'.format(reg_score))

PCA -
    Time  : 0.036509037017822266
    Score : 0.8660235338931246
Reg -
    Time  : 0.035748958587646484
    Score : 0.8681764588219645


# Problem 4
Repeat what you did in the previous problem, but replacing PCA by a random projection. Try 5 different random projections and compare the results and performance. 

In [8]:
import pandas as pd
import os
path = '../../../../Senior Project/DATA/'

train = []
test = []

# Walk through player files
for dir_path , dir_name , file_names in os.walk(path) :
    for name in file_names :
        # Grab avgs file
        if name[-4:] == 'avgs' :
            data = pd.read_csv(os.path.join(dir_path,name))
            if isinstance(train,list) :
                train = data.drop(['Unnamed: 0'],axis=1).as_matrix()
            else :
                train = np.vstack((train,data.drop(['Unnamed: 0'],axis=1).as_matrix()))

# From the way the data is saved, the last column is whether or not the player
#     is a score on how much of a contributor he was during the season.
x = train[:,:-1]
y = train[:,-1]

x_scaled = StandardScaler().fit_transform(x)

rand_proj_times = []
rand_proj_scores = []
reg_times = []
reg_scores = []

for i in range(5) :
    n = 10
    rand_proj = random_projection.SparseRandomProjection()
    x_new = rand_proj.fit_transform(x_scaled)

    lgr = LogisticRegression()
    start = time()
    score = cross_val_score(lgr,x_new,y)
    end = time()
    rand_proj_times.append(end-start)
    rand_proj_scores.append(np.average(score))
    start = time()
    score = cross_val_score(lgr,x_scaled,y)
    end = time()
    reg_times.append(end-start)
    reg_scores.append(np.average(score))

print('Rand. Proj -')
print('    Time  : {}'.format(rand_proj_times))
print('    Score : {}'.format(rand_proj_scores))
print('Reg        -')
print('    Time  : {}'.format(reg_times))
print('    Score : {}'.format(reg_scores))

ValueError: eps=0.100000 and n_samples=3717 lead to a target dimension of 7046 which is larger than the original space with n_features=19