In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn import datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

# Let's compare our models thus far

###### First set up all the data

In [20]:
# Location of the Adult data set files, relative to this notebook.
data_folder = "../Data"
train_file = "/adult.data.txt"
test_file = "/adult.test.txt"

# Column names, in file order. They are passed to read_csv explicitly, so
# pandas does not take them from the files.
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "class",
]

train_path = data_folder + train_file
test_path = data_folder + test_file

# The training file is parsed from its first line.
train_df = pd.read_csv(train_path, header=None, names=cols)
# The first line of the test file is skipped before parsing.
test_df = pd.read_csv(test_path, skiprows=1, names=cols)

In [21]:

# One Hot Encoding
# Every categorical column is replaced by one indicator column per distinct
# value found in that column.
categorical_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# get_dummies only creates indicators for values present in the frame it is
# given, so encoding the two frames separately can leave them with different
# columns. Re-index the test frame onto the training columns: indicators the
# test frame lacks are added as all-zero columns, indicators the training
# frame never produced are dropped, and the column order becomes identical.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

In [22]:

# convert class to 0 or 1
# The label column of each frame is cast to a pandas categorical and then
# replaced by its integer category codes.
# NOTE(review): the codes are assigned independently per frame, from that
# frame's own set of labels. The same code only means the same class in both
# frames if their labels order the same way -- confirm against the raw files.
train_df["class"] = train_df["class"].astype('category').cat.codes
test_df["class"] = test_df["class"].astype('category').cat.codes

In [38]:
# Split each frame into a feature matrix (every column except "class") and a
# label vector, both as NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same array and is available on old and new pandas alike.
X_train = train_df.drop("class", axis=1).values
y_train = train_df["class"].values
X_test = test_df.drop("class", axis=1).values
y_test = test_df["class"].values

In [24]:
# Baseline: a ridge classifier with default settings, scored by 10-fold
# cross-validation on the full training matrix.
n_folds = 10
clf = RidgeClassifier()
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=n_folds)
scores

array([0.83236107, 0.83937346, 0.84490172, 0.82985258, 0.84336609,
       0.83753071, 0.83753071, 0.84213759, 0.84121622, 0.84398034])

## Here's the ridge classifier average accuracy for 10 times 10-fold cross validation

In [46]:
# 10 x n_folds cross-validation: each repetition reshuffles the rows with a
# different fixed seed, and the per-repetition mean fold accuracy is recorded.
ridge_run_means = []
for i in range(10):
    X, y = shuffle(X_train, y_train, random_state=i)
    run_scores = cross_val_score(clf, X, y, cv=n_folds)
    ridge_run_means.append(run_scores.mean())

# Sum in run order, then average over the 10 repetitions.
res = sum(ridge_run_means, 0.0)
print(res/10.0)

0.8395381823613324


## Here's the Gaussian Process average accuracy for 10 times 10 fold cross validation
### Using 1000 random samples for training

In [52]:
# Draw a random subset of training rows for the Gaussian process models.
# Features and labels are stacked side by side (label in the last column) so
# that a single row selection keeps them paired.
combineX_trainY_train = np.column_stack([X_train, y_train])
totalSamples = len(combineX_trainY_train)
subsetSize = 1000

# A dedicated, seeded generator makes the draw repeatable across kernel
# restarts; no seed for the global np.random state is set in the visible cells.
train_subset_rng = np.random.RandomState(0)
# Sample row indices without replacement, so no row appears twice.
train_subset_rows = train_subset_rng.choice(totalSamples, subsetSize, replace=False)
subset = combineX_trainY_train[train_subset_rows, :]

### And a test subset of 1000 samples

In [53]:
# Draw a random subset of test rows, built the same way as the training
# subset: features and labels stacked side by side (label in the last column)
# so that a single row selection keeps them paired.
combineX_testY_test = np.column_stack([X_test, y_test])
totalSamples_test = len(combineX_testY_test)
subsetSize_test = 1000

# A dedicated, seeded generator makes the draw repeatable across kernel
# restarts; no seed for the global np.random state is set in the visible cells.
test_subset_rng = np.random.RandomState(1)
# Sample row indices without replacement, so no row appears twice.
test_subset_rows = test_subset_rng.choice(totalSamples_test, subsetSize_test, replace=False)
subset_test = combineX_testY_test[test_subset_rows, :]

In [58]:
# Separate the sampled rows back into features and labels: the label is the
# last column of each stacked array, everything before it is a feature.
y_train_subset = subset[:, -1]
X_train_subset = np.delete(subset, -1, axis=1)

y_test_subset = subset_test[:, -1]
X_test_subset = np.delete(subset_test, -1, axis=1)

# Two Gaussian process classifiers that differ only in their kernel.
# NOTE(review): the features reach the kernels unscaled; both kernels are
# distance-based, so columns with large numeric ranges may dominate --
# consider standardizing first and confirm the effect on accuracy.
GP_RBF = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
# nu is written as the float literal 1.5: the expression 3/2 is integer
# division under Python 2 and would silently produce nu=1 instead.
GP_Matern = GaussianProcessClassifier(kernel=Matern(length_scale=2, nu=1.5))

# 10 x n_folds cross-validation of the RBF model on the training subset:
# each repetition reshuffles the rows with a different fixed seed, and the
# mean fold accuracy of every repetition is accumulated.
resRBF = 0.0
for i in range(10):
    X, y = shuffle(X_train_subset, y_train_subset, random_state=i)
    resRBF += cross_val_score(GP_RBF, X, y, cv=n_folds).mean()
print(resRBF/10.0)

0.7383874287428742


In [None]:
# 10 x n_folds cross-validation of the Matern model on the training subset,
# reshuffling the rows with a different fixed seed for every repetition.
matern_run_means = []
for i in range(10):
    X, y = shuffle(X_train_subset, y_train_subset, random_state=i)
    run_scores = cross_val_score(GP_Matern, X, y, cv=n_folds)
    matern_run_means.append(run_scores.mean())

# Sum in run order, then average over the 10 repetitions.
resMatern = sum(matern_run_means, 0.0)
print(resMatern/10.0)

## We can't use PCA on the Adult data set because it has categorical features, as discussed in milestone 3. Therefore, let's test PCA on the Iris data set instead, again with 10 times 10-fold cross validation, and see how it performs.

In [71]:
RANDOM_STATE = 49

# Load the Iris data: feature matrix and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

clf_pca = RidgeClassifier()

# perform PCA with 2 principal components (there are 4 features)
# NOTE(review): the projection is fit on every row before cross-validation,
# so each fold's held-out rows have already influenced the components.
pca = PCA(n_components=2).fit(X)

# X_std holds the 2-component projection of X (no standardization is applied).
X_std = pca.transform(X)

# Disabled hold-out split, kept for reference:
#X_std_train, X_std_test, y_std_train, y_std_test = train_test_split(X_std, y, random_state=RANDOM_STATE)

# Here's the 10 times 10-fold c.v score with PCA
# Each repetition reshuffles the projected rows with a different fixed seed.
pca_run_means = []
for i in range(10):
    X_shuffle, y_shuffle = shuffle(X_std, y, random_state=i)
    run_scores = cross_val_score(clf_pca, X_shuffle, y_shuffle, cv=n_folds)
    pca_run_means.append(run_scores.mean())

# Sum in run order, then average over the 10 repetitions.
pcaAcc = sum(pca_run_means, 0.0)
print(pcaAcc/10.0)

0.8006666666666666


# Here's the 10 times 10-fold c.v score without PCA

In [74]:
# Same 10 x n_folds protocol as the PCA run, but on the original Iris
# features, so the two averages can be compared directly.
X = iris.data

raw_run_means = []
for i in range(10):
    X_shuffle, y_shuffle = shuffle(X, y, random_state=i)
    run_scores = cross_val_score(clf_pca, X_shuffle, y_shuffle, cv=n_folds)
    raw_run_means.append(run_scores.mean())

# Sum in run order, then average over the 10 repetitions.
acc = sum(raw_run_means, 0.0)
print(acc/10.0)


0.8346666666666666
